Blame - mm/ksm.c - kernel/msm-4.9

blob: 2c02094807e0bd11abd98639bdd6691f492425ab [file] [log] [blame]

Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	2	* Memory merging support.
				3	*
				4	* This code enables dynamic sharing of identical pages found in different
				5	* memory areas, even if they are not shared by fork()
				6	*
				7	* Copyright (C) 2008 Red Hat, Inc.
				8	* Authors:
				9	* Izik Eidus
				10	* Andrea Arcangeli
				11	* Chris Wright
				12	*
				13	* This work is licensed under the terms of the GNU GPL, version 2.
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	14	*/
				15
				16	#include <linux/errno.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	17	#include <linux/mm.h>
				18	#include <linux/fs.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	19	#include <linux/mman.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	20	#include <linux/sched.h>
				21	#include <linux/rwsem.h>
				22	#include <linux/pagemap.h>
				23	#include <linux/rmap.h>
				24	#include <linux/spinlock.h>
				25	#include <linux/jhash.h>
				26	#include <linux/delay.h>
				27	#include <linux/kthread.h>
				28	#include <linux/wait.h>
				29	#include <linux/slab.h>
				30	#include <linux/rbtree.h>
				31	#include <linux/mmu_notifier.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	32	#include <linux/ksm.h>
				33
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	34	#include <asm/tlbflush.h>
				35
				36	/*
				37	* A few notes about the KSM scanning process,
				38	* to make it easier to understand the data structures below:
				39	*
				40	* In order to reduce excessive scanning, KSM sorts the memory pages by their
				41	* contents into a data structure that holds pointers to the pages' locations.
				42	*
				43	* Since the contents of the pages may change at any moment, KSM cannot just
				44	* insert the pages into a normal sorted tree and expect it to find anything.
				45	* Therefore KSM uses two data structures - the stable and the unstable tree.
				46	*
				47	* The stable tree holds pointers to all the merged pages (ksm pages), sorted
				48	* by their contents. Because each such page is write-protected, searching on
				49	* this tree is fully assured to be working (except when pages are unmapped),
				50	* and therefore this tree is called the stable tree.
				51	*
				52	* In addition to the stable tree, KSM uses a second data structure called the
				53	* unstable tree: this tree holds pointers to pages which have been found to
				54	* be "unchanged for a period of time". The unstable tree sorts these pages
				55	* by their contents, but since they are not write-protected, KSM cannot rely
				56	* upon the unstable tree to work correctly - the unstable tree is liable to
				57	* be corrupted as its contents are modified, and so it is called unstable.
				58	*
				59	* KSM solves this problem by several techniques:
				60	*
				61	* 1) The unstable tree is flushed every time KSM completes scanning all
				62	* memory areas, and then the tree is rebuilt again from the beginning.
				63	* 2) KSM will only insert into the unstable tree, pages whose hash value
				64	* has not changed since the previous scan of all memory areas.
				65	* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
				66	* colors of the nodes and not on their contents, assuring that even when
				67	* the tree gets "corrupted" it won't get out of balance, so scanning time
				68	* remains the same (also, searching and inserting nodes in an rbtree uses
				69	* the same algorithm, so we have no overhead when we flush and rebuild).
				70	* 4) KSM never flushes the stable tree, which means that even if it were to
				71	* take 10 attempts to find a page in the unstable tree, once it is found,
				72	* it is secured in the stable tree. (When we scan a new page, we first
				73	* compare it against the stable tree, and then against the unstable tree.)
				74	*/
				75
				76	/**
				77	* struct mm_slot - ksm information per mm that is being scanned
				78	* @link: link to the mm_slots hash list
				79	* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
				80	* @rmap_list: head for this mm_slot's list of rmap_items
				81	* @mm: the mm that this information is valid for
				82	*/
				83	struct mm_slot {
				84	struct hlist_node link;
				85	struct list_head mm_list;
				86	struct list_head rmap_list;
				87	struct mm_struct *mm;
				88	};
				89
				90	/**
				91	* struct ksm_scan - cursor for scanning
				92	* @mm_slot: the current mm_slot we are scanning
				93	* @address: the next address inside that to be scanned
				94	* @rmap_item: the current rmap that we are scanning inside the rmap_list
				95	* @seqnr: count of completed full scans (needed when removing unstable node)
				96	*
				97	* There is only the one ksm_scan instance of this cursor structure.
				98	*/
				99	struct ksm_scan {
				100	struct mm_slot *mm_slot;
				101	unsigned long address;
				102	struct rmap_item *rmap_item;
				103	unsigned long seqnr;
				104	};
				105
				106	/**
				107	* struct rmap_item - reverse mapping item for virtual addresses
				108	* @link: link into mm_slot's rmap_list (rmap_list is per mm)
				109	* @mm: the memory structure this rmap_item is pointing into
				110	* @address: the virtual address this rmap_item tracks (+ flags in low bits)
				111	* @oldchecksum: previous checksum of the page at that virtual address
				112	* @node: rb_node of this rmap_item in either unstable or stable tree
				113	* @next: next rmap_item hanging off the same node of the stable tree
				114	* @prev: previous rmap_item hanging off the same node of the stable tree
				115	*/
				116	struct rmap_item {
				117	struct list_head link;
				118	struct mm_struct *mm;
				119	unsigned long address; /* + low bits used for flags below */
				120	union {
				121	unsigned int oldchecksum; /* when unstable */
				122	struct rmap_item *next; /* when stable */
				123	};
				124	union {
				125	struct rb_node node; /* when tree node */
				126	struct rmap_item *prev; /* in stable list */
				127	};
				128	};
				129
				130	#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
				131	#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
				132	#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
				133
				134	/* The stable and unstable tree heads */
				135	static struct rb_root root_stable_tree = RB_ROOT;
				136	static struct rb_root root_unstable_tree = RB_ROOT;
				137
				138	#define MM_SLOTS_HASH_HEADS 1024
				139	static struct hlist_head *mm_slots_hash;
				140
				141	static struct mm_slot ksm_mm_head = {
				142	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
				143	};
				144	static struct ksm_scan ksm_scan = {
				145	.mm_slot = &ksm_mm_head,
				146	};
				147
				148	static struct kmem_cache *rmap_item_cache;
				149	static struct kmem_cache *mm_slot_cache;
				150
				151	/* The number of nodes in the stable tree */
				152	static unsigned long ksm_kernel_pages_allocated;
				153
				154	/* The number of page slots sharing those nodes */
				155	static unsigned long ksm_pages_shared;
				156
				157	/* Limit on the number of unswappable pages used */
				158	static unsigned long ksm_max_kernel_pages;
				159
				160	/* Number of pages ksmd should scan in one batch */
				161	static unsigned int ksm_thread_pages_to_scan;
				162
				163	/* Milliseconds ksmd should sleep between batches */
				164	static unsigned int ksm_thread_sleep_millisecs;
				165
				166	#define KSM_RUN_STOP 0
				167	#define KSM_RUN_MERGE 1
				168	#define KSM_RUN_UNMERGE 2
				169	static unsigned int ksm_run;
				170
				171	static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
				172	static DEFINE_MUTEX(ksm_thread_mutex);
				173	static DEFINE_SPINLOCK(ksm_mmlist_lock);
				174
				175	#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
				176	sizeof(struct __struct), __alignof__(struct __struct),\
				177	(__flags), NULL)
				178
				179	static int __init ksm_slab_init(void)
				180	{
				181	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
				182	if (!rmap_item_cache)
				183	goto out;
				184
				185	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
				186	if (!mm_slot_cache)
				187	goto out_free;
				188
				189	return 0;
				190
				191	out_free:
				192	kmem_cache_destroy(rmap_item_cache);
				193	out:
				194	return -ENOMEM;
				195	}
				196
				197	static void __init ksm_slab_free(void)
				198	{
				199	kmem_cache_destroy(mm_slot_cache);
				200	kmem_cache_destroy(rmap_item_cache);
				201	mm_slot_cache = NULL;
				202	}
				203
				204	static inline struct rmap_item *alloc_rmap_item(void)
				205	{
				206	return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
				207	}
				208
				209	static inline void free_rmap_item(struct rmap_item *rmap_item)
				210	{
				211	rmap_item->mm = NULL; /* debug safety */
				212	kmem_cache_free(rmap_item_cache, rmap_item);
				213	}
				214
				215	static inline struct mm_slot *alloc_mm_slot(void)
				216	{
				217	if (!mm_slot_cache) /* initialization failed */
				218	return NULL;
				219	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
				220	}
				221
				222	static inline void free_mm_slot(struct mm_slot *mm_slot)
				223	{
				224	kmem_cache_free(mm_slot_cache, mm_slot);
				225	}
				226
				227	static int __init mm_slots_hash_init(void)
				228	{
				229	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				230	GFP_KERNEL);
				231	if (!mm_slots_hash)
				232	return -ENOMEM;
				233	return 0;
				234	}
				235
				236	static void __init mm_slots_hash_free(void)
				237	{
				238	kfree(mm_slots_hash);
				239	}
				240
				241	static struct mm_slot *get_mm_slot(struct mm_struct *mm)
				242	{
				243	struct mm_slot *mm_slot;
				244	struct hlist_head *bucket;
				245	struct hlist_node *node;
				246
				247	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				248	% MM_SLOTS_HASH_HEADS];
				249	hlist_for_each_entry(mm_slot, node, bucket, link) {
				250	if (mm == mm_slot->mm)
				251	return mm_slot;
				252	}
				253	return NULL;
				254	}
				255
				256	static void insert_to_mm_slots_hash(struct mm_struct *mm,
				257	struct mm_slot *mm_slot)
				258	{
				259	struct hlist_head *bucket;
				260
				261	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				262	% MM_SLOTS_HASH_HEADS];
				263	mm_slot->mm = mm;
				264	INIT_LIST_HEAD(&mm_slot->rmap_list);
				265	hlist_add_head(&mm_slot->link, bucket);
				266	}
				267
				268	static inline int in_stable_tree(struct rmap_item *rmap_item)
				269	{
				270	return rmap_item->address & STABLE_FLAG;
				271	}
				272
				273	/*
				274	* We use break_ksm to break COW on a ksm page: it's a stripped down
				275	*
				276	* if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
				277	* put_page(page);
				278	*
				279	* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
				280	* in case the application has unmapped and remapped mm,addr meanwhile.
				281	* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
				282	* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
				283	*/
				284	static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
				285	{
				286	struct page *page;
				287	int ret;
				288
				289	do {
				290	cond_resched();
				291	page = follow_page(vma, addr, FOLL_GET);
				292	if (!page)
				293	break;
				294	if (PageKsm(page))
				295	ret = handle_mm_fault(vma->vm_mm, vma, addr,
				296	FAULT_FLAG_WRITE);
				297	else
				298	ret = VM_FAULT_WRITE;
				299	put_page(page);
				300	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS)));
				301
				302	/* Which leaves us looping there if VM_FAULT_OOM: hmmm... */
				303	}
				304
				305	static void __break_cow(struct mm_struct *mm, unsigned long addr)
				306	{
				307	struct vm_area_struct *vma;
				308
				309	vma = find_vma(mm, addr);
				310	if (!vma || vma->vm_start > addr)
				311	return;
				312	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				313	return;
				314	break_ksm(vma, addr);
				315	}
				316
				317	static void break_cow(struct mm_struct *mm, unsigned long addr)
				318	{
				319	down_read(&mm->mmap_sem);
				320	__break_cow(mm, addr);
				321	up_read(&mm->mmap_sem);
				322	}
				323
				324	static struct page *get_mergeable_page(struct rmap_item *rmap_item)
				325	{
				326	struct mm_struct *mm = rmap_item->mm;
				327	unsigned long addr = rmap_item->address;
				328	struct vm_area_struct *vma;
				329	struct page *page;
				330
				331	down_read(&mm->mmap_sem);
				332	vma = find_vma(mm, addr);
				333	if (!vma || vma->vm_start > addr)
				334	goto out;
				335	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				336	goto out;
				337
				338	page = follow_page(vma, addr, FOLL_GET);
				339	if (!page)
				340	goto out;
				341	if (PageAnon(page)) {
				342	flush_anon_page(vma, page, addr);
				343	flush_dcache_page(page);
				344	} else {
				345	put_page(page);
				346	out: page = NULL;
				347	}
				348	up_read(&mm->mmap_sem);
				349	return page;
				350	}
				351
				352	/*
				353	* get_ksm_page: checks if the page at the virtual address in rmap_item
				354	* is still PageKsm, in which case we can trust the content of the page,
				355	* and it returns the gotten page; but NULL if the page has been zapped.
				356	*/
				357	static struct page *get_ksm_page(struct rmap_item *rmap_item)
				358	{
				359	struct page *page;
				360
				361	page = get_mergeable_page(rmap_item);
				362	if (page && !PageKsm(page)) {
				363	put_page(page);
				364	page = NULL;
				365	}
				366	return page;
				367	}
				368
				369	/*
				370	* Removing rmap_item from stable or unstable tree.
				371	* This function will clean the information from the stable/unstable tree.
				372	*/
				373	static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
				374	{
				375	if (in_stable_tree(rmap_item)) {
				376	struct rmap_item *next_item = rmap_item->next;
				377
				378	if (rmap_item->address & NODE_FLAG) {
				379	if (next_item) {
				380	rb_replace_node(&rmap_item->node,
				381	&next_item->node,
				382	&root_stable_tree);
				383	next_item->address |= NODE_FLAG;
				384	} else {
				385	rb_erase(&rmap_item->node, &root_stable_tree);
				386	ksm_kernel_pages_allocated--;
				387	}
				388	} else {
				389	struct rmap_item *prev_item = rmap_item->prev;
				390
				391	BUG_ON(prev_item->next != rmap_item);
				392	prev_item->next = next_item;
				393	if (next_item) {
				394	BUG_ON(next_item->prev != rmap_item);
				395	next_item->prev = rmap_item->prev;
				396	}
				397	}
				398
				399	rmap_item->next = NULL;
				400	ksm_pages_shared--;
				401
				402	} else if (rmap_item->address & NODE_FLAG) {
				403	unsigned char age;
				404	/*
				405	* ksm_thread can and must skip the rb_erase, because
				406	* root_unstable_tree was already reset to RB_ROOT.
				407	* But __ksm_exit has to be careful: do the rb_erase
				408	* if it's interrupting a scan, and this rmap_item was
				409	* inserted by this scan rather than left from before.
				410	*
				411	* Because of the case in which remove_mm_from_lists
				412	* increments seqnr before removing rmaps, unstable_nr
				413	* may even be 2 behind seqnr, but should never be
				414	* further behind. Yes, I did have trouble with this!
				415	*/
				416	age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
				417	BUG_ON(age > 2);
				418	if (!age)
				419	rb_erase(&rmap_item->node, &root_unstable_tree);
				420	}
				421
				422	rmap_item->address &= PAGE_MASK;
				423
				424	cond_resched(); /* we're called from many long loops */
				425	}
				426
				427	static void remove_all_slot_rmap_items(struct mm_slot *mm_slot)
				428	{
				429	struct rmap_item *rmap_item, *node;
				430
				431	list_for_each_entry_safe(rmap_item, node, &mm_slot->rmap_list, link) {
				432	remove_rmap_item_from_tree(rmap_item);
				433	list_del(&rmap_item->link);
				434	free_rmap_item(rmap_item);
				435	}
				436	}
				437
				438	static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				439	struct list_head *cur)
				440	{
				441	struct rmap_item *rmap_item;
				442
				443	while (cur != &mm_slot->rmap_list) {
				444	rmap_item = list_entry(cur, struct rmap_item, link);
				445	cur = cur->next;
				446	remove_rmap_item_from_tree(rmap_item);
				447	list_del(&rmap_item->link);
				448	free_rmap_item(rmap_item);
				449	}
				450	}
				451
				452	/*
				453	* Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
				454	* than check every pte of a given vma, the locking doesn't quite work for
				455	* that - an rmap_item is assigned to the stable tree after inserting ksm
				456	* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
				457	* rmap_items from parent to child at fork time (so as not to waste time
				458	* if exit comes before the next scan reaches it).
				459	*/
				460	static void unmerge_ksm_pages(struct vm_area_struct *vma,
				461	unsigned long start, unsigned long end)
				462	{
				463	unsigned long addr;
				464
				465	for (addr = start; addr < end; addr += PAGE_SIZE)
				466	break_ksm(vma, addr);
				467	}
				468
				469	static void unmerge_and_remove_all_rmap_items(void)
				470	{
				471	struct mm_slot *mm_slot;
				472	struct mm_struct *mm;
				473	struct vm_area_struct *vma;
				474
				475	list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) {
				476	mm = mm_slot->mm;
				477	down_read(&mm->mmap_sem);
				478	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				479	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				480	continue;
				481	unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
				482	}
				483	remove_all_slot_rmap_items(mm_slot);
				484	up_read(&mm->mmap_sem);
				485	}
				486
				487	spin_lock(&ksm_mmlist_lock);
				488	if (ksm_scan.mm_slot != &ksm_mm_head) {
				489	ksm_scan.mm_slot = &ksm_mm_head;
				490	ksm_scan.seqnr++;
				491	}
				492	spin_unlock(&ksm_mmlist_lock);
				493	}
				494
				495	static void remove_mm_from_lists(struct mm_struct *mm)
				496	{
				497	struct mm_slot *mm_slot;
				498
				499	spin_lock(&ksm_mmlist_lock);
				500	mm_slot = get_mm_slot(mm);
				501
				502	/*
				503	* This mm_slot is always at the scanning cursor when we're
				504	* called from scan_get_next_rmap_item; but it's a special
				505	* case when we're called from __ksm_exit.
				506	*/
				507	if (ksm_scan.mm_slot == mm_slot) {
				508	ksm_scan.mm_slot = list_entry(
				509	mm_slot->mm_list.next, struct mm_slot, mm_list);
				510	ksm_scan.address = 0;
				511	ksm_scan.rmap_item = list_entry(
				512	&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
				513	if (ksm_scan.mm_slot == &ksm_mm_head)
				514	ksm_scan.seqnr++;
				515	}
				516
				517	hlist_del(&mm_slot->link);
				518	list_del(&mm_slot->mm_list);
				519	spin_unlock(&ksm_mmlist_lock);
				520
				521	remove_all_slot_rmap_items(mm_slot);
				522	free_mm_slot(mm_slot);
				523	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				524	}
				525
				526	static u32 calc_checksum(struct page *page)
				527	{
				528	u32 checksum;
				529	void *addr = kmap_atomic(page, KM_USER0);
				530	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
				531	kunmap_atomic(addr, KM_USER0);
				532	return checksum;
				533	}
				534
				535	static int memcmp_pages(struct page *page1, struct page *page2)
				536	{
				537	char *addr1, *addr2;
				538	int ret;
				539
				540	addr1 = kmap_atomic(page1, KM_USER0);
				541	addr2 = kmap_atomic(page2, KM_USER1);
				542	ret = memcmp(addr1, addr2, PAGE_SIZE);
				543	kunmap_atomic(addr2, KM_USER1);
				544	kunmap_atomic(addr1, KM_USER0);
				545	return ret;
				546	}
				547
				548	static inline int pages_identical(struct page *page1, struct page *page2)
				549	{
				550	return !memcmp_pages(page1, page2);
				551	}
				552
				553	static int write_protect_page(struct vm_area_struct *vma, struct page *page,
				554	pte_t *orig_pte)
				555	{
				556	struct mm_struct *mm = vma->vm_mm;
				557	unsigned long addr;
				558	pte_t *ptep;
				559	spinlock_t *ptl;
				560	int swapped;
				561	int err = -EFAULT;
				562
				563	addr = page_address_in_vma(page, vma);
				564	if (addr == -EFAULT)
				565	goto out;
				566
				567	ptep = page_check_address(page, mm, addr, &ptl, 0);
				568	if (!ptep)
				569	goto out;
				570
				571	if (pte_write(*ptep)) {
				572	pte_t entry;
				573
				574	swapped = PageSwapCache(page);
				575	flush_cache_page(vma, addr, page_to_pfn(page));
				576	/*
				577	* Ok this is tricky: get_user_pages_fast() runs without taking
				578	* any lock, so the check we are about to make of the page count
				579	* against the mapcount is racy, and O_DIRECT could start right
				580	* after the check.
				581	* So we clear the pte and flush the TLB before the check:
				582	* this assures us that no O_DIRECT can start after the check
				583	* or in the middle of the check.
				584	*/
				585	entry = ptep_clear_flush(vma, addr, ptep);
				586	/*
				587	* Check that no O_DIRECT or similar I/O is in progress on the
				588	* page
				589	*/
				590	if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
				591	set_pte_at_notify(mm, addr, ptep, entry);
				592	goto out_unlock;
				593	}
				594	entry = pte_wrprotect(entry);
				595	set_pte_at_notify(mm, addr, ptep, entry);
				596	}
				597	*orig_pte = *ptep;
				598	err = 0;
				599
				600	out_unlock:
				601	pte_unmap_unlock(ptep, ptl);
				602	out:
				603	return err;
				604	}
				605
				606	/**
				607	* replace_page - replace page in vma by new ksm page
				608	* @vma: vma that holds the pte pointing to oldpage
				609	* @oldpage: the page we are replacing by newpage
				610	* @newpage: the ksm page we replace oldpage by
				611	* @orig_pte: the original value of the pte
				612	*
				613	* Returns 0 on success, -EFAULT on failure.
				614	*/
				615	static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
				616	struct page *newpage, pte_t orig_pte)
				617	{
				618	struct mm_struct *mm = vma->vm_mm;
				619	pgd_t *pgd;
				620	pud_t *pud;
				621	pmd_t *pmd;
				622	pte_t *ptep;
				623	spinlock_t *ptl;
				624	unsigned long addr;
				625	pgprot_t prot;
				626	int err = -EFAULT;
				627
				628	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
				629
				630	addr = page_address_in_vma(oldpage, vma);
				631	if (addr == -EFAULT)
				632	goto out;
				633
				634	pgd = pgd_offset(mm, addr);
				635	if (!pgd_present(*pgd))
				636	goto out;
				637
				638	pud = pud_offset(pgd, addr);
				639	if (!pud_present(*pud))
				640	goto out;
				641
				642	pmd = pmd_offset(pud, addr);
				643	if (!pmd_present(*pmd))
				644	goto out;
				645
				646	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
				647	if (!pte_same(*ptep, orig_pte)) {
				648	pte_unmap_unlock(ptep, ptl);
				649	goto out;
				650	}
				651
				652	get_page(newpage);
				653	page_add_ksm_rmap(newpage);
				654
				655	flush_cache_page(vma, addr, pte_pfn(*ptep));
				656	ptep_clear_flush(vma, addr, ptep);
				657	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
				658
				659	page_remove_rmap(oldpage);
				660	put_page(oldpage);
				661
				662	pte_unmap_unlock(ptep, ptl);
				663	err = 0;
				664	out:
				665	return err;
				666	}
				667
				668	/*
				669	* try_to_merge_one_page - take two pages and merge them into one
				670	* @vma: the vma that hold the pte pointing into oldpage
				671	* @oldpage: the page that we want to replace with newpage
				672	* @newpage: the page that we want to map instead of oldpage
				673	*
				674	* Note:
				675	* oldpage should be a PageAnon page, while newpage should be a PageKsm page,
				676	* or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
				677	*
				678	* This function returns 0 if the pages were merged, -EFAULT otherwise.
				679	*/
				680	static int try_to_merge_one_page(struct vm_area_struct *vma,
				681	struct page *oldpage,
				682	struct page *newpage)
				683	{
				684	pte_t orig_pte = __pte(0);
				685	int err = -EFAULT;
				686
				687	if (!(vma->vm_flags & VM_MERGEABLE))
				688	goto out;
				689
				690	if (!PageAnon(oldpage))
				691	goto out;
				692
				693	get_page(newpage);
				694	get_page(oldpage);
				695
				696	/*
				697	* We need the page lock to read a stable PageSwapCache in
				698	* write_protect_page(). We use trylock_page() instead of
				699	* lock_page() because we don't want to wait here - we
				700	* prefer to continue scanning and merging different pages,
				701	* then come back to this page when it is unlocked.
				702	*/
				703	if (!trylock_page(oldpage))
				704	goto out_putpage;
				705	/*
				706	* If this anonymous page is mapped only here, its pte may need
				707	* to be write-protected. If it's mapped elsewhere, all of its
				708	* ptes are necessarily already write-protected. But in either
				709	* case, we need to lock and check page_count is not raised.
				710	*/
				711	if (write_protect_page(vma, oldpage, &orig_pte)) {
				712	unlock_page(oldpage);
				713	goto out_putpage;
				714	}
				715	unlock_page(oldpage);
				716
				717	if (pages_identical(oldpage, newpage))
				718	err = replace_page(vma, oldpage, newpage, orig_pte);
				719
				720	out_putpage:
				721	put_page(oldpage);
				722	put_page(newpage);
				723	out:
				724	return err;
				725	}
				726
				727	/*
				728	* try_to_merge_two_pages - take two identical pages and prepare them
				729	* to be merged into one page.
				730	*
				731	* This function returns 0 if we successfully mapped two identical pages
				732	* into one page, -EFAULT otherwise.
				733	*
				734	* Note that this function allocates a new kernel page: if one of the pages
				735	* is already a ksm page, try_to_merge_with_ksm_page should be used.
				736	*/
				737	static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				738	struct page *page1, struct mm_struct *mm2,
				739	unsigned long addr2, struct page *page2)
				740	{
				741	struct vm_area_struct *vma;
				742	struct page *kpage;
				743	int err = -EFAULT;
				744
				745	/*
				746	* The number of nodes in the stable tree
				747	* is the number of kernel pages that we hold.
				748	*/
				749	if (ksm_max_kernel_pages &&
				750	ksm_max_kernel_pages <= ksm_kernel_pages_allocated)
				751	return err;
				752
				753	kpage = alloc_page(GFP_HIGHUSER);
				754	if (!kpage)
				755	return err;
				756
				757	down_read(&mm1->mmap_sem);
				758	vma = find_vma(mm1, addr1);
				759	if (!vma || vma->vm_start > addr1) {
				760	put_page(kpage);
				761	up_read(&mm1->mmap_sem);
				762	return err;
				763	}
				764
				765	copy_user_highpage(kpage, page1, addr1, vma);
				766	err = try_to_merge_one_page(vma, page1, kpage);
				767	up_read(&mm1->mmap_sem);
				768
				769	if (!err) {
				770	down_read(&mm2->mmap_sem);
				771	vma = find_vma(mm2, addr2);
				772	if (!vma || vma->vm_start > addr2) {
				773	put_page(kpage);
				774	up_read(&mm2->mmap_sem);
				775	break_cow(mm1, addr1);
				776	return -EFAULT;
				777	}
				778
				779	err = try_to_merge_one_page(vma, page2, kpage);
				780	up_read(&mm2->mmap_sem);
				781
				782	/*
				783	* If the second try_to_merge_one_page failed, we have a
				784	* ksm page with just one pte pointing to it, so break it.
				785	*/
				786	if (err)
				787	break_cow(mm1, addr1);
				788	else
				789	ksm_pages_shared += 2;
				790	}
				791
				792	put_page(kpage);
				793	return err;
				794	}
				795
				796	/*
				797	* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
				798	* but no new kernel page is allocated: kpage must already be a ksm page.
				799	*/
				800	static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				801	unsigned long addr1,
				802	struct page *page1,
				803	struct page *kpage)
				804	{
				805	struct vm_area_struct *vma;
				806	int err = -EFAULT;
				807
				808	down_read(&mm1->mmap_sem);
				809	vma = find_vma(mm1, addr1);
				810	if (!vma || vma->vm_start > addr1) {
				811	up_read(&mm1->mmap_sem);
				812	return err;
				813	}
				814
				815	err = try_to_merge_one_page(vma, page1, kpage);
				816	up_read(&mm1->mmap_sem);
				817
				818	if (!err)
				819	ksm_pages_shared++;
				820
				821	return err;
				822	}
				823
				824	/*
				825	* stable_tree_search - search page inside the stable tree
				826	* @page: the page that we are searching identical pages to.
				827	* @page2: pointer into identical page that we are holding inside the stable
				828	* tree that we have found.
				829	* @rmap_item: the reverse mapping item
				830	*
				831	* This function checks if there is a page inside the stable tree
				832	* with identical content to the page that we are scanning right now.
				833	*
				834	* This function returns a pointer to the identical rmap_item if found,
				835	* NULL otherwise.
				836	*/
				837	static struct rmap_item *stable_tree_search(struct page *page,
				838	struct page **page2,
				839	struct rmap_item *rmap_item)
				840	{
				841	struct rb_node *node = root_stable_tree.rb_node;
				842
				843	while (node) {
				844	struct rmap_item *tree_rmap_item, *next_rmap_item;
				845	int ret;
				846
				847	tree_rmap_item = rb_entry(node, struct rmap_item, node);
				848	while (tree_rmap_item) {
				849	BUG_ON(!in_stable_tree(tree_rmap_item));
				850	cond_resched();
				851	page2[0] = get_ksm_page(tree_rmap_item);
				852	if (page2[0])
				853	break;
				854	next_rmap_item = tree_rmap_item->next;
				855	remove_rmap_item_from_tree(tree_rmap_item);
				856	tree_rmap_item = next_rmap_item;
				857	}
				858	if (!tree_rmap_item)
				859	return NULL;
				860
				861	ret = memcmp_pages(page, page2[0]);
				862
				863	if (ret < 0) {
				864	put_page(page2[0]);
				865	node = node->rb_left;
				866	} else if (ret > 0) {
				867	put_page(page2[0]);
				868	node = node->rb_right;
				869	} else {
				870	return tree_rmap_item;
				871	}
				872	}
				873
				874	return NULL;
				875	}
				876
				877	/*
				878	* stable_tree_insert - insert rmap_item pointing to new ksm page
				879	* into the stable tree.
				880	*
				881	* @page: the page that we are searching identical page to inside the stable
				882	* tree.
				883	* @rmap_item: pointer to the reverse mapping item.
				884	*
				885	* This function returns rmap_item if success, NULL otherwise.
				886	*/
				887	static struct rmap_item *stable_tree_insert(struct page *page,
				888	struct rmap_item *rmap_item)
				889	{
				890	struct rb_node **new = &root_stable_tree.rb_node;
				891	struct rb_node *parent = NULL;
				892
				893	while (*new) {
				894	struct rmap_item *tree_rmap_item, *next_rmap_item;
				895	struct page *tree_page;
				896	int ret;
				897
				898	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				899	while (tree_rmap_item) {
				900	BUG_ON(!in_stable_tree(tree_rmap_item));
				901	cond_resched();
				902	tree_page = get_ksm_page(tree_rmap_item);
				903	if (tree_page)
				904	break;
				905	next_rmap_item = tree_rmap_item->next;
				906	remove_rmap_item_from_tree(tree_rmap_item);
				907	tree_rmap_item = next_rmap_item;
				908	}
				909	if (!tree_rmap_item)
				910	return NULL;
				911
				912	ret = memcmp_pages(page, tree_page);
				913	put_page(tree_page);
				914
				915	parent = *new;
				916	if (ret < 0)
				917	new = &parent->rb_left;
				918	else if (ret > 0)
				919	new = &parent->rb_right;
				920	else {
				921	/*
				922	* It is not a bug that stable_tree_search() didn't
				923	* find this node: because at that time our page was
				924	* not yet write-protected, so may have changed since.
				925	*/
				926	return NULL;
				927	}
				928	}
				929
				930	ksm_kernel_pages_allocated++;
				931
				932	rmap_item->address \|= NODE_FLAG \| STABLE_FLAG;
				933	rmap_item->next = NULL;
				934	rb_link_node(&rmap_item->node, parent, new);
				935	rb_insert_color(&rmap_item->node, &root_stable_tree);
				936
				937	return rmap_item;
				938	}
				939
				940	/*
				941	* unstable_tree_search_insert - search and insert items into the unstable tree.
				942	*
				943	* @page: the page that we are going to search for identical page or to insert
				944	* into the unstable tree
				945	* @page2: pointer into identical page that was found inside the unstable tree
				946	* @rmap_item: the reverse mapping item of page
				947	*
				948	* This function searches for a page in the unstable tree identical to the
				949	* page currently being scanned; and if no identical page is found in the
				950	* tree, we insert rmap_item as a new object into the unstable tree.
				951	*
				952	* This function returns pointer to rmap_item found to be identical
				953	* to the currently scanned page, NULL otherwise.
				954	*
				955	* This function does both searching and inserting, because they share
				956	* the same walking algorithm in an rbtree.
				957	*/
				958	static struct rmap_item unstable_tree_search_insert(struct page page,
				959	struct page **page2,
				960	struct rmap_item *rmap_item)
				961	{
				962	struct rb_node **new = &root_unstable_tree.rb_node;
				963	struct rb_node *parent = NULL;
				964
				965	while (*new) {
				966	struct rmap_item *tree_rmap_item;
				967	int ret;
				968
				969	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				970	page2[0] = get_mergeable_page(tree_rmap_item);
				971	if (!page2[0])
				972	return NULL;
				973
				974	/*
				975	* Don't substitute an unswappable ksm page
				976	* just for one good swappable forked page.
				977	*/
				978	if (page == page2[0]) {
				979	put_page(page2[0]);
				980	return NULL;
				981	}
				982
				983	ret = memcmp_pages(page, page2[0]);
				984
				985	parent = *new;
				986	if (ret < 0) {
				987	put_page(page2[0]);
				988	new = &parent->rb_left;
				989	} else if (ret > 0) {
				990	put_page(page2[0]);
				991	new = &parent->rb_right;
				992	} else {
				993	return tree_rmap_item;
				994	}
				995	}
				996
				997	rmap_item->address \|= NODE_FLAG;
				998	rmap_item->address \|= (ksm_scan.seqnr & SEQNR_MASK);
				999	rb_link_node(&rmap_item->node, parent, new);
				1000	rb_insert_color(&rmap_item->node, &root_unstable_tree);
				1001
				1002	return NULL;
				1003	}
				1004
				1005	/*
				1006	* stable_tree_append - add another rmap_item to the linked list of
				1007	* rmap_items hanging off a given node of the stable tree, all sharing
				1008	* the same ksm page.
				1009	*/
				1010	static void stable_tree_append(struct rmap_item *rmap_item,
				1011	struct rmap_item *tree_rmap_item)
				1012	{
				1013	rmap_item->next = tree_rmap_item->next;
				1014	rmap_item->prev = tree_rmap_item;
				1015
				1016	if (tree_rmap_item->next)
				1017	tree_rmap_item->next->prev = rmap_item;
				1018
				1019	tree_rmap_item->next = rmap_item;
				1020	rmap_item->address \|= STABLE_FLAG;
				1021	}
				1022
				1023	/*
				1024	* cmp_and_merge_page - take a page computes its hash value and check if there
				1025	* is similar hash value to different page,
				1026	* in case we find that there is similar hash to different page we call to
				1027	* try_to_merge_two_pages().
				1028	*
				1029	* @page: the page that we are searching identical page to.
				1030	* @rmap_item: the reverse mapping into the virtual address of this page
				1031	*/
				1032	static void cmp_and_merge_page(struct page page, struct rmap_item rmap_item)
				1033	{
				1034	struct page *page2[1];
				1035	struct rmap_item *tree_rmap_item;
				1036	unsigned int checksum;
				1037	int err;
				1038
				1039	if (in_stable_tree(rmap_item))
				1040	remove_rmap_item_from_tree(rmap_item);
				1041
				1042	/* We first start with searching the page inside the stable tree */
				1043	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
				1044	if (tree_rmap_item) {
				1045	if (page == page2[0]) { /* forked */
				1046	ksm_pages_shared++;
				1047	err = 0;
				1048	} else
				1049	err = try_to_merge_with_ksm_page(rmap_item->mm,
				1050	rmap_item->address,
				1051	page, page2[0]);
				1052	put_page(page2[0]);
				1053
				1054	if (!err) {
				1055	/*
				1056	* The page was successfully merged:
				1057	* add its rmap_item to the stable tree.
				1058	*/
				1059	stable_tree_append(rmap_item, tree_rmap_item);
				1060	}
				1061	return;
				1062	}
				1063
				1064	/*
				1065	* A ksm page might have got here by fork, but its other
				1066	* references have already been removed from the stable tree.
				1067	*/
				1068	if (PageKsm(page))
				1069	break_cow(rmap_item->mm, rmap_item->address);
				1070
				1071	/*
				1072	* In case the hash value of the page was changed from the last time we
				1073	* have calculated it, this page to be changed frequely, therefore we
				1074	* don't want to insert it to the unstable tree, and we don't want to
				1075	* waste our time to search if there is something identical to it there.
				1076	*/
				1077	checksum = calc_checksum(page);
				1078	if (rmap_item->oldchecksum != checksum) {
				1079	rmap_item->oldchecksum = checksum;
				1080	return;
				1081	}
				1082
				1083	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
				1084	if (tree_rmap_item) {
				1085	err = try_to_merge_two_pages(rmap_item->mm,
				1086	rmap_item->address, page,
				1087	tree_rmap_item->mm,
				1088	tree_rmap_item->address, page2[0]);
				1089	/*
				1090	* As soon as we merge this page, we want to remove the
				1091	* rmap_item of the page we have merged with from the unstable
				1092	* tree, and insert it instead as new node in the stable tree.
				1093	*/
				1094	if (!err) {
				1095	rb_erase(&tree_rmap_item->node, &root_unstable_tree);
				1096	tree_rmap_item->address &= ~NODE_FLAG;
				1097	/*
				1098	* If we fail to insert the page into the stable tree,
				1099	* we will have 2 virtual addresses that are pointing
				1100	* to a ksm page left outside the stable tree,
				1101	* in which case we need to break_cow on both.
				1102	*/
				1103	if (stable_tree_insert(page2[0], tree_rmap_item))
				1104	stable_tree_append(rmap_item, tree_rmap_item);
				1105	else {
				1106	break_cow(tree_rmap_item->mm,
				1107	tree_rmap_item->address);
				1108	break_cow(rmap_item->mm, rmap_item->address);
				1109	ksm_pages_shared -= 2;
				1110	}
				1111	}
				1112
				1113	put_page(page2[0]);
				1114	}
				1115	}
				1116
				1117	static struct rmap_item get_next_rmap_item(struct mm_slot mm_slot,
				1118	struct list_head *cur,
				1119	unsigned long addr)
				1120	{
				1121	struct rmap_item *rmap_item;
				1122
				1123	while (cur != &mm_slot->rmap_list) {
				1124	rmap_item = list_entry(cur, struct rmap_item, link);
				1125	if ((rmap_item->address & PAGE_MASK) == addr) {
				1126	if (!in_stable_tree(rmap_item))
				1127	remove_rmap_item_from_tree(rmap_item);
				1128	return rmap_item;
				1129	}
				1130	if (rmap_item->address > addr)
				1131	break;
				1132	cur = cur->next;
				1133	remove_rmap_item_from_tree(rmap_item);
				1134	list_del(&rmap_item->link);
				1135	free_rmap_item(rmap_item);
				1136	}
				1137
				1138	rmap_item = alloc_rmap_item();
				1139	if (rmap_item) {
				1140	/* It has already been zeroed */
				1141	rmap_item->mm = mm_slot->mm;
				1142	rmap_item->address = addr;
				1143	list_add_tail(&rmap_item->link, cur);
				1144	}
				1145	return rmap_item;
				1146	}
				1147
				1148	static struct rmap_item scan_get_next_rmap_item(struct page *page)
				1149	{
				1150	struct mm_struct *mm;
				1151	struct mm_slot *slot;
				1152	struct vm_area_struct *vma;
				1153	struct rmap_item *rmap_item;
				1154
				1155	if (list_empty(&ksm_mm_head.mm_list))
				1156	return NULL;
				1157
				1158	slot = ksm_scan.mm_slot;
				1159	if (slot == &ksm_mm_head) {
				1160	root_unstable_tree = RB_ROOT;
				1161
				1162	spin_lock(&ksm_mmlist_lock);
				1163	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1164	ksm_scan.mm_slot = slot;
				1165	spin_unlock(&ksm_mmlist_lock);
				1166	next_mm:
				1167	ksm_scan.address = 0;
				1168	ksm_scan.rmap_item = list_entry(&slot->rmap_list,
				1169	struct rmap_item, link);
				1170	}
				1171
				1172	mm = slot->mm;
				1173	down_read(&mm->mmap_sem);
				1174	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
				1175	if (!(vma->vm_flags & VM_MERGEABLE))
				1176	continue;
				1177	if (ksm_scan.address < vma->vm_start)
				1178	ksm_scan.address = vma->vm_start;
				1179	if (!vma->anon_vma)
				1180	ksm_scan.address = vma->vm_end;
				1181
				1182	while (ksm_scan.address < vma->vm_end) {
				1183	*page = follow_page(vma, ksm_scan.address, FOLL_GET);
				1184	if (page && PageAnon(page)) {
				1185	flush_anon_page(vma, *page, ksm_scan.address);
				1186	flush_dcache_page(*page);
				1187	rmap_item = get_next_rmap_item(slot,
				1188	ksm_scan.rmap_item->link.next,
				1189	ksm_scan.address);
				1190	if (rmap_item) {
				1191	ksm_scan.rmap_item = rmap_item;
				1192	ksm_scan.address += PAGE_SIZE;
				1193	} else
				1194	put_page(*page);
				1195	up_read(&mm->mmap_sem);
				1196	return rmap_item;
				1197	}
				1198	if (*page)
				1199	put_page(*page);
				1200	ksm_scan.address += PAGE_SIZE;
				1201	cond_resched();
				1202	}
				1203	}
				1204
				1205	if (!ksm_scan.address) {
				1206	/*
				1207	* We've completed a full scan of all vmas, holding mmap_sem
				1208	* throughout, and found no VM_MERGEABLE: so do the same as
				1209	* __ksm_exit does to remove this mm from all our lists now.
				1210	*/
				1211	remove_mm_from_lists(mm);
				1212	up_read(&mm->mmap_sem);
				1213	slot = ksm_scan.mm_slot;
				1214	if (slot != &ksm_mm_head)
				1215	goto next_mm;
				1216	return NULL;
				1217	}
				1218
				1219	/*
				1220	* Nuke all the rmap_items that are above this current rmap:
				1221	* because there were no VM_MERGEABLE vmas with such addresses.
				1222	*/
				1223	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
				1224	up_read(&mm->mmap_sem);
				1225
				1226	spin_lock(&ksm_mmlist_lock);
				1227	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1228	ksm_scan.mm_slot = slot;
				1229	spin_unlock(&ksm_mmlist_lock);
				1230
				1231	/* Repeat until we've completed scanning the whole list */
				1232	if (slot != &ksm_mm_head)
				1233	goto next_mm;
				1234
				1235	/*
				1236	* Bump seqnr here rather than at top, so that __ksm_exit
				1237	* can skip rb_erase on unstable tree until we run again.
				1238	*/
				1239	ksm_scan.seqnr++;
				1240	return NULL;
				1241	}
				1242
				1243	/**
				1244	* ksm_do_scan - the ksm scanner main worker function.
				1245	* @scan_npages - number of pages we want to scan before we return.
				1246	*/
				1247	static void ksm_do_scan(unsigned int scan_npages)
				1248	{
				1249	struct rmap_item *rmap_item;
				1250	struct page *page;
				1251
				1252	while (scan_npages--) {
				1253	cond_resched();
				1254	rmap_item = scan_get_next_rmap_item(&page);
				1255	if (!rmap_item)
				1256	return;
				1257	if (!PageKsm(page) \|\| !in_stable_tree(rmap_item))
				1258	cmp_and_merge_page(page, rmap_item);
				1259	put_page(page);
				1260	}
				1261	}
				1262
				1263	static int ksm_scan_thread(void *nothing)
				1264	{
				1265	set_user_nice(current, 0);
				1266
				1267	while (!kthread_should_stop()) {
				1268	if (ksm_run & KSM_RUN_MERGE) {
				1269	mutex_lock(&ksm_thread_mutex);
				1270	ksm_do_scan(ksm_thread_pages_to_scan);
				1271	mutex_unlock(&ksm_thread_mutex);
				1272	schedule_timeout_interruptible(
				1273	msecs_to_jiffies(ksm_thread_sleep_millisecs));
				1274	} else {
				1275	wait_event_interruptible(ksm_thread_wait,
				1276	(ksm_run & KSM_RUN_MERGE) \|\|
				1277	kthread_should_stop());
				1278	}
				1279	}
				1280	return 0;
				1281	}
				1282
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1283	int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
				1284	unsigned long end, int advice, unsigned long *vm_flags)
				1285	{
				1286	struct mm_struct *mm = vma->vm_mm;
				1287
				1288	switch (advice) {
				1289	case MADV_MERGEABLE:
				1290	/*
				1291	* Be somewhat over-protective for now!
				1292	*/
				1293	if (*vm_flags & (VM_MERGEABLE \| VM_SHARED \| VM_MAYSHARE \|
				1294	VM_PFNMAP \| VM_IO \| VM_DONTEXPAND \|
				1295	VM_RESERVED \| VM_HUGETLB \| VM_INSERTPAGE \|
				1296	VM_MIXEDMAP \| VM_SAO))
				1297	return 0; /* just ignore the advice */
				1298
				1299	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
				1300	if (__ksm_enter(mm) < 0)
				1301	return -EAGAIN;
				1302
				1303	*vm_flags \|= VM_MERGEABLE;
				1304	break;
				1305
				1306	case MADV_UNMERGEABLE:
				1307	if (!(*vm_flags & VM_MERGEABLE))
				1308	return 0; /* just ignore the advice */
				1309
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	1310	if (vma->anon_vma)
				1311	unmerge_ksm_pages(vma, start, end);
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1312
				1313	*vm_flags &= ~VM_MERGEABLE;
				1314	break;
				1315	}
				1316
				1317	return 0;
				1318	}
				1319
				1320	int __ksm_enter(struct mm_struct *mm)
				1321	{
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	1322	struct mm_slot *mm_slot = alloc_mm_slot();
				1323	if (!mm_slot)
				1324	return -ENOMEM;
				1325
				1326	spin_lock(&ksm_mmlist_lock);
				1327	insert_to_mm_slots_hash(mm, mm_slot);
				1328	/*
				1329	* Insert just behind the scanning cursor, to let the area settle
				1330	* down a little; when fork is followed by immediate exec, we don't
				1331	* want ksmd to waste time setting up and tearing down an rmap_list.
				1332	*/
				1333	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
				1334	spin_unlock(&ksm_mmlist_lock);
				1335
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1336	set_bit(MMF_VM_MERGEABLE, &mm->flags);
				1337	return 0;
				1338	}
				1339
				1340	void __ksm_exit(struct mm_struct *mm)
				1341	{
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	1342	/*
				1343	* This process is exiting: doesn't hold and doesn't need mmap_sem;
				1344	* but we do need to exclude ksmd and other exiters while we modify
				1345	* the various lists and trees.
				1346	*/
				1347	mutex_lock(&ksm_thread_mutex);
				1348	remove_mm_from_lists(mm);
				1349	mutex_unlock(&ksm_thread_mutex);
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1350	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame^]	1351
				1352	#define KSM_ATTR_RO(_name) \
				1353	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
				1354	#define KSM_ATTR(_name) \
				1355	static struct kobj_attribute _name##_attr = \
				1356	__ATTR(_name, 0644, _name##_show, _name##_store)
				1357
				1358	static ssize_t sleep_millisecs_show(struct kobject *kobj,
				1359	struct kobj_attribute attr, char buf)
				1360	{
				1361	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
				1362	}
				1363
				1364	static ssize_t sleep_millisecs_store(struct kobject *kobj,
				1365	struct kobj_attribute *attr,
				1366	const char *buf, size_t count)
				1367	{
				1368	unsigned long msecs;
				1369	int err;
				1370
				1371	err = strict_strtoul(buf, 10, &msecs);
				1372	if (err \|\| msecs > UINT_MAX)
				1373	return -EINVAL;
				1374
				1375	ksm_thread_sleep_millisecs = msecs;
				1376
				1377	return count;
				1378	}
				1379	KSM_ATTR(sleep_millisecs);
				1380
				1381	static ssize_t pages_to_scan_show(struct kobject *kobj,
				1382	struct kobj_attribute attr, char buf)
				1383	{
				1384	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
				1385	}
				1386
				1387	static ssize_t pages_to_scan_store(struct kobject *kobj,
				1388	struct kobj_attribute *attr,
				1389	const char *buf, size_t count)
				1390	{
				1391	int err;
				1392	unsigned long nr_pages;
				1393
				1394	err = strict_strtoul(buf, 10, &nr_pages);
				1395	if (err \|\| nr_pages > UINT_MAX)
				1396	return -EINVAL;
				1397
				1398	ksm_thread_pages_to_scan = nr_pages;
				1399
				1400	return count;
				1401	}
				1402	KSM_ATTR(pages_to_scan);
				1403
				1404	static ssize_t run_show(struct kobject kobj, struct kobj_attribute attr,
				1405	char *buf)
				1406	{
				1407	return sprintf(buf, "%u\n", ksm_run);
				1408	}
				1409
				1410	static ssize_t run_store(struct kobject kobj, struct kobj_attribute attr,
				1411	const char *buf, size_t count)
				1412	{
				1413	int err;
				1414	unsigned long flags;
				1415
				1416	err = strict_strtoul(buf, 10, &flags);
				1417	if (err \|\| flags > UINT_MAX)
				1418	return -EINVAL;
				1419	if (flags > KSM_RUN_UNMERGE)
				1420	return -EINVAL;
				1421
				1422	/*
				1423	* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
				1424	* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
				1425	* breaking COW to free the kernel_pages_allocated (but leaves
				1426	* mm_slots on the list for when ksmd may be set running again).
				1427	*/
				1428
				1429	mutex_lock(&ksm_thread_mutex);
				1430	if (ksm_run != flags) {
				1431	ksm_run = flags;
				1432	if (flags & KSM_RUN_UNMERGE)
				1433	unmerge_and_remove_all_rmap_items();
				1434	}
				1435	mutex_unlock(&ksm_thread_mutex);
				1436
				1437	if (flags & KSM_RUN_MERGE)
				1438	wake_up_interruptible(&ksm_thread_wait);
				1439
				1440	return count;
				1441	}
				1442	KSM_ATTR(run);
				1443
				1444	static ssize_t pages_shared_show(struct kobject *kobj,
				1445	struct kobj_attribute attr, char buf)
				1446	{
				1447	return sprintf(buf, "%lu\n",
				1448	ksm_pages_shared - ksm_kernel_pages_allocated);
				1449	}
				1450	KSM_ATTR_RO(pages_shared);
				1451
				1452	static ssize_t kernel_pages_allocated_show(struct kobject *kobj,
				1453	struct kobj_attribute *attr,
				1454	char *buf)
				1455	{
				1456	return sprintf(buf, "%lu\n", ksm_kernel_pages_allocated);
				1457	}
				1458	KSM_ATTR_RO(kernel_pages_allocated);
				1459
				1460	static ssize_t max_kernel_pages_store(struct kobject *kobj,
				1461	struct kobj_attribute *attr,
				1462	const char *buf, size_t count)
				1463	{
				1464	int err;
				1465	unsigned long nr_pages;
				1466
				1467	err = strict_strtoul(buf, 10, &nr_pages);
				1468	if (err)
				1469	return -EINVAL;
				1470
				1471	ksm_max_kernel_pages = nr_pages;
				1472
				1473	return count;
				1474	}
				1475
				1476	static ssize_t max_kernel_pages_show(struct kobject *kobj,
				1477	struct kobj_attribute attr, char buf)
				1478	{
				1479	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
				1480	}
				1481	KSM_ATTR(max_kernel_pages);
				1482
				1483	static struct attribute *ksm_attrs[] = {
				1484	&sleep_millisecs_attr.attr,
				1485	&pages_to_scan_attr.attr,
				1486	&run_attr.attr,
				1487	&pages_shared_attr.attr,
				1488	&kernel_pages_allocated_attr.attr,
				1489	&max_kernel_pages_attr.attr,
				1490	NULL,
				1491	};
				1492
				1493	static struct attribute_group ksm_attr_group = {
				1494	.attrs = ksm_attrs,
				1495	.name = "ksm",
				1496	};
				1497
				1498	static int __init ksm_init(void)
				1499	{
				1500	struct task_struct *ksm_thread;
				1501	int err;
				1502
				1503	err = ksm_slab_init();
				1504	if (err)
				1505	goto out;
				1506
				1507	err = mm_slots_hash_init();
				1508	if (err)
				1509	goto out_free1;
				1510
				1511	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
				1512	if (IS_ERR(ksm_thread)) {
				1513	printk(KERN_ERR "ksm: creating kthread failed\n");
				1514	err = PTR_ERR(ksm_thread);
				1515	goto out_free2;
				1516	}
				1517
				1518	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
				1519	if (err) {
				1520	printk(KERN_ERR "ksm: register sysfs failed\n");
				1521	goto out_free3;
				1522	}
				1523
				1524	return 0;
				1525
				1526	out_free3:
				1527	kthread_stop(ksm_thread);
				1528	out_free2:
				1529	mm_slots_hash_free();
				1530	out_free1:
				1531	ksm_slab_free();
				1532	out:
				1533	return err;
				1534	}
				1535	module_init(ksm_init)