Blame - mm/ksm.c - kernel/msm-4.9

blob: c49bb7156a1d3fc7ba098dfaa3f69efbbc9aa488 [file] [log] [blame]

Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2	* Memory merging support.
				3	*
				4	* This code enables dynamic sharing of identical pages found in different
				5	* memory areas, even if they are not shared by fork()
				6	*
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	7	* Copyright (C) 2008-2009 Red Hat, Inc.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	8	* Authors:
				9	* Izik Eidus
				10	* Andrea Arcangeli
				11	* Chris Wright
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	12	* Hugh Dickins
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	13	*
				14	* This work is licensed under the terms of the GNU GPL, version 2.
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	15	*/
				16
				17	#include <linux/errno.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	18	#include <linux/mm.h>
				19	#include <linux/fs.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	20	#include <linux/mman.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	21	#include <linux/sched.h>
				22	#include <linux/rwsem.h>
				23	#include <linux/pagemap.h>
				24	#include <linux/rmap.h>
				25	#include <linux/spinlock.h>
				26	#include <linux/jhash.h>
				27	#include <linux/delay.h>
				28	#include <linux/kthread.h>
				29	#include <linux/wait.h>
				30	#include <linux/slab.h>
				31	#include <linux/rbtree.h>
				32	#include <linux/mmu_notifier.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	33	#include <linux/ksm.h>
				34
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	35	#include <asm/tlbflush.h>
				36
				37	/*
				38	* A few notes about the KSM scanning process,
				39	* to make it easier to understand the data structures below:
				40	*
				41	* In order to reduce excessive scanning, KSM sorts the memory pages by their
				42	* contents into a data structure that holds pointers to the pages' locations.
				43	*
				44	* Since the contents of the pages may change at any moment, KSM cannot just
				45	* insert the pages into a normal sorted tree and expect it to find anything.
				46	* Therefore KSM uses two data structures - the stable and the unstable tree.
				47	*
				48	* The stable tree holds pointers to all the merged pages (ksm pages), sorted
				49	* by their contents. Because each such page is write-protected, searching on
				50	* this tree is fully assured to be working (except when pages are unmapped),
				51	* and therefore this tree is called the stable tree.
				52	*
				53	* In addition to the stable tree, KSM uses a second data structure called the
				54	* unstable tree: this tree holds pointers to pages which have been found to
				55	* be "unchanged for a period of time". The unstable tree sorts these pages
				56	* by their contents, but since they are not write-protected, KSM cannot rely
				57	* upon the unstable tree to work correctly - the unstable tree is liable to
				58	* be corrupted as its contents are modified, and so it is called unstable.
				59	*
				60	* KSM solves this problem by several techniques:
				61	*
				62	* 1) The unstable tree is flushed every time KSM completes scanning all
				63	* memory areas, and then the tree is rebuilt again from the beginning.
				64	* 2) KSM will only insert into the unstable tree, pages whose hash value
				65	* has not changed since the previous scan of all memory areas.
				66	* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
				67	* colors of the nodes and not on their contents, assuring that even when
				68	* the tree gets "corrupted" it won't get out of balance, so scanning time
				69	* remains the same (also, searching and inserting nodes in an rbtree uses
				70	* the same algorithm, so we have no overhead when we flush and rebuild).
				71	* 4) KSM never flushes the stable tree, which means that even if it were to
				72	* take 10 attempts to find a page in the unstable tree, once it is found,
				73	* it is secured in the stable tree. (When we scan a new page, we first
				74	* compare it against the stable tree, and then against the unstable tree.)
				75	*/
				76
				77	/**
				78	* struct mm_slot - ksm information per mm that is being scanned
				79	* @link: link to the mm_slots hash list
				80	* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
				81	* @rmap_list: head for this mm_slot's list of rmap_items
				82	* @mm: the mm that this information is valid for
					*
					* Slots come zero-filled from mm_slot_cache via alloc_mm_slot(); @mm and
					* @rmap_list are filled in when the slot is hashed by
					* insert_to_mm_slots_hash().
				83	*/
				84	struct mm_slot {
				85	struct hlist_node link;
				86	struct list_head mm_list;
				87	struct list_head rmap_list;
				88	struct mm_struct *mm;
				89	};
				90
				91	/**
				92	* struct ksm_scan - cursor for scanning
				93	* @mm_slot: the current mm_slot we are scanning
				94	* @address: the next address inside that to be scanned
				95	* @rmap_item: the current rmap that we are scanning inside the rmap_list
				96	* @seqnr: count of completed full scans (needed when removing unstable node)
				97	*
				98	* There is only the one ksm_scan instance of this cursor structure.
					*
					* @seqnr is incremented when the cursor is moved back onto ksm_mm_head
					* (see remove_mm_from_lists() and unmerge_and_remove_all_rmap_items()).
				99	*/
				100	struct ksm_scan {
				101	struct mm_slot *mm_slot;
				102	unsigned long address;
				103	struct rmap_item *rmap_item;
				104	unsigned long seqnr;
				105	};
				106
				107	/**
				108	* struct rmap_item - reverse mapping item for virtual addresses
				109	* @link: link into mm_slot's rmap_list (rmap_list is per mm)
				110	* @mm: the memory structure this rmap_item is pointing into
				111	* @address: the virtual address this rmap_item tracks (+ flags in low bits)
				112	* @oldchecksum: previous checksum of the page at that virtual address
				113	* @node: rb_node of this rmap_item in either unstable or stable tree
				114	* @next: next rmap_item hanging off the same node of the stable tree
				115	* @prev: previous rmap_item hanging off the same node of the stable tree
					*
					* @next and @prev are pointers to other rmap_items (a struct cannot
					* contain itself by value). @oldchecksum shares storage with @next, and
					* @node shares storage with @prev; which member is live depends on the
					* flag bits held in the low bits of @address.
				116	*/
				117	struct rmap_item {
				118	struct list_head link;
				119	struct mm_struct *mm;
				120	unsigned long address; /* + low bits used for flags below */
				121	union {
				122	unsigned int oldchecksum; /* when unstable */
				123	struct rmap_item *next; /* when stable */
				124	};
				125	union {
				126	struct rb_node node; /* when tree node */
				127	struct rmap_item *prev; /* in stable list */
				128	};
				129	};
				130
					/*
					* Flag bits stored in the low bits of rmap_item->address.
					* remove_rmap_item_from_tree() clears them again with PAGE_MASK.
					*/
				131	#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
				132	#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
				133	#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
				134
				135	/* The stable and unstable tree heads */
				136	static struct rb_root root_stable_tree = RB_ROOT;
				137	static struct rb_root root_unstable_tree = RB_ROOT;
				138
					/* Bucket array is allocated in mm_slots_hash_init(), indexed from the mm pointer */
				139	#define MM_SLOTS_HASH_HEADS 1024
				140	static struct hlist_head *mm_slots_hash;
				141
					/* Sentinel slot: head of the list of all mm_slots; the scan cursor starts on it */
				142	static struct mm_slot ksm_mm_head = {
				143	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
				144	};
				145	static struct ksm_scan ksm_scan = {
				146	.mm_slot = &ksm_mm_head,
				147	};
				148
				149	static struct kmem_cache *rmap_item_cache;
				150	static struct kmem_cache *mm_slot_cache;
				151
				152	/* The number of nodes in the stable tree */
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	153	static unsigned long ksm_pages_shared;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	154
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	155	/* The number of page slots additionally sharing those nodes */
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	156	static unsigned long ksm_pages_sharing;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	157
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	158	/* The number of nodes in the unstable tree */
				159	static unsigned long ksm_pages_unshared;
				160
				161	/* The number of rmap_items in use: to calculate pages_volatile */
				162	static unsigned long ksm_rmap_items;
				163
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	164	/* Limit on the number of unswappable pages used */
					/* (0 disables the limit: see the check in try_to_merge_two_pages()) */
				165	static unsigned long ksm_max_kernel_pages;
				166
				167	/* Number of pages ksmd should scan in one batch */
				168	static unsigned int ksm_thread_pages_to_scan;
				169
				170	/* Milliseconds ksmd should sleep between batches */
				171	static unsigned int ksm_thread_sleep_millisecs;
				172
					/* NOTE(review): ksm_run presumably holds one of the KSM_RUN_* values - confirm against its writers */
				173	#define KSM_RUN_STOP 0
				174	#define KSM_RUN_MERGE 1
				175	#define KSM_RUN_UNMERGE 2
				176	static unsigned int ksm_run;
				177
				178	static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
				179	static DEFINE_MUTEX(ksm_thread_mutex);
				180	static DEFINE_SPINLOCK(ksm_mmlist_lock);
				181
					/*
					* Create a slab cache named "ksm_<struct>", sized and aligned for
					* struct <struct>; evaluates to the kmem_cache_create() result.
					*/
				182	#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
				183	sizeof(struct __struct), __alignof__(struct __struct),\
				184	(__flags), NULL)
				185
				186	static int __init ksm_slab_init(void)
				187	{
					/*
					* Create the rmap_item and mm_slot caches. Returns 0 on success;
					* on failure destroys whatever was already created and returns -ENOMEM.
					*/
				188	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
				189	if (!rmap_item_cache)
				190	goto out;
				191
				192	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
				193	if (!mm_slot_cache)
				194	goto out_free;
				195
				196	return 0;
				197
				198	out_free:
				199	kmem_cache_destroy(rmap_item_cache);
				200	out:
				201	return -ENOMEM;
				202	}
				203
				204	static void __init ksm_slab_free(void)
				205	{
					/* Destroy both caches; the NULL mm_slot_cache makes alloc_mm_slot() refuse from now on */
				206	kmem_cache_destroy(mm_slot_cache);
				207	kmem_cache_destroy(rmap_item_cache);
				208	mm_slot_cache = NULL;
				209	}
				210
				211	static inline struct rmap_item *alloc_rmap_item(void)
				212	{
					/* Returns a zeroed rmap_item (counted in ksm_rmap_items), or NULL on allocation failure */
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	213	struct rmap_item *rmap_item;
				214
				215	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
				216	if (rmap_item)
				217	ksm_rmap_items++;
				218	return rmap_item;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	219	}
				220
				221	static inline void free_rmap_item(struct rmap_item *rmap_item)
				222	{
					/* Undo alloc_rmap_item(): drop the ksm_rmap_items count and return the item to its cache */
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	223	ksm_rmap_items--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	224	rmap_item->mm = NULL; /* debug safety */
				225	kmem_cache_free(rmap_item_cache, rmap_item);
				226	}
				227
				228	static inline struct mm_slot *alloc_mm_slot(void)
				229	{
					/* Returns a zeroed mm_slot, or NULL if the cache is absent or the allocation fails */
				230	if (!mm_slot_cache) /* initialization failed */
				231	return NULL;
				232	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
				233	}
				234
				235	static inline void free_mm_slot(struct mm_slot *mm_slot)
				236	{
					/* Return @mm_slot to mm_slot_cache */
				237	kmem_cache_free(mm_slot_cache, mm_slot);
				238	}
				239
				240	static int __init mm_slots_hash_init(void)
				241	{
					/* Allocate MM_SLOTS_HASH_HEADS zeroed buckets; returns 0, or -ENOMEM on failure */
				242	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				243	GFP_KERNEL);
				244	if (!mm_slots_hash)
				245	return -ENOMEM;
				246	return 0;
				247	}
				248
				249	static void __init mm_slots_hash_free(void)
				250	{
					/* Release the bucket array allocated by mm_slots_hash_init() */
				251	kfree(mm_slots_hash);
				252	}
				253
				254	static struct mm_slot *get_mm_slot(struct mm_struct *mm)
				255	{
					/*
					* Look up the mm_slot hashed for @mm. The bucket is chosen from the
					* mm pointer exactly as in insert_to_mm_slots_hash().
					* Returns the slot, or NULL if @mm has none.
					*/
				256	struct mm_slot *mm_slot;
				257	struct hlist_head *bucket;
				258	struct hlist_node *node;
				259
				260	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				261	% MM_SLOTS_HASH_HEADS];
				262	hlist_for_each_entry(mm_slot, node, bucket, link) {
				263	if (mm == mm_slot->mm)
				264	return mm_slot;
				265	}
				266	return NULL;
				267	}
				268
				269	static void insert_to_mm_slots_hash(struct mm_struct *mm,
				270	struct mm_slot *mm_slot)
				271	{
					/*
					* Bind @mm_slot to @mm, initialise its empty rmap_list and add it to
					* the hash bucket derived from the mm pointer (same formula as
					* get_mm_slot()).
					*/
				272	struct hlist_head *bucket;
				273
				274	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				275	% MM_SLOTS_HASH_HEADS];
				276	mm_slot->mm = mm;
				277	INIT_LIST_HEAD(&mm_slot->rmap_list);
				278	hlist_add_head(&mm_slot->link, bucket);
				279	}
				280
				281	static inline int in_stable_tree(struct rmap_item *rmap_item)
				282	{
					/* Non-zero (the STABLE_FLAG bit itself, not 1) when the flag is set in ->address */
				283	return rmap_item->address & STABLE_FLAG;
				284	}
				285
				286	/*
				287	* We use break_ksm to break COW on a ksm page: it's a stripped down
				288	*
				289	* if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
				290	* put_page(page);
				291	*
				292	* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
				293	* in case the application has unmapped and remapped mm,addr meanwhile.
				294	* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
				295	* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
				296	*/
				297	static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
				298	{
				299	struct page *page;
				300	int ret;
				301
				302	do {
				303	cond_resched();
				304	page = follow_page(vma, addr, FOLL_GET);
				305	if (!page)
				306	break;
				307	if (PageKsm(page))
				308	ret = handle_mm_fault(vma->vm_mm, vma, addr,
				309	FAULT_FLAG_WRITE);
				310	else
					/* not a ksm page: nothing to break, so make the loop terminate */
				311	ret = VM_FAULT_WRITE;
				312	put_page(page);
				313	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS)));
				314
				315	/* Which leaves us looping there if VM_FAULT_OOM: hmmm... */
				316	}
				317
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	318	static void break_cow(struct mm_struct *mm, unsigned long addr)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	319	{
					/*
					* Under mmap_sem (read), call break_ksm() on @addr provided it still
					* lies in a VM_MERGEABLE vma that has an anon_vma; otherwise do nothing.
					*/
				320	struct vm_area_struct *vma;
				321
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	322	down_read(&mm->mmap_sem);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	323	vma = find_vma(mm, addr);
				324	if (!vma || vma->vm_start > addr)
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	325	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	326	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	327	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	328	break_ksm(vma, addr);
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	329	out:
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	330	up_read(&mm->mmap_sem);
				331	}
				332
				333	static struct page *get_mergeable_page(struct rmap_item *rmap_item)
				334	{
					/*
					* Returns the page found at rmap_item->address, with the reference
					* taken by follow_page(FOLL_GET), if the address lies in a
					* VM_MERGEABLE vma with an anon_vma and the page is PageAnon.
					* Returns NULL in every other case. mmap_sem is held for read
					* across the lookup.
					*/
				335	struct mm_struct *mm = rmap_item->mm;
				336	unsigned long addr = rmap_item->address;
				337	struct vm_area_struct *vma;
				338	struct page *page;
				339
				340	down_read(&mm->mmap_sem);
				341	vma = find_vma(mm, addr);
				342	if (!vma || vma->vm_start > addr)
				343	goto out;
				344	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				345	goto out;
				346
				347	page = follow_page(vma, addr, FOLL_GET);
				348	if (!page)
				349	goto out;
				350	if (PageAnon(page)) {
				351	flush_anon_page(vma, page, addr);
				352	flush_dcache_page(page);
				353	} else {
				354	put_page(page);
					/* every failure path above jumps here, into the else branch */
				355	out: page = NULL;
				356	}
				357	up_read(&mm->mmap_sem);
				358	return page;
				359	}
				360
				361	/*
				362	* get_ksm_page: checks if the page at the virtual address in rmap_item
				363	* is still PageKsm, in which case we can trust the content of the page,
				364	* and it returns the gotten page; but NULL if the page has been zapped.
					*
					* A non-NULL result carries the reference taken by get_mergeable_page();
					* the caller must put_page() it.
				365	*/
				366	static struct page *get_ksm_page(struct rmap_item *rmap_item)
				367	{
				368	struct page *page;
				369
				370	page = get_mergeable_page(rmap_item);
				371	if (page && !PageKsm(page)) {
				372	put_page(page);
				373	page = NULL;
				374	}
				375	return page;
				376	}
				377
				378	/*
				379	* Removing rmap_item from stable or unstable tree.
				380	* This function will clean the information from the stable/unstable tree.
					*
					* For a stable-tree node with followers, the first follower is promoted
					* to tree node in its place; a plain list item is simply unlinked.
					* On return the flag bits in rmap_item->address have been cleared.
				381	*/
				382	static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
				383	{
				384	if (in_stable_tree(rmap_item)) {
				385	struct rmap_item *next_item = rmap_item->next;
				386
				387	if (rmap_item->address & NODE_FLAG) {
				388	if (next_item) {
				389	rb_replace_node(&rmap_item->node,
				390	&next_item->node,
				391	&root_stable_tree);
				392	next_item->address |= NODE_FLAG;
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	393	ksm_pages_sharing--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	394	} else {
				395	rb_erase(&rmap_item->node, &root_stable_tree);
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	396	ksm_pages_shared--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	397	}
				398	} else {
				399	struct rmap_item *prev_item = rmap_item->prev;
				400
				401	BUG_ON(prev_item->next != rmap_item);
				402	prev_item->next = next_item;
				403	if (next_item) {
				404	BUG_ON(next_item->prev != rmap_item);
				405	next_item->prev = rmap_item->prev;
				406	}
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	407	ksm_pages_sharing--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	408	}
				409
				410	rmap_item->next = NULL;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	411
				412	} else if (rmap_item->address & NODE_FLAG) {
				413	unsigned char age;
				414	/*
				415	* ksm_thread can and must skip the rb_erase, because
				416	* root_unstable_tree was already reset to RB_ROOT.
				417	* But __ksm_exit has to be careful: do the rb_erase
				418	* if it's interrupting a scan, and this rmap_item was
				419	* inserted by this scan rather than left from before.
				420	*
				421	* Because of the case in which remove_mm_from_lists
				422	* increments seqnr before removing rmaps, unstable_nr
				423	* may even be 2 behind seqnr, but should never be
				424	* further behind. Yes, I did have trouble with this!
				425	*/
				426	age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
				427	BUG_ON(age > 2);
				428	if (!age)
				429	rb_erase(&rmap_item->node, &root_unstable_tree);
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	430	ksm_pages_unshared--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	431	}
				432
				433	rmap_item->address &= PAGE_MASK;
				434
				435	cond_resched(); /* we're called from many long loops */
				436	}
				437
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	438	static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				439	struct list_head *cur)
				440	{
					/*
					* Remove from its tree, unlink and free every rmap_item from @cur to
					* the end of @mm_slot's rmap_list. The cursor is advanced before the
					* current item is freed.
					*/
				441	struct rmap_item *rmap_item;
				442
				443	while (cur != &mm_slot->rmap_list) {
				444	rmap_item = list_entry(cur, struct rmap_item, link);
				445	cur = cur->next;
				446	remove_rmap_item_from_tree(rmap_item);
				447	list_del(&rmap_item->link);
				448	free_rmap_item(rmap_item);
				449	}
				450	}
				451
				452	/*
				453	* Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
				454	* than check every pte of a given vma, the locking doesn't quite work for
				455	* that - an rmap_item is assigned to the stable tree after inserting ksm
				456	* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
				457	* rmap_items from parent to child at fork time (so as not to waste time
				458	* if exit comes before the next scan reaches it).
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	459	*
				460	* Similarly, although we'd like to remove rmap_items (so updating counts
				461	* and freeing memory) when unmerging an area, it's easier to leave that
				462	* to the next pass of ksmd - consider, for example, how ksmd might be
				463	* in cmp_and_merge_page on one of the rmap_items we would be removing.
					*
					* Calls break_ksm() at every PAGE_SIZE step of [start, end) in @vma.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	464	*/
				465	static void unmerge_ksm_pages(struct vm_area_struct *vma,
				466	unsigned long start, unsigned long end)
				467	{
				468	unsigned long addr;
				469
				470	for (addr = start; addr < end; addr += PAGE_SIZE)
				471	break_ksm(vma, addr);
				472	}
				473
				474	static void unmerge_and_remove_all_rmap_items(void)
				475	{
					/*
					* For every mm on the ksm_mm_head list: unmerge each VM_MERGEABLE
					* vma that has an anon_vma, then free all of that mm's rmap_items,
					* holding its mmap_sem for read throughout. Finally park the scan
					* cursor back on ksm_mm_head, bumping seqnr if it was elsewhere.
					*/
				476	struct mm_slot *mm_slot;
				477	struct mm_struct *mm;
				478	struct vm_area_struct *vma;
				479
				480	list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) {
				481	mm = mm_slot->mm;
				482	down_read(&mm->mmap_sem);
				483	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				484	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				485	continue;
				486	unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
				487	}
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	488	remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	489	up_read(&mm->mmap_sem);
				490	}
				491
				492	spin_lock(&ksm_mmlist_lock);
				493	if (ksm_scan.mm_slot != &ksm_mm_head) {
				494	ksm_scan.mm_slot = &ksm_mm_head;
				495	ksm_scan.seqnr++;
				496	}
				497	spin_unlock(&ksm_mmlist_lock);
				498	}
				499
				500	static void remove_mm_from_lists(struct mm_struct *mm)
				501	{
					/*
					* Under ksm_mmlist_lock: advance the scan cursor if it sits on @mm's
					* slot, then unhash and unlink the slot. After dropping the lock,
					* free the slot's rmap_items and the slot itself, and clear
					* MMF_VM_MERGEABLE on @mm.
					* NOTE(review): get_mm_slot() can return NULL and the result is used
					* unchecked - presumably callers guarantee @mm is hashed; confirm.
					*/
				502	struct mm_slot *mm_slot;
				503
				504	spin_lock(&ksm_mmlist_lock);
				505	mm_slot = get_mm_slot(mm);
				506
				507	/*
				508	* This mm_slot is always at the scanning cursor when we're
				509	* called from scan_get_next_rmap_item; but it's a special
				510	* case when we're called from __ksm_exit.
				511	*/
				512	if (ksm_scan.mm_slot == mm_slot) {
				513	ksm_scan.mm_slot = list_entry(
				514	mm_slot->mm_list.next, struct mm_slot, mm_list);
				515	ksm_scan.address = 0;
				516	ksm_scan.rmap_item = list_entry(
				517	&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
				518	if (ksm_scan.mm_slot == &ksm_mm_head)
				519	ksm_scan.seqnr++;
				520	}
				521
				522	hlist_del(&mm_slot->link);
				523	list_del(&mm_slot->mm_list);
				524	spin_unlock(&ksm_mmlist_lock);
				525
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	526	remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	527	free_mm_slot(mm_slot);
				528	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				529	}
				530
				531	static u32 calc_checksum(struct page *page)
				532	{
					/* jhash2 over the page as PAGE_SIZE / 4 32-bit words, seed 17, via a KM_USER0 atomic mapping */
				533	u32 checksum;
				534	void *addr = kmap_atomic(page, KM_USER0);
				535	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
				536	kunmap_atomic(addr, KM_USER0);
				537	return checksum;
				538	}
				539
				540	static int memcmp_pages(struct page *page1, struct page *page2)
				541	{
					/*
					* memcmp() the full PAGE_SIZE contents of the two pages and return
					* its result. The pages are mapped with KM_USER0 and KM_USER1 and
					* unmapped in the reverse order.
					*/
				542	char *addr1, *addr2;
				543	int ret;
				544
				545	addr1 = kmap_atomic(page1, KM_USER0);
				546	addr2 = kmap_atomic(page2, KM_USER1);
				547	ret = memcmp(addr1, addr2, PAGE_SIZE);
				548	kunmap_atomic(addr2, KM_USER1);
				549	kunmap_atomic(addr1, KM_USER0);
				550	return ret;
				551	}
				552
				553	static inline int pages_identical(struct page *page1, struct page *page2)
				554	{
					/* 1 when memcmp_pages() finds the two pages byte-for-byte equal, else 0 */
				555	return !memcmp_pages(page1, page2);
				556	}
				557
				558	static int write_protect_page(struct vm_area_struct *vma, struct page *page,
				559	pte_t *orig_pte)
				560	{
					/*
					* Make the pte that maps @page in @vma read-only (if it was
					* writable) and store the resulting pte value through @orig_pte.
					* Returns 0 on success. Returns -EFAULT if the page has no address
					* in @vma, if its pte cannot be found, or if page_count does not
					* match mapcount + 2 + swapcache (the pte is then restored as it was).
					*/
				561	struct mm_struct *mm = vma->vm_mm;
				562	unsigned long addr;
				563	pte_t *ptep;
				564	spinlock_t *ptl;
				565	int swapped;
				566	int err = -EFAULT;
				567
				568	addr = page_address_in_vma(page, vma);
				569	if (addr == -EFAULT)
				570	goto out;
				571
				572	ptep = page_check_address(page, mm, addr, &ptl, 0);
				573	if (!ptep)
				574	goto out;
				575
				576	if (pte_write(*ptep)) {
				577	pte_t entry;
				578
				579	swapped = PageSwapCache(page);
				580	flush_cache_page(vma, addr, page_to_pfn(page));
				581	/*
				582	* Ok this is tricky, when get_user_pages_fast() run it doesnt
				583	* take any lock, therefore the check that we are going to make
				584	* with the pagecount against the mapcount is racey and
				585	* O_DIRECT can happen right after the check.
				586	* So we clear the pte and flush the tlb before the check
				587	* this assure us that no O_DIRECT can happen after the check
				588	* or in the middle of the check.
				589	*/
				590	entry = ptep_clear_flush(vma, addr, ptep);
				591	/*
				592	* Check that no O_DIRECT or similar I/O is in progress on the
				593	* page
				594	*/
				595	if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
				596	set_pte_at_notify(mm, addr, ptep, entry);
				597	goto out_unlock;
				598	}
				599	entry = pte_wrprotect(entry);
				600	set_pte_at_notify(mm, addr, ptep, entry);
				601	}
					/* hand the pte value, not the pte pointer, back to the caller */
				602	*orig_pte = *ptep;
				603	err = 0;
				604
				605	out_unlock:
				606	pte_unmap_unlock(ptep, ptl);
				607	out:
				608	return err;
				609	}
				610
				611	/**
				612	* replace_page - replace page in vma by new ksm page
				613	* @vma: vma that holds the pte pointing to oldpage
				614	* @oldpage: the page we are replacing by newpage
				615	* @newpage: the ksm page we replace oldpage by
				616	* @orig_pte: the original value of the pte
				617	*
					* The swap is made only if the pte still equals @orig_pte once the
					* page table lock is held. On success @newpage gains a reference and
					* a ksm rmap, while @oldpage loses its rmap and one reference.
					*
				618	* Returns 0 on success, -EFAULT on failure.
				619	*/
				620	static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
				621	struct page *newpage, pte_t orig_pte)
				622	{
				623	struct mm_struct *mm = vma->vm_mm;
				624	pgd_t *pgd;
				625	pud_t *pud;
				626	pmd_t *pmd;
				627	pte_t *ptep;
				628	spinlock_t *ptl;
				629	unsigned long addr;
				630	pgprot_t prot;
				631	int err = -EFAULT;
				632
				633	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
				634
				635	addr = page_address_in_vma(oldpage, vma);
				636	if (addr == -EFAULT)
				637	goto out;
				638
				639	pgd = pgd_offset(mm, addr);
				640	if (!pgd_present(*pgd))
				641	goto out;
				642
				643	pud = pud_offset(pgd, addr);
				644	if (!pud_present(*pud))
				645	goto out;
				646
				647	pmd = pmd_offset(pud, addr);
				648	if (!pmd_present(*pmd))
				649	goto out;
				650
				651	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
				652	if (!pte_same(*ptep, orig_pte)) {
				653	pte_unmap_unlock(ptep, ptl);
				654	goto out;
				655	}
				656
				657	get_page(newpage);
				658	page_add_ksm_rmap(newpage);
				659
				660	flush_cache_page(vma, addr, pte_pfn(*ptep));
				661	ptep_clear_flush(vma, addr, ptep);
				662	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
				663
				664	page_remove_rmap(oldpage);
				665	put_page(oldpage);
				666
				667	pte_unmap_unlock(ptep, ptl);
				668	err = 0;
				669	out:
				670	return err;
				671	}
				672
				673	/*
				674	* try_to_merge_one_page - take two pages and merge them into one
				675	* @vma: the vma that hold the pte pointing into oldpage
				676	* @oldpage: the page that we want to replace with newpage
				677	* @newpage: the page that we want to map instead of oldpage
				678	*
				679	* Note:
				680	* oldpage should be a PageAnon page, while newpage should be a PageKsm page,
				681	* or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
				682	*
					* Temporary references are taken on both pages for the attempt and
					* dropped again before returning. Failure to trylock oldpage is
					* reported as -EFAULT like any other failure.
					*
				683	* This function returns 0 if the pages were merged, -EFAULT otherwise.
				684	*/
				685	static int try_to_merge_one_page(struct vm_area_struct *vma,
				686	struct page *oldpage,
				687	struct page *newpage)
				688	{
				689	pte_t orig_pte = __pte(0);
				690	int err = -EFAULT;
				691
				692	if (!(vma->vm_flags & VM_MERGEABLE))
				693	goto out;
				694
				695	if (!PageAnon(oldpage))
				696	goto out;
				697
				698	get_page(newpage);
				699	get_page(oldpage);
				700
				701	/*
				702	* We need the page lock to read a stable PageSwapCache in
				703	* write_protect_page(). We use trylock_page() instead of
				704	* lock_page() because we don't want to wait here - we
				705	* prefer to continue scanning and merging different pages,
				706	* then come back to this page when it is unlocked.
				707	*/
				708	if (!trylock_page(oldpage))
				709	goto out_putpage;
				710	/*
				711	* If this anonymous page is mapped only here, its pte may need
				712	* to be write-protected. If it's mapped elsewhere, all of its
				713	* ptes are necessarily already write-protected. But in either
				714	* case, we need to lock and check page_count is not raised.
				715	*/
				716	if (write_protect_page(vma, oldpage, &orig_pte)) {
				717	unlock_page(oldpage);
				718	goto out_putpage;
				719	}
				720	unlock_page(oldpage);
				721
				722	if (pages_identical(oldpage, newpage))
				723	err = replace_page(vma, oldpage, newpage, orig_pte);
				724
				725	out_putpage:
				726	put_page(oldpage);
				727	put_page(newpage);
				728	out:
				729	return err;
				730	}
				731
				732	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	733	* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
				734	* but no new kernel page is allocated: kpage must already be a ksm page.
					*
					* Looks up the vma covering @addr1 in @mm1 under mmap_sem (read) and
					* passes it to try_to_merge_one_page(). Returns that result, or
					* -EFAULT if no vma covers @addr1.
				735	*/
				736	static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				737	unsigned long addr1,
				738	struct page *page1,
				739	struct page *kpage)
				740	{
				741	struct vm_area_struct *vma;
				742	int err = -EFAULT;
				743
				744	down_read(&mm1->mmap_sem);
				745	vma = find_vma(mm1, addr1);
				746	if (!vma || vma->vm_start > addr1)
				747	goto out;
				748
				749	err = try_to_merge_one_page(vma, page1, kpage);
				750	out:
				751	up_read(&mm1->mmap_sem);
				752	return err;
				753	}
				754
				755	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	756	* try_to_merge_two_pages - take two identical pages and prepare them
				757	* to be merged into one page.
				758	*
				759	* This function returns 0 if we successfully mapped two identical pages
				760	* into one page, -EFAULT otherwise.
				761	*
				762	* Note that this function allocates a new kernel page: if one of the pages
				763	* is already a ksm page, try_to_merge_with_ksm_page should be used.
					*
					* The allocation reference on the new page is always dropped before
					* returning; if the second merge fails, break_cow() undoes the first.
				764	*/
				765	static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				766	struct page *page1, struct mm_struct *mm2,
				767	unsigned long addr2, struct page *page2)
				768	{
				769	struct vm_area_struct *vma;
				770	struct page *kpage;
				771	int err = -EFAULT;
				772
				773	/*
				774	* The number of nodes in the stable tree
				775	* is the number of kernel pages that we hold.
				776	*/
				777	if (ksm_max_kernel_pages &&
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	778	ksm_max_kernel_pages <= ksm_pages_shared)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	779	return err;
				780
				781	kpage = alloc_page(GFP_HIGHUSER);
				782	if (!kpage)
				783	return err;
				784
				785	down_read(&mm1->mmap_sem);
				786	vma = find_vma(mm1, addr1);
				787	if (!vma || vma->vm_start > addr1) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	788	up_read(&mm1->mmap_sem);
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	789	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	790	}
				791
				792	copy_user_highpage(kpage, page1, addr1, vma);
				793	err = try_to_merge_one_page(vma, page1, kpage);
				794	up_read(&mm1->mmap_sem);
				795
				796	if (!err) {
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	797	err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	798	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	799	* If that fails, we have a ksm page with only one pte
				800	* pointing to it: so break it.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	801	*/
				802	if (err)
				803	break_cow(mm1, addr1);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	804	}
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	805	out:
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	806	put_page(kpage);
				807	return err;
				808	}
				809
				810	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	811	* stable_tree_search - search page inside the stable tree
				812	* @page: the page that we are searching identical pages to.
				813	* @page2: pointer into identical page that we are holding inside the stable
				814	* tree that we have found.
				815	* @rmap_item: the reverse mapping item
				816	*
				817	* This function checks if there is a page inside the stable tree
				818	* with identical content to the page that we are scanning right now.
				819	*
					* At each tree node, rmap_items whose ksm page can no longer be obtained
					* are removed from the tree until a live one is found. On a match,
					* page2[0] is returned with the reference from get_ksm_page() still held.
					*
				820	* This function return rmap_item pointer to the identical item if found,
				821	* NULL otherwise.
				822	*/
				823	static struct rmap_item *stable_tree_search(struct page *page,
				824	struct page **page2,
				825	struct rmap_item *rmap_item)
				826	{
				827	struct rb_node *node = root_stable_tree.rb_node;
				828
				829	while (node) {
				830	struct rmap_item *tree_rmap_item, *next_rmap_item;
				831	int ret;
				832
				833	tree_rmap_item = rb_entry(node, struct rmap_item, node);
				834	while (tree_rmap_item) {
				835	BUG_ON(!in_stable_tree(tree_rmap_item));
				836	cond_resched();
				837	page2[0] = get_ksm_page(tree_rmap_item);
				838	if (page2[0])
				839	break;
				840	next_rmap_item = tree_rmap_item->next;
				841	remove_rmap_item_from_tree(tree_rmap_item);
				842	tree_rmap_item = next_rmap_item;
				843	}
				844	if (!tree_rmap_item)
				845	return NULL;
				846
				847	ret = memcmp_pages(page, page2[0]);
				848
				849	if (ret < 0) {
				850	put_page(page2[0]);
				851	node = node->rb_left;
				852	} else if (ret > 0) {
				853	put_page(page2[0]);
				854	node = node->rb_right;
				855	} else {
				856	return tree_rmap_item;
				857	}
				858	}
				859
				860	return NULL;
				861	}
				862
				863	/*
				864	* stable_tree_insert - insert rmap_item pointing to new ksm page
				865	* into the stable tree.
				866	*
				867	* @page: the page that we are searching identical page to inside the stable
				868	* tree.
				869	* @rmap_item: pointer to the reverse mapping item.
				870	*
				871	* This function returns rmap_item if success, NULL otherwise.
				872	*/
				873	static struct rmap_item stable_tree_insert(struct page page,
				874	struct rmap_item *rmap_item)
				875	{
				876	struct rb_node **new = &root_stable_tree.rb_node;
				877	struct rb_node *parent = NULL;
				878
				879	while (*new) {
				880	struct rmap_item tree_rmap_item, next_rmap_item;
				881	struct page *tree_page;
				882	int ret;
				883
				884	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				885	while (tree_rmap_item) {
				886	BUG_ON(!in_stable_tree(tree_rmap_item));
				887	cond_resched();
				888	tree_page = get_ksm_page(tree_rmap_item);
				889	if (tree_page)
				890	break;
				891	next_rmap_item = tree_rmap_item->next;
				892	remove_rmap_item_from_tree(tree_rmap_item);
				893	tree_rmap_item = next_rmap_item;
				894	}
				895	if (!tree_rmap_item)
				896	return NULL;
				897
				898	ret = memcmp_pages(page, tree_page);
				899	put_page(tree_page);
				900
				901	parent = *new;
				902	if (ret < 0)
				903	new = &parent->rb_left;
				904	else if (ret > 0)
				905	new = &parent->rb_right;
				906	else {
				907	/*
				908	* It is not a bug that stable_tree_search() didn't
				909	* find this node: because at that time our page was
				910	* not yet write-protected, so may have changed since.
				911	*/
				912	return NULL;
				913	}
				914	}
				915
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	916	rmap_item->address \|= NODE_FLAG \| STABLE_FLAG;
				917	rmap_item->next = NULL;
				918	rb_link_node(&rmap_item->node, parent, new);
				919	rb_insert_color(&rmap_item->node, &root_stable_tree);
				920
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	921	ksm_pages_shared++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	922	return rmap_item;
				923	}
				924
				925	/*
				926	* unstable_tree_search_insert - search and insert items into the unstable tree.
				927	*
				928	* @page: the page that we are going to search for identical page or to insert
				929	* into the unstable tree
				930	* @page2: pointer into identical page that was found inside the unstable tree
				931	* @rmap_item: the reverse mapping item of page
				932	*
				933	* This function searches for a page in the unstable tree identical to the
				934	* page currently being scanned; and if no identical page is found in the
				935	* tree, we insert rmap_item as a new object into the unstable tree.
				936	*
				937	* This function returns pointer to rmap_item found to be identical
				938	* to the currently scanned page, NULL otherwise.
				939	*
				940	* This function does both searching and inserting, because they share
				941	* the same walking algorithm in an rbtree.
				942	*/
				943	static struct rmap_item unstable_tree_search_insert(struct page page,
				944	struct page **page2,
				945	struct rmap_item *rmap_item)
				946	{
				947	struct rb_node **new = &root_unstable_tree.rb_node;
				948	struct rb_node *parent = NULL;
				949
				950	while (*new) {
				951	struct rmap_item *tree_rmap_item;
				952	int ret;
				953
				954	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				955	page2[0] = get_mergeable_page(tree_rmap_item);
				956	if (!page2[0])
				957	return NULL;
				958
				959	/*
				960	* Don't substitute an unswappable ksm page
				961	* just for one good swappable forked page.
				962	*/
				963	if (page == page2[0]) {
				964	put_page(page2[0]);
				965	return NULL;
				966	}
				967
				968	ret = memcmp_pages(page, page2[0]);
				969
				970	parent = *new;
				971	if (ret < 0) {
				972	put_page(page2[0]);
				973	new = &parent->rb_left;
				974	} else if (ret > 0) {
				975	put_page(page2[0]);
				976	new = &parent->rb_right;
				977	} else {
				978	return tree_rmap_item;
				979	}
				980	}
				981
				982	rmap_item->address \|= NODE_FLAG;
				983	rmap_item->address \|= (ksm_scan.seqnr & SEQNR_MASK);
				984	rb_link_node(&rmap_item->node, parent, new);
				985	rb_insert_color(&rmap_item->node, &root_unstable_tree);
				986
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	987	ksm_pages_unshared++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	988	return NULL;
				989	}
				990
				991	/*
				992	* stable_tree_append - add another rmap_item to the linked list of
				993	* rmap_items hanging off a given node of the stable tree, all sharing
				994	* the same ksm page.
				995	*/
				996	static void stable_tree_append(struct rmap_item *rmap_item,
				997	struct rmap_item *tree_rmap_item)
				998	{
				999	rmap_item->next = tree_rmap_item->next;
				1000	rmap_item->prev = tree_rmap_item;
				1001
				1002	if (tree_rmap_item->next)
				1003	tree_rmap_item->next->prev = rmap_item;
				1004
				1005	tree_rmap_item->next = rmap_item;
				1006	rmap_item->address \|= STABLE_FLAG;
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1007
				1008	ksm_pages_sharing++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1009	}
				1010
				1011	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame^]	1012	* cmp_and_merge_page - first see if page can be merged into the stable tree;
				1013	* if not, compare checksum to previous and if it's the same, see if page can
				1014	* be inserted into the unstable tree, or merged with a page already there and
				1015	* both transferred to the stable tree.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1016	*
				1017	* @page: the page that we are searching identical page to.
				1018	* @rmap_item: the reverse mapping into the virtual address of this page
				1019	*/
				1020	static void cmp_and_merge_page(struct page page, struct rmap_item rmap_item)
				1021	{
				1022	struct page *page2[1];
				1023	struct rmap_item *tree_rmap_item;
				1024	unsigned int checksum;
				1025	int err;
				1026
				1027	if (in_stable_tree(rmap_item))
				1028	remove_rmap_item_from_tree(rmap_item);
				1029
				1030	/* We first start with searching the page inside the stable tree */
				1031	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
				1032	if (tree_rmap_item) {
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1033	if (page == page2[0]) /* forked */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1034	err = 0;
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1035	else
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1036	err = try_to_merge_with_ksm_page(rmap_item->mm,
				1037	rmap_item->address,
				1038	page, page2[0]);
				1039	put_page(page2[0]);
				1040
				1041	if (!err) {
				1042	/*
				1043	* The page was successfully merged:
				1044	* add its rmap_item to the stable tree.
				1045	*/
				1046	stable_tree_append(rmap_item, tree_rmap_item);
				1047	}
				1048	return;
				1049	}
				1050
				1051	/*
				1052	* A ksm page might have got here by fork, but its other
				1053	* references have already been removed from the stable tree.
				1054	*/
				1055	if (PageKsm(page))
				1056	break_cow(rmap_item->mm, rmap_item->address);
				1057
				1058	/*
				1059	* In case the hash value of the page was changed from the last time we
				1060	* have calculated it, this page to be changed frequely, therefore we
				1061	* don't want to insert it to the unstable tree, and we don't want to
				1062	* waste our time to search if there is something identical to it there.
				1063	*/
				1064	checksum = calc_checksum(page);
				1065	if (rmap_item->oldchecksum != checksum) {
				1066	rmap_item->oldchecksum = checksum;
				1067	return;
				1068	}
				1069
				1070	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
				1071	if (tree_rmap_item) {
				1072	err = try_to_merge_two_pages(rmap_item->mm,
				1073	rmap_item->address, page,
				1074	tree_rmap_item->mm,
				1075	tree_rmap_item->address, page2[0]);
				1076	/*
				1077	* As soon as we merge this page, we want to remove the
				1078	* rmap_item of the page we have merged with from the unstable
				1079	* tree, and insert it instead as new node in the stable tree.
				1080	*/
				1081	if (!err) {
				1082	rb_erase(&tree_rmap_item->node, &root_unstable_tree);
				1083	tree_rmap_item->address &= ~NODE_FLAG;
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1084	ksm_pages_unshared--;
				1085
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1086	/*
				1087	* If we fail to insert the page into the stable tree,
				1088	* we will have 2 virtual addresses that are pointing
				1089	* to a ksm page left outside the stable tree,
				1090	* in which case we need to break_cow on both.
				1091	*/
				1092	if (stable_tree_insert(page2[0], tree_rmap_item))
				1093	stable_tree_append(rmap_item, tree_rmap_item);
				1094	else {
				1095	break_cow(tree_rmap_item->mm,
				1096	tree_rmap_item->address);
				1097	break_cow(rmap_item->mm, rmap_item->address);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1098	}
				1099	}
				1100
				1101	put_page(page2[0]);
				1102	}
				1103	}
				1104
				1105	static struct rmap_item get_next_rmap_item(struct mm_slot mm_slot,
				1106	struct list_head *cur,
				1107	unsigned long addr)
				1108	{
				1109	struct rmap_item *rmap_item;
				1110
				1111	while (cur != &mm_slot->rmap_list) {
				1112	rmap_item = list_entry(cur, struct rmap_item, link);
				1113	if ((rmap_item->address & PAGE_MASK) == addr) {
				1114	if (!in_stable_tree(rmap_item))
				1115	remove_rmap_item_from_tree(rmap_item);
				1116	return rmap_item;
				1117	}
				1118	if (rmap_item->address > addr)
				1119	break;
				1120	cur = cur->next;
				1121	remove_rmap_item_from_tree(rmap_item);
				1122	list_del(&rmap_item->link);
				1123	free_rmap_item(rmap_item);
				1124	}
				1125
				1126	rmap_item = alloc_rmap_item();
				1127	if (rmap_item) {
				1128	/* It has already been zeroed */
				1129	rmap_item->mm = mm_slot->mm;
				1130	rmap_item->address = addr;
				1131	list_add_tail(&rmap_item->link, cur);
				1132	}
				1133	return rmap_item;
				1134	}
				1135
				1136	static struct rmap_item scan_get_next_rmap_item(struct page *page)
				1137	{
				1138	struct mm_struct *mm;
				1139	struct mm_slot *slot;
				1140	struct vm_area_struct *vma;
				1141	struct rmap_item *rmap_item;
				1142
				1143	if (list_empty(&ksm_mm_head.mm_list))
				1144	return NULL;
				1145
				1146	slot = ksm_scan.mm_slot;
				1147	if (slot == &ksm_mm_head) {
				1148	root_unstable_tree = RB_ROOT;
				1149
				1150	spin_lock(&ksm_mmlist_lock);
				1151	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1152	ksm_scan.mm_slot = slot;
				1153	spin_unlock(&ksm_mmlist_lock);
				1154	next_mm:
				1155	ksm_scan.address = 0;
				1156	ksm_scan.rmap_item = list_entry(&slot->rmap_list,
				1157	struct rmap_item, link);
				1158	}
				1159
				1160	mm = slot->mm;
				1161	down_read(&mm->mmap_sem);
				1162	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
				1163	if (!(vma->vm_flags & VM_MERGEABLE))
				1164	continue;
				1165	if (ksm_scan.address < vma->vm_start)
				1166	ksm_scan.address = vma->vm_start;
				1167	if (!vma->anon_vma)
				1168	ksm_scan.address = vma->vm_end;
				1169
				1170	while (ksm_scan.address < vma->vm_end) {
				1171	*page = follow_page(vma, ksm_scan.address, FOLL_GET);
				1172	if (page && PageAnon(page)) {
				1173	flush_anon_page(vma, *page, ksm_scan.address);
				1174	flush_dcache_page(*page);
				1175	rmap_item = get_next_rmap_item(slot,
				1176	ksm_scan.rmap_item->link.next,
				1177	ksm_scan.address);
				1178	if (rmap_item) {
				1179	ksm_scan.rmap_item = rmap_item;
				1180	ksm_scan.address += PAGE_SIZE;
				1181	} else
				1182	put_page(*page);
				1183	up_read(&mm->mmap_sem);
				1184	return rmap_item;
				1185	}
				1186	if (*page)
				1187	put_page(*page);
				1188	ksm_scan.address += PAGE_SIZE;
				1189	cond_resched();
				1190	}
				1191	}
				1192
				1193	if (!ksm_scan.address) {
				1194	/*
				1195	* We've completed a full scan of all vmas, holding mmap_sem
				1196	* throughout, and found no VM_MERGEABLE: so do the same as
				1197	* __ksm_exit does to remove this mm from all our lists now.
				1198	*/
				1199	remove_mm_from_lists(mm);
				1200	up_read(&mm->mmap_sem);
				1201	slot = ksm_scan.mm_slot;
				1202	if (slot != &ksm_mm_head)
				1203	goto next_mm;
				1204	return NULL;
				1205	}
				1206
				1207	/*
				1208	* Nuke all the rmap_items that are above this current rmap:
				1209	* because there were no VM_MERGEABLE vmas with such addresses.
				1210	*/
				1211	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
				1212	up_read(&mm->mmap_sem);
				1213
				1214	spin_lock(&ksm_mmlist_lock);
				1215	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1216	ksm_scan.mm_slot = slot;
				1217	spin_unlock(&ksm_mmlist_lock);
				1218
				1219	/* Repeat until we've completed scanning the whole list */
				1220	if (slot != &ksm_mm_head)
				1221	goto next_mm;
				1222
				1223	/*
				1224	* Bump seqnr here rather than at top, so that __ksm_exit
				1225	* can skip rb_erase on unstable tree until we run again.
				1226	*/
				1227	ksm_scan.seqnr++;
				1228	return NULL;
				1229	}
				1230
				1231	/**
				1232	* ksm_do_scan - the ksm scanner main worker function.
				1233	* @scan_npages - number of pages we want to scan before we return.
				1234	*/
				1235	static void ksm_do_scan(unsigned int scan_npages)
				1236	{
				1237	struct rmap_item *rmap_item;
				1238	struct page *page;
				1239
				1240	while (scan_npages--) {
				1241	cond_resched();
				1242	rmap_item = scan_get_next_rmap_item(&page);
				1243	if (!rmap_item)
				1244	return;
				1245	if (!PageKsm(page) \|\| !in_stable_tree(rmap_item))
				1246	cmp_and_merge_page(page, rmap_item);
Hugh Dickins	26465d3	2009-09-21 17:02:12 -0700	[diff] [blame]	1247	else if (page_mapcount(page) == 1) {
				1248	/*
				1249	* Replace now-unshared ksm page by ordinary page.
				1250	*/
				1251	break_cow(rmap_item->mm, rmap_item->address);
				1252	remove_rmap_item_from_tree(rmap_item);
				1253	rmap_item->oldchecksum = calc_checksum(page);
				1254	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1255	put_page(page);
				1256	}
				1257	}
				1258
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1259	static int ksmd_should_run(void)
				1260	{
				1261	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
				1262	}
				1263
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1264	static int ksm_scan_thread(void *nothing)
				1265	{
Izik Eidus	339aa62	2009-09-21 17:02:07 -0700	[diff] [blame]	1266	set_user_nice(current, 5);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1267
				1268	while (!kthread_should_stop()) {
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1269	mutex_lock(&ksm_thread_mutex);
				1270	if (ksmd_should_run())
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1271	ksm_do_scan(ksm_thread_pages_to_scan);
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1272	mutex_unlock(&ksm_thread_mutex);
				1273
				1274	if (ksmd_should_run()) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1275	schedule_timeout_interruptible(
				1276	msecs_to_jiffies(ksm_thread_sleep_millisecs));
				1277	} else {
				1278	wait_event_interruptible(ksm_thread_wait,
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1279	ksmd_should_run() \|\| kthread_should_stop());
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1280	}
				1281	}
				1282	return 0;
				1283	}
				1284
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1285	int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
				1286	unsigned long end, int advice, unsigned long *vm_flags)
				1287	{
				1288	struct mm_struct *mm = vma->vm_mm;
				1289
				1290	switch (advice) {
				1291	case MADV_MERGEABLE:
				1292	/*
				1293	* Be somewhat over-protective for now!
				1294	*/
				1295	if (*vm_flags & (VM_MERGEABLE \| VM_SHARED \| VM_MAYSHARE \|
				1296	VM_PFNMAP \| VM_IO \| VM_DONTEXPAND \|
				1297	VM_RESERVED \| VM_HUGETLB \| VM_INSERTPAGE \|
				1298	VM_MIXEDMAP \| VM_SAO))
				1299	return 0; /* just ignore the advice */
				1300
				1301	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
				1302	if (__ksm_enter(mm) < 0)
				1303	return -EAGAIN;
				1304
				1305	*vm_flags \|= VM_MERGEABLE;
				1306	break;
				1307
				1308	case MADV_UNMERGEABLE:
				1309	if (!(*vm_flags & VM_MERGEABLE))
				1310	return 0; /* just ignore the advice */
				1311
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1312	if (vma->anon_vma)
				1313	unmerge_ksm_pages(vma, start, end);
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1314
				1315	*vm_flags &= ~VM_MERGEABLE;
				1316	break;
				1317	}
				1318
				1319	return 0;
				1320	}
				1321
				1322	int __ksm_enter(struct mm_struct *mm)
				1323	{
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1324	struct mm_slot *mm_slot;
				1325	int needs_wakeup;
				1326
				1327	mm_slot = alloc_mm_slot();
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1328	if (!mm_slot)
				1329	return -ENOMEM;
				1330
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1331	/* Check ksm_run too? Would need tighter locking */
				1332	needs_wakeup = list_empty(&ksm_mm_head.mm_list);
				1333
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1334	spin_lock(&ksm_mmlist_lock);
				1335	insert_to_mm_slots_hash(mm, mm_slot);
				1336	/*
				1337	* Insert just behind the scanning cursor, to let the area settle
				1338	* down a little; when fork is followed by immediate exec, we don't
				1339	* want ksmd to waste time setting up and tearing down an rmap_list.
				1340	*/
				1341	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
				1342	spin_unlock(&ksm_mmlist_lock);
				1343
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1344	set_bit(MMF_VM_MERGEABLE, &mm->flags);
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1345
				1346	if (needs_wakeup)
				1347	wake_up_interruptible(&ksm_thread_wait);
				1348
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1349	return 0;
				1350	}
				1351
				1352	void __ksm_exit(struct mm_struct *mm)
				1353	{
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1354	/*
				1355	* This process is exiting: doesn't hold and doesn't need mmap_sem;
				1356	* but we do need to exclude ksmd and other exiters while we modify
				1357	* the various lists and trees.
				1358	*/
				1359	mutex_lock(&ksm_thread_mutex);
				1360	remove_mm_from_lists(mm);
				1361	mutex_unlock(&ksm_thread_mutex);
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1362	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1363
				1364	#define KSM_ATTR_RO(_name) \
				1365	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
				1366	#define KSM_ATTR(_name) \
				1367	static struct kobj_attribute _name##_attr = \
				1368	__ATTR(_name, 0644, _name##_show, _name##_store)
				1369
				1370	static ssize_t sleep_millisecs_show(struct kobject *kobj,
				1371	struct kobj_attribute attr, char buf)
				1372	{
				1373	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
				1374	}
				1375
				1376	static ssize_t sleep_millisecs_store(struct kobject *kobj,
				1377	struct kobj_attribute *attr,
				1378	const char *buf, size_t count)
				1379	{
				1380	unsigned long msecs;
				1381	int err;
				1382
				1383	err = strict_strtoul(buf, 10, &msecs);
				1384	if (err \|\| msecs > UINT_MAX)
				1385	return -EINVAL;
				1386
				1387	ksm_thread_sleep_millisecs = msecs;
				1388
				1389	return count;
				1390	}
				1391	KSM_ATTR(sleep_millisecs);
				1392
				1393	static ssize_t pages_to_scan_show(struct kobject *kobj,
				1394	struct kobj_attribute attr, char buf)
				1395	{
				1396	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
				1397	}
				1398
				1399	static ssize_t pages_to_scan_store(struct kobject *kobj,
				1400	struct kobj_attribute *attr,
				1401	const char *buf, size_t count)
				1402	{
				1403	int err;
				1404	unsigned long nr_pages;
				1405
				1406	err = strict_strtoul(buf, 10, &nr_pages);
				1407	if (err \|\| nr_pages > UINT_MAX)
				1408	return -EINVAL;
				1409
				1410	ksm_thread_pages_to_scan = nr_pages;
				1411
				1412	return count;
				1413	}
				1414	KSM_ATTR(pages_to_scan);
				1415
				1416	static ssize_t run_show(struct kobject kobj, struct kobj_attribute attr,
				1417	char *buf)
				1418	{
				1419	return sprintf(buf, "%u\n", ksm_run);
				1420	}
				1421
				1422	static ssize_t run_store(struct kobject kobj, struct kobj_attribute attr,
				1423	const char *buf, size_t count)
				1424	{
				1425	int err;
				1426	unsigned long flags;
				1427
				1428	err = strict_strtoul(buf, 10, &flags);
				1429	if (err \|\| flags > UINT_MAX)
				1430	return -EINVAL;
				1431	if (flags > KSM_RUN_UNMERGE)
				1432	return -EINVAL;
				1433
				1434	/*
				1435	* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
				1436	* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1437	* breaking COW to free the unswappable pages_shared (but leaves
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1438	* mm_slots on the list for when ksmd may be set running again).
				1439	*/
				1440
				1441	mutex_lock(&ksm_thread_mutex);
				1442	if (ksm_run != flags) {
				1443	ksm_run = flags;
				1444	if (flags & KSM_RUN_UNMERGE)
				1445	unmerge_and_remove_all_rmap_items();
				1446	}
				1447	mutex_unlock(&ksm_thread_mutex);
				1448
				1449	if (flags & KSM_RUN_MERGE)
				1450	wake_up_interruptible(&ksm_thread_wait);
				1451
				1452	return count;
				1453	}
				1454	KSM_ATTR(run);
				1455
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1456	static ssize_t max_kernel_pages_store(struct kobject *kobj,
				1457	struct kobj_attribute *attr,
				1458	const char *buf, size_t count)
				1459	{
				1460	int err;
				1461	unsigned long nr_pages;
				1462
				1463	err = strict_strtoul(buf, 10, &nr_pages);
				1464	if (err)
				1465	return -EINVAL;
				1466
				1467	ksm_max_kernel_pages = nr_pages;
				1468
				1469	return count;
				1470	}
				1471
				1472	static ssize_t max_kernel_pages_show(struct kobject *kobj,
				1473	struct kobj_attribute attr, char buf)
				1474	{
				1475	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
				1476	}
				1477	KSM_ATTR(max_kernel_pages);
				1478
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1479	static ssize_t pages_shared_show(struct kobject *kobj,
				1480	struct kobj_attribute attr, char buf)
				1481	{
				1482	return sprintf(buf, "%lu\n", ksm_pages_shared);
				1483	}
				1484	KSM_ATTR_RO(pages_shared);
				1485
				1486	static ssize_t pages_sharing_show(struct kobject *kobj,
				1487	struct kobj_attribute attr, char buf)
				1488	{
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1489	return sprintf(buf, "%lu\n", ksm_pages_sharing);
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1490	}
				1491	KSM_ATTR_RO(pages_sharing);
				1492
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1493	static ssize_t pages_unshared_show(struct kobject *kobj,
				1494	struct kobj_attribute attr, char buf)
				1495	{
				1496	return sprintf(buf, "%lu\n", ksm_pages_unshared);
				1497	}
				1498	KSM_ATTR_RO(pages_unshared);
				1499
				1500	static ssize_t pages_volatile_show(struct kobject *kobj,
				1501	struct kobj_attribute attr, char buf)
				1502	{
				1503	long ksm_pages_volatile;
				1504
				1505	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				1506	- ksm_pages_sharing - ksm_pages_unshared;
				1507	/*
				1508	* It was not worth any locking to calculate that statistic,
				1509	* but it might therefore sometimes be negative: conceal that.
				1510	*/
				1511	if (ksm_pages_volatile < 0)
				1512	ksm_pages_volatile = 0;
				1513	return sprintf(buf, "%ld\n", ksm_pages_volatile);
				1514	}
				1515	KSM_ATTR_RO(pages_volatile);
				1516
				1517	static ssize_t full_scans_show(struct kobject *kobj,
				1518	struct kobj_attribute attr, char buf)
				1519	{
				1520	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
				1521	}
				1522	KSM_ATTR_RO(full_scans);
				1523
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1524	static struct attribute *ksm_attrs[] = {
				1525	&sleep_millisecs_attr.attr,
				1526	&pages_to_scan_attr.attr,
				1527	&run_attr.attr,
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1528	&max_kernel_pages_attr.attr,
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1529	&pages_shared_attr.attr,
				1530	&pages_sharing_attr.attr,
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1531	&pages_unshared_attr.attr,
				1532	&pages_volatile_attr.attr,
				1533	&full_scans_attr.attr,
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1534	NULL,
				1535	};
				1536
				1537	static struct attribute_group ksm_attr_group = {
				1538	.attrs = ksm_attrs,
				1539	.name = "ksm",
				1540	};
				1541
				1542	static int __init ksm_init(void)
				1543	{
				1544	struct task_struct *ksm_thread;
				1545	int err;
				1546
				1547	err = ksm_slab_init();
				1548	if (err)
				1549	goto out;
				1550
				1551	err = mm_slots_hash_init();
				1552	if (err)
				1553	goto out_free1;
				1554
				1555	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
				1556	if (IS_ERR(ksm_thread)) {
				1557	printk(KERN_ERR "ksm: creating kthread failed\n");
				1558	err = PTR_ERR(ksm_thread);
				1559	goto out_free2;
				1560	}
				1561
				1562	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
				1563	if (err) {
				1564	printk(KERN_ERR "ksm: register sysfs failed\n");
				1565	goto out_free3;
				1566	}
				1567
				1568	return 0;
				1569
				1570	out_free3:
				1571	kthread_stop(ksm_thread);
				1572	out_free2:
				1573	mm_slots_hash_free();
				1574	out_free1:
				1575	ksm_slab_free();
				1576	out:
				1577	return err;
				1578	}
				1579	module_init(ksm_init)