Blame - mm/ksm.c - kernel/msm-4.9

blob: e11e7a5ac84f6a9076072077c2f622acea71245a [file] [log] [blame]

Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2	* Memory merging support.
				3	*
				4	* This code enables dynamic sharing of identical pages found in different
				5	* memory areas, even if they are not shared by fork()
				6	*
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	7	* Copyright (C) 2008-2009 Red Hat, Inc.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	8	* Authors:
				9	* Izik Eidus
				10	* Andrea Arcangeli
				11	* Chris Wright
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	12	* Hugh Dickins
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	13	*
				14	* This work is licensed under the terms of the GNU GPL, version 2.
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	15	*/
				16
				17	#include <linux/errno.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	18	#include <linux/mm.h>
				19	#include <linux/fs.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	20	#include <linux/mman.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	21	#include <linux/sched.h>
				22	#include <linux/rwsem.h>
				23	#include <linux/pagemap.h>
				24	#include <linux/rmap.h>
				25	#include <linux/spinlock.h>
				26	#include <linux/jhash.h>
				27	#include <linux/delay.h>
				28	#include <linux/kthread.h>
				29	#include <linux/wait.h>
				30	#include <linux/slab.h>
				31	#include <linux/rbtree.h>
				32	#include <linux/mmu_notifier.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	33	#include <linux/ksm.h>
				34
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	35	#include <asm/tlbflush.h>
				36
				37	/*
				38	* A few notes about the KSM scanning process,
				39	* to make it easier to understand the data structures below:
				40	*
				41	* In order to reduce excessive scanning, KSM sorts the memory pages by their
				42	* contents into a data structure that holds pointers to the pages' locations.
				43	*
				44	* Since the contents of the pages may change at any moment, KSM cannot just
				45	* insert the pages into a normal sorted tree and expect it to find anything.
				46	* Therefore KSM uses two data structures - the stable and the unstable tree.
				47	*
				48	* The stable tree holds pointers to all the merged pages (ksm pages), sorted
				49	* by their contents. Because each such page is write-protected, searching on
				50	* this tree is fully assured to be working (except when pages are unmapped),
				51	* and therefore this tree is called the stable tree.
				52	*
				53	* In addition to the stable tree, KSM uses a second data structure called the
				54	* unstable tree: this tree holds pointers to pages which have been found to
				55	* be "unchanged for a period of time". The unstable tree sorts these pages
				56	* by their contents, but since they are not write-protected, KSM cannot rely
				57	* upon the unstable tree to work correctly - the unstable tree is liable to
				58	* be corrupted as its contents are modified, and so it is called unstable.
				59	*
				60	* KSM solves this problem by several techniques:
				61	*
				62	* 1) The unstable tree is flushed every time KSM completes scanning all
				63	* memory areas, and then the tree is rebuilt again from the beginning.
				64	* 2) KSM will only insert into the unstable tree, pages whose hash value
				65	* has not changed since the previous scan of all memory areas.
				66	* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
				67	* colors of the nodes and not on their contents, assuring that even when
				68	* the tree gets "corrupted" it won't get out of balance, so scanning time
				69	* remains the same (also, searching and inserting nodes in an rbtree uses
				70	* the same algorithm, so we have no overhead when we flush and rebuild).
				71	* 4) KSM never flushes the stable tree, which means that even if it were to
				72	* take 10 attempts to find a page in the unstable tree, once it is found,
				73	* it is secured in the stable tree. (When we scan a new page, we first
				74	* compare it against the stable tree, and then against the unstable tree.)
				75	*/
				76
				77	/**
				78	* struct mm_slot - ksm information per mm that is being scanned
				79	* @link: link to the mm_slots hash list
				80	* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
				81	* @rmap_list: head for this mm_slot's list of rmap_items
				82	* @mm: the mm that this information is valid for
				83	*/
				84	struct mm_slot {
				85	struct hlist_node link;
				86	struct list_head mm_list;
				87	struct list_head rmap_list;
				88	struct mm_struct *mm;
				89	};
				90
				91	/**
				92	* struct ksm_scan - cursor for scanning
				93	* @mm_slot: the current mm_slot we are scanning
				94	* @address: the next address inside that to be scanned
				95	* @rmap_item: the current rmap that we are scanning inside the rmap_list
				96	* @seqnr: count of completed full scans (needed when removing unstable node)
				97	*
				98	* There is only the one ksm_scan instance of this cursor structure.
				99	*/
				100	struct ksm_scan {
				101	struct mm_slot *mm_slot;
				102	unsigned long address;
				103	struct rmap_item *rmap_item;
				104	unsigned long seqnr;
				105	};
				106
				107	/**
				108	* struct rmap_item - reverse mapping item for virtual addresses
				109	* @link: link into mm_slot's rmap_list (rmap_list is per mm)
				110	* @mm: the memory structure this rmap_item is pointing into
				111	* @address: the virtual address this rmap_item tracks (+ flags in low bits)
				112	* @oldchecksum: previous checksum of the page at that virtual address
				113	* @node: rb_node of this rmap_item in either unstable or stable tree
				114	* @next: next rmap_item hanging off the same node of the stable tree
				115	* @prev: previous rmap_item hanging off the same node of the stable tree
				116	*/
				117	struct rmap_item {
				118	struct list_head link;
				119	struct mm_struct *mm;
				120	unsigned long address; /* + low bits used for flags below */
				121	union {
				122	unsigned int oldchecksum; /* when unstable */
				123	struct rmap_item next; / when stable */
				124	};
				125	union {
				126	struct rb_node node; /* when tree node */
				127	struct rmap_item prev; / in stable list */
				128	};
				129	};
				130
				131	#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
				132	#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
				133	#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
				134
				135	/* The stable and unstable tree heads */
				136	static struct rb_root root_stable_tree = RB_ROOT;
				137	static struct rb_root root_unstable_tree = RB_ROOT;
				138
				139	#define MM_SLOTS_HASH_HEADS 1024
				140	static struct hlist_head *mm_slots_hash;
				141
				142	static struct mm_slot ksm_mm_head = {
				143	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
				144	};
				145	static struct ksm_scan ksm_scan = {
				146	.mm_slot = &ksm_mm_head,
				147	};
				148
				149	static struct kmem_cache *rmap_item_cache;
				150	static struct kmem_cache *mm_slot_cache;
				151
				152	/* The number of nodes in the stable tree */
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	153	static unsigned long ksm_pages_shared;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	154
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	155	/* The number of page slots additionally sharing those nodes */
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	156	static unsigned long ksm_pages_sharing;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	157
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	158	/* The number of nodes in the unstable tree */
				159	static unsigned long ksm_pages_unshared;
				160
				161	/* The number of rmap_items in use: to calculate pages_volatile */
				162	static unsigned long ksm_rmap_items;
				163
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	164	/* Limit on the number of unswappable pages used */
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	165	static unsigned long ksm_max_kernel_pages = 2000;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	166
				167	/* Number of pages ksmd should scan in one batch */
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	168	static unsigned int ksm_thread_pages_to_scan = 200;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	169
				170	/* Milliseconds ksmd should sleep between batches */
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	171	static unsigned int ksm_thread_sleep_millisecs = 20;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	172
				173	#define KSM_RUN_STOP 0
				174	#define KSM_RUN_MERGE 1
				175	#define KSM_RUN_UNMERGE 2
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	176	static unsigned int ksm_run = KSM_RUN_MERGE;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	177
				178	static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
				179	static DEFINE_MUTEX(ksm_thread_mutex);
				180	static DEFINE_SPINLOCK(ksm_mmlist_lock);
				181
				182	#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
				183	sizeof(struct __struct), __alignof__(struct __struct),\
				184	(__flags), NULL)
				185
				186	static int __init ksm_slab_init(void)
				187	{
				188	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
				189	if (!rmap_item_cache)
				190	goto out;
				191
				192	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
				193	if (!mm_slot_cache)
				194	goto out_free;
				195
				196	return 0;
				197
				198	out_free:
				199	kmem_cache_destroy(rmap_item_cache);
				200	out:
				201	return -ENOMEM;
				202	}
				203
				204	static void __init ksm_slab_free(void)
				205	{
				206	kmem_cache_destroy(mm_slot_cache);
				207	kmem_cache_destroy(rmap_item_cache);
				208	mm_slot_cache = NULL;
				209	}
				210
				211	static inline struct rmap_item *alloc_rmap_item(void)
				212	{
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	213	struct rmap_item *rmap_item;
				214
				215	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
				216	if (rmap_item)
				217	ksm_rmap_items++;
				218	return rmap_item;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	219	}
				220
				221	static inline void free_rmap_item(struct rmap_item *rmap_item)
				222	{
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	223	ksm_rmap_items--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	224	rmap_item->mm = NULL; /* debug safety */
				225	kmem_cache_free(rmap_item_cache, rmap_item);
				226	}
				227
				228	static inline struct mm_slot *alloc_mm_slot(void)
				229	{
				230	if (!mm_slot_cache) /* initialization failed */
				231	return NULL;
				232	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
				233	}
				234
				235	static inline void free_mm_slot(struct mm_slot *mm_slot)
				236	{
				237	kmem_cache_free(mm_slot_cache, mm_slot);
				238	}
				239
				240	static int __init mm_slots_hash_init(void)
				241	{
				242	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				243	GFP_KERNEL);
				244	if (!mm_slots_hash)
				245	return -ENOMEM;
				246	return 0;
				247	}
				248
				249	static void __init mm_slots_hash_free(void)
				250	{
				251	kfree(mm_slots_hash);
				252	}
				253
				254	static struct mm_slot get_mm_slot(struct mm_struct mm)
				255	{
				256	struct mm_slot *mm_slot;
				257	struct hlist_head *bucket;
				258	struct hlist_node *node;
				259
				260	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				261	% MM_SLOTS_HASH_HEADS];
				262	hlist_for_each_entry(mm_slot, node, bucket, link) {
				263	if (mm == mm_slot->mm)
				264	return mm_slot;
				265	}
				266	return NULL;
				267	}
				268
				269	static void insert_to_mm_slots_hash(struct mm_struct *mm,
				270	struct mm_slot *mm_slot)
				271	{
				272	struct hlist_head *bucket;
				273
				274	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				275	% MM_SLOTS_HASH_HEADS];
				276	mm_slot->mm = mm;
				277	INIT_LIST_HEAD(&mm_slot->rmap_list);
				278	hlist_add_head(&mm_slot->link, bucket);
				279	}
				280
				281	static inline int in_stable_tree(struct rmap_item *rmap_item)
				282	{
				283	return rmap_item->address & STABLE_FLAG;
				284	}
				285
				286	/*
Hugh Dickins	a913e18	2009-09-21 17:02:26 -0700	[diff] [blame^]	287	* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
				288	* page tables after it has passed through ksm_exit() - which, if necessary,
				289	* takes mmap_sem briefly to serialize against them. ksm_exit() does not set
				290	* a special flag: they can just back out as soon as mm_users goes to zero.
				291	* ksm_test_exit() is used throughout to make this test for exit: in some
				292	* places for correctness, in some places just to avoid unnecessary work.
				293	*/
				294	static inline bool ksm_test_exit(struct mm_struct *mm)
				295	{
				296	return atomic_read(&mm->mm_users) == 0;
				297	}
				298
				299	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	300	* We use break_ksm to break COW on a ksm page: it's a stripped down
				301	*
				302	* if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
				303	* put_page(page);
				304	*
				305	* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
				306	* in case the application has unmapped and remapped mm,addr meanwhile.
				307	* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
				308	* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
				309	*/
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	310	static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	311	{
				312	struct page *page;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	313	int ret = 0;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	314
				315	do {
				316	cond_resched();
				317	page = follow_page(vma, addr, FOLL_GET);
				318	if (!page)
				319	break;
				320	if (PageKsm(page))
				321	ret = handle_mm_fault(vma->vm_mm, vma, addr,
				322	FAULT_FLAG_WRITE);
				323	else
				324	ret = VM_FAULT_WRITE;
				325	put_page(page);
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	326	} while (!(ret & (VM_FAULT_WRITE \| VM_FAULT_SIGBUS \| VM_FAULT_OOM)));
				327	/*
				328	* We must loop because handle_mm_fault() may back out if there's
				329	* any difficulty e.g. if pte accessed bit gets updated concurrently.
				330	*
				331	* VM_FAULT_WRITE is what we have been hoping for: it indicates that
				332	* COW has been broken, even if the vma does not permit VM_WRITE;
				333	* but note that a concurrent fault might break PageKsm for us.
				334	*
				335	* VM_FAULT_SIGBUS could occur if we race with truncation of the
				336	* backing file, which also invalidates anonymous pages: that's
				337	* okay, that truncation will have unmapped the PageKsm for us.
				338	*
				339	* VM_FAULT_OOM: at the time of writing (late July 2009), setting
				340	* aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
				341	* current task has TIF_MEMDIE set, and will be OOM killed on return
				342	* to user; and ksmd, having no mm, would never be chosen for that.
				343	*
				344	* But if the mm is in a limited mem_cgroup, then the fault may fail
				345	* with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
				346	* even ksmd can fail in this way - though it's usually breaking ksm
				347	* just to undo a merge it made a moment before, so unlikely to oom.
				348	*
				349	* That's a pity: we might therefore have more kernel pages allocated
				350	* than we're counting as nodes in the stable tree; but ksm_do_scan
				351	* will retry to break_cow on each pass, so should recover the page
				352	* in due course. The important thing is to not let VM_MERGEABLE
				353	* be cleared while any such pages might remain in the area.
				354	*/
				355	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	356	}
				357
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	358	static void break_cow(struct mm_struct *mm, unsigned long addr)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	359	{
				360	struct vm_area_struct *vma;
				361
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	362	down_read(&mm->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	363	if (ksm_test_exit(mm))
				364	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	365	vma = find_vma(mm, addr);
				366	if (!vma \|\| vma->vm_start > addr)
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	367	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	368	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	369	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	370	break_ksm(vma, addr);
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	371	out:
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	372	up_read(&mm->mmap_sem);
				373	}
				374
				375	static struct page get_mergeable_page(struct rmap_item rmap_item)
				376	{
				377	struct mm_struct *mm = rmap_item->mm;
				378	unsigned long addr = rmap_item->address;
				379	struct vm_area_struct *vma;
				380	struct page *page;
				381
				382	down_read(&mm->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	383	if (ksm_test_exit(mm))
				384	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	385	vma = find_vma(mm, addr);
				386	if (!vma \|\| vma->vm_start > addr)
				387	goto out;
				388	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
				389	goto out;
				390
				391	page = follow_page(vma, addr, FOLL_GET);
				392	if (!page)
				393	goto out;
				394	if (PageAnon(page)) {
				395	flush_anon_page(vma, page, addr);
				396	flush_dcache_page(page);
				397	} else {
				398	put_page(page);
				399	out: page = NULL;
				400	}
				401	up_read(&mm->mmap_sem);
				402	return page;
				403	}
				404
				405	/*
				406	* get_ksm_page: checks if the page at the virtual address in rmap_item
				407	* is still PageKsm, in which case we can trust the content of the page,
				408	* and it returns the gotten page; but NULL if the page has been zapped.
				409	*/
				410	static struct page get_ksm_page(struct rmap_item rmap_item)
				411	{
				412	struct page *page;
				413
				414	page = get_mergeable_page(rmap_item);
				415	if (page && !PageKsm(page)) {
				416	put_page(page);
				417	page = NULL;
				418	}
				419	return page;
				420	}
				421
				422	/*
				423	* Removing rmap_item from stable or unstable tree.
				424	* This function will clean the information from the stable/unstable tree.
				425	*/
				426	static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
				427	{
				428	if (in_stable_tree(rmap_item)) {
				429	struct rmap_item *next_item = rmap_item->next;
				430
				431	if (rmap_item->address & NODE_FLAG) {
				432	if (next_item) {
				433	rb_replace_node(&rmap_item->node,
				434	&next_item->node,
				435	&root_stable_tree);
				436	next_item->address \|= NODE_FLAG;
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	437	ksm_pages_sharing--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	438	} else {
				439	rb_erase(&rmap_item->node, &root_stable_tree);
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	440	ksm_pages_shared--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	441	}
				442	} else {
				443	struct rmap_item *prev_item = rmap_item->prev;
				444
				445	BUG_ON(prev_item->next != rmap_item);
				446	prev_item->next = next_item;
				447	if (next_item) {
				448	BUG_ON(next_item->prev != rmap_item);
				449	next_item->prev = rmap_item->prev;
				450	}
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	451	ksm_pages_sharing--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	452	}
				453
				454	rmap_item->next = NULL;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	455
				456	} else if (rmap_item->address & NODE_FLAG) {
				457	unsigned char age;
				458	/*
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	459	* Usually ksmd can and must skip the rb_erase, because
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	460	* root_unstable_tree was already reset to RB_ROOT.
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	461	* But be careful when an mm is exiting: do the rb_erase
				462	* if this rmap_item was inserted by this scan, rather
				463	* than left over from before.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	464	*/
				465	age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	466	BUG_ON(age > 1);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	467	if (!age)
				468	rb_erase(&rmap_item->node, &root_unstable_tree);
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	469	ksm_pages_unshared--;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	470	}
				471
				472	rmap_item->address &= PAGE_MASK;
				473
				474	cond_resched(); /* we're called from many long loops */
				475	}
				476
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	477	static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				478	struct list_head *cur)
				479	{
				480	struct rmap_item *rmap_item;
				481
				482	while (cur != &mm_slot->rmap_list) {
				483	rmap_item = list_entry(cur, struct rmap_item, link);
				484	cur = cur->next;
				485	remove_rmap_item_from_tree(rmap_item);
				486	list_del(&rmap_item->link);
				487	free_rmap_item(rmap_item);
				488	}
				489	}
				490
				491	/*
				492	* Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
				493	* than check every pte of a given vma, the locking doesn't quite work for
				494	* that - an rmap_item is assigned to the stable tree after inserting ksm
				495	* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
				496	* rmap_items from parent to child at fork time (so as not to waste time
				497	* if exit comes before the next scan reaches it).
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	498	*
				499	* Similarly, although we'd like to remove rmap_items (so updating counts
				500	* and freeing memory) when unmerging an area, it's easier to leave that
				501	* to the next pass of ksmd - consider, for example, how ksmd might be
				502	* in cmp_and_merge_page on one of the rmap_items we would be removing.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	503	*/
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	504	static int unmerge_ksm_pages(struct vm_area_struct *vma,
				505	unsigned long start, unsigned long end)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	506	{
				507	unsigned long addr;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	508	int err = 0;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	509
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	510	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	511	if (ksm_test_exit(vma->vm_mm))
				512	break;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	513	if (signal_pending(current))
				514	err = -ERESTARTSYS;
				515	else
				516	err = break_ksm(vma, addr);
				517	}
				518	return err;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	519	}
				520
#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 *
 * Walk every mm_slot on the ksm_mm_head list, break COW on all ksm
 * pages in its VM_MERGEABLE vmas, and free all of its rmap_items.
 * An mm found to be exiting has its mm_slot removed and freed here.
 *
 * Returns 0 on success with ksm_scan.seqnr reset to 0; on error returns
 * the unmerge_ksm_pages() error with the cursor reset to ksm_mm_head.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);

		spin_lock(&ksm_mmlist_lock);
		/* advance the cursor before mm_slot may be unlinked and freed */
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	ksm_scan.seqnr = 0;
	return 0;

error:
	/* reached only from inside the loop, with mm->mmap_sem still held */
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	583
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	584	static u32 calc_checksum(struct page *page)
				585	{
				586	u32 checksum;
				587	void *addr = kmap_atomic(page, KM_USER0);
				588	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
				589	kunmap_atomic(addr, KM_USER0);
				590	return checksum;
				591	}
				592
				593	static int memcmp_pages(struct page page1, struct page page2)
				594	{
				595	char addr1, addr2;
				596	int ret;
				597
				598	addr1 = kmap_atomic(page1, KM_USER0);
				599	addr2 = kmap_atomic(page2, KM_USER1);
				600	ret = memcmp(addr1, addr2, PAGE_SIZE);
				601	kunmap_atomic(addr2, KM_USER1);
				602	kunmap_atomic(addr1, KM_USER0);
				603	return ret;
				604	}
				605
				606	static inline int pages_identical(struct page page1, struct page page2)
				607	{
				608	return !memcmp_pages(page1, page2);
				609	}
				610
				611	static int write_protect_page(struct vm_area_struct vma, struct page page,
				612	pte_t *orig_pte)
				613	{
				614	struct mm_struct *mm = vma->vm_mm;
				615	unsigned long addr;
				616	pte_t *ptep;
				617	spinlock_t *ptl;
				618	int swapped;
				619	int err = -EFAULT;
				620
				621	addr = page_address_in_vma(page, vma);
				622	if (addr == -EFAULT)
				623	goto out;
				624
				625	ptep = page_check_address(page, mm, addr, &ptl, 0);
				626	if (!ptep)
				627	goto out;
				628
				629	if (pte_write(*ptep)) {
				630	pte_t entry;
				631
				632	swapped = PageSwapCache(page);
				633	flush_cache_page(vma, addr, page_to_pfn(page));
				634	/*
				635	* Ok this is tricky, when get_user_pages_fast() run it doesnt
				636	* take any lock, therefore the check that we are going to make
				637	* with the pagecount against the mapcount is racey and
				638	* O_DIRECT can happen right after the check.
				639	* So we clear the pte and flush the tlb before the check
				640	* this assure us that no O_DIRECT can happen after the check
				641	* or in the middle of the check.
				642	*/
				643	entry = ptep_clear_flush(vma, addr, ptep);
				644	/*
				645	* Check that no O_DIRECT or similar I/O is in progress on the
				646	* page
				647	*/
				648	if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
				649	set_pte_at_notify(mm, addr, ptep, entry);
				650	goto out_unlock;
				651	}
				652	entry = pte_wrprotect(entry);
				653	set_pte_at_notify(mm, addr, ptep, entry);
				654	}
				655	orig_pte = ptep;
				656	err = 0;
				657
				658	out_unlock:
				659	pte_unmap_unlock(ptep, ptl);
				660	out:
				661	return err;
				662	}
				663
				664	/**
				665	* replace_page - replace page in vma by new ksm page
				666	* @vma: vma that holds the pte pointing to oldpage
				667	* @oldpage: the page we are replacing by newpage
				668	* @newpage: the ksm page we replace oldpage by
				669	* @orig_pte: the original value of the pte
				670	*
				671	* Returns 0 on success, -EFAULT on failure.
				672	*/
				673	static int replace_page(struct vm_area_struct vma, struct page oldpage,
				674	struct page *newpage, pte_t orig_pte)
				675	{
				676	struct mm_struct *mm = vma->vm_mm;
				677	pgd_t *pgd;
				678	pud_t *pud;
				679	pmd_t *pmd;
				680	pte_t *ptep;
				681	spinlock_t *ptl;
				682	unsigned long addr;
				683	pgprot_t prot;
				684	int err = -EFAULT;
				685
				686	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
				687
				688	addr = page_address_in_vma(oldpage, vma);
				689	if (addr == -EFAULT)
				690	goto out;
				691
				692	pgd = pgd_offset(mm, addr);
				693	if (!pgd_present(*pgd))
				694	goto out;
				695
				696	pud = pud_offset(pgd, addr);
				697	if (!pud_present(*pud))
				698	goto out;
				699
				700	pmd = pmd_offset(pud, addr);
				701	if (!pmd_present(*pmd))
				702	goto out;
				703
				704	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
				705	if (!pte_same(*ptep, orig_pte)) {
				706	pte_unmap_unlock(ptep, ptl);
				707	goto out;
				708	}
				709
				710	get_page(newpage);
				711	page_add_ksm_rmap(newpage);
				712
				713	flush_cache_page(vma, addr, pte_pfn(*ptep));
				714	ptep_clear_flush(vma, addr, ptep);
				715	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
				716
				717	page_remove_rmap(oldpage);
				718	put_page(oldpage);
				719
				720	pte_unmap_unlock(ptep, ptl);
				721	err = 0;
				722	out:
				723	return err;
				724	}
				725
				726	/*
				727	* try_to_merge_one_page - take two pages and merge them into one
				728	* @vma: the vma that hold the pte pointing into oldpage
				729	* @oldpage: the page that we want to replace with newpage
				730	* @newpage: the page that we want to map instead of oldpage
				731	*
				732	* Note:
				733	* oldpage should be a PageAnon page, while newpage should be a PageKsm page,
				734	* or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
				735	*
				736	* This function returns 0 if the pages were merged, -EFAULT otherwise.
				737	*/
				738	static int try_to_merge_one_page(struct vm_area_struct *vma,
				739	struct page *oldpage,
				740	struct page *newpage)
				741	{
				742	pte_t orig_pte = __pte(0);
				743	int err = -EFAULT;
				744
				745	if (!(vma->vm_flags & VM_MERGEABLE))
				746	goto out;
				747
				748	if (!PageAnon(oldpage))
				749	goto out;
				750
				751	get_page(newpage);
				752	get_page(oldpage);
				753
				754	/*
				755	* We need the page lock to read a stable PageSwapCache in
				756	* write_protect_page(). We use trylock_page() instead of
				757	* lock_page() because we don't want to wait here - we
				758	* prefer to continue scanning and merging different pages,
				759	* then come back to this page when it is unlocked.
				760	*/
				761	if (!trylock_page(oldpage))
				762	goto out_putpage;
				763	/*
				764	* If this anonymous page is mapped only here, its pte may need
				765	* to be write-protected. If it's mapped elsewhere, all of its
				766	* ptes are necessarily already write-protected. But in either
				767	* case, we need to lock and check page_count is not raised.
				768	*/
				769	if (write_protect_page(vma, oldpage, &orig_pte)) {
				770	unlock_page(oldpage);
				771	goto out_putpage;
				772	}
				773	unlock_page(oldpage);
				774
				775	if (pages_identical(oldpage, newpage))
				776	err = replace_page(vma, oldpage, newpage, orig_pte);
				777
				778	out_putpage:
				779	put_page(oldpage);
				780	put_page(newpage);
				781	out:
				782	return err;
				783	}
				784
				785	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	786	* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
				787	* but no new kernel page is allocated: kpage must already be a ksm page.
				788	*/
				789	static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				790	unsigned long addr1,
				791	struct page *page1,
				792	struct page *kpage)
				793	{
				794	struct vm_area_struct *vma;
				795	int err = -EFAULT;
				796
				797	down_read(&mm1->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	798	if (ksm_test_exit(mm1))
				799	goto out;
				800
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	801	vma = find_vma(mm1, addr1);
				802	if (!vma \|\| vma->vm_start > addr1)
				803	goto out;
				804
				805	err = try_to_merge_one_page(vma, page1, kpage);
				806	out:
				807	up_read(&mm1->mmap_sem);
				808	return err;
				809	}
				810
				811	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	812	* try_to_merge_two_pages - take two identical pages and prepare them
				813	* to be merged into one page.
				814	*
				815	* This function returns 0 if we successfully mapped two identical pages
				816	* into one page, -EFAULT otherwise.
				817	*
				818	* Note that this function allocates a new kernel page: if one of the pages
				819	* is already a ksm page, try_to_merge_with_ksm_page should be used.
				820	*/
				821	static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				822	struct page page1, struct mm_struct mm2,
				823	unsigned long addr2, struct page *page2)
				824	{
				825	struct vm_area_struct *vma;
				826	struct page *kpage;
				827	int err = -EFAULT;
				828
				829	/*
				830	* The number of nodes in the stable tree
				831	* is the number of kernel pages that we hold.
				832	*/
				833	if (ksm_max_kernel_pages &&
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	834	ksm_max_kernel_pages <= ksm_pages_shared)
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	835	return err;
				836
				837	kpage = alloc_page(GFP_HIGHUSER);
				838	if (!kpage)
				839	return err;
				840
				841	down_read(&mm1->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	842	if (ksm_test_exit(mm1)) {
				843	up_read(&mm1->mmap_sem);
				844	goto out;
				845	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	846	vma = find_vma(mm1, addr1);
				847	if (!vma \|\| vma->vm_start > addr1) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	848	up_read(&mm1->mmap_sem);
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	849	goto out;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	850	}
				851
				852	copy_user_highpage(kpage, page1, addr1, vma);
				853	err = try_to_merge_one_page(vma, page1, kpage);
				854	up_read(&mm1->mmap_sem);
				855
				856	if (!err) {
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	857	err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	858	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	859	* If that fails, we have a ksm page with only one pte
				860	* pointing to it: so break it.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	861	*/
				862	if (err)
				863	break_cow(mm1, addr1);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	864	}
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	865	out:
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	866	put_page(kpage);
				867	return err;
				868	}
				869
				870	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	871	* stable_tree_search - search page inside the stable tree
				872	* @page: the page that we are searching identical pages to.
				873	* @page2: pointer into identical page that we are holding inside the stable
				874	* tree that we have found.
				875	* @rmap_item: the reverse mapping item
				876	*
				877	* This function checks if there is a page inside the stable tree
				878	* with identical content to the page that we are scanning right now.
				879	*
				880	* This function return rmap_item pointer to the identical item if found,
				881	* NULL otherwise.
				882	*/
				883	static struct rmap_item stable_tree_search(struct page page,
				884	struct page **page2,
				885	struct rmap_item *rmap_item)
				886	{
				887	struct rb_node *node = root_stable_tree.rb_node;
				888
				889	while (node) {
				890	struct rmap_item tree_rmap_item, next_rmap_item;
				891	int ret;
				892
				893	tree_rmap_item = rb_entry(node, struct rmap_item, node);
				894	while (tree_rmap_item) {
				895	BUG_ON(!in_stable_tree(tree_rmap_item));
				896	cond_resched();
				897	page2[0] = get_ksm_page(tree_rmap_item);
				898	if (page2[0])
				899	break;
				900	next_rmap_item = tree_rmap_item->next;
				901	remove_rmap_item_from_tree(tree_rmap_item);
				902	tree_rmap_item = next_rmap_item;
				903	}
				904	if (!tree_rmap_item)
				905	return NULL;
				906
				907	ret = memcmp_pages(page, page2[0]);
				908
				909	if (ret < 0) {
				910	put_page(page2[0]);
				911	node = node->rb_left;
				912	} else if (ret > 0) {
				913	put_page(page2[0]);
				914	node = node->rb_right;
				915	} else {
				916	return tree_rmap_item;
				917	}
				918	}
				919
				920	return NULL;
				921	}
				922
				923	/*
				924	* stable_tree_insert - insert rmap_item pointing to new ksm page
				925	* into the stable tree.
				926	*
				927	* @page: the page that we are searching identical page to inside the stable
				928	* tree.
				929	* @rmap_item: pointer to the reverse mapping item.
				930	*
				931	* This function returns rmap_item if success, NULL otherwise.
				932	*/
				933	static struct rmap_item stable_tree_insert(struct page page,
				934	struct rmap_item *rmap_item)
				935	{
				936	struct rb_node **new = &root_stable_tree.rb_node;
				937	struct rb_node *parent = NULL;
				938
				939	while (*new) {
				940	struct rmap_item tree_rmap_item, next_rmap_item;
				941	struct page *tree_page;
				942	int ret;
				943
				944	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				945	while (tree_rmap_item) {
				946	BUG_ON(!in_stable_tree(tree_rmap_item));
				947	cond_resched();
				948	tree_page = get_ksm_page(tree_rmap_item);
				949	if (tree_page)
				950	break;
				951	next_rmap_item = tree_rmap_item->next;
				952	remove_rmap_item_from_tree(tree_rmap_item);
				953	tree_rmap_item = next_rmap_item;
				954	}
				955	if (!tree_rmap_item)
				956	return NULL;
				957
				958	ret = memcmp_pages(page, tree_page);
				959	put_page(tree_page);
				960
				961	parent = *new;
				962	if (ret < 0)
				963	new = &parent->rb_left;
				964	else if (ret > 0)
				965	new = &parent->rb_right;
				966	else {
				967	/*
				968	* It is not a bug that stable_tree_search() didn't
				969	* find this node: because at that time our page was
				970	* not yet write-protected, so may have changed since.
				971	*/
				972	return NULL;
				973	}
				974	}
				975
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	976	rmap_item->address \|= NODE_FLAG \| STABLE_FLAG;
				977	rmap_item->next = NULL;
				978	rb_link_node(&rmap_item->node, parent, new);
				979	rb_insert_color(&rmap_item->node, &root_stable_tree);
				980
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	981	ksm_pages_shared++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	982	return rmap_item;
				983	}
				984
				985	/*
				986	* unstable_tree_search_insert - search and insert items into the unstable tree.
				987	*
				988	* @page: the page that we are going to search for identical page or to insert
				989	* into the unstable tree
				990	* @page2: pointer into identical page that was found inside the unstable tree
				991	* @rmap_item: the reverse mapping item of page
				992	*
				993	* This function searches for a page in the unstable tree identical to the
				994	* page currently being scanned; and if no identical page is found in the
				995	* tree, we insert rmap_item as a new object into the unstable tree.
				996	*
				997	* This function returns pointer to rmap_item found to be identical
				998	* to the currently scanned page, NULL otherwise.
				999	*
				1000	* This function does both searching and inserting, because they share
				1001	* the same walking algorithm in an rbtree.
				1002	*/
				1003	static struct rmap_item unstable_tree_search_insert(struct page page,
				1004	struct page **page2,
				1005	struct rmap_item *rmap_item)
				1006	{
				1007	struct rb_node **new = &root_unstable_tree.rb_node;
				1008	struct rb_node *parent = NULL;
				1009
				1010	while (*new) {
				1011	struct rmap_item *tree_rmap_item;
				1012	int ret;
				1013
				1014	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				1015	page2[0] = get_mergeable_page(tree_rmap_item);
				1016	if (!page2[0])
				1017	return NULL;
				1018
				1019	/*
				1020	* Don't substitute an unswappable ksm page
				1021	* just for one good swappable forked page.
				1022	*/
				1023	if (page == page2[0]) {
				1024	put_page(page2[0]);
				1025	return NULL;
				1026	}
				1027
				1028	ret = memcmp_pages(page, page2[0]);
				1029
				1030	parent = *new;
				1031	if (ret < 0) {
				1032	put_page(page2[0]);
				1033	new = &parent->rb_left;
				1034	} else if (ret > 0) {
				1035	put_page(page2[0]);
				1036	new = &parent->rb_right;
				1037	} else {
				1038	return tree_rmap_item;
				1039	}
				1040	}
				1041
				1042	rmap_item->address \|= NODE_FLAG;
				1043	rmap_item->address \|= (ksm_scan.seqnr & SEQNR_MASK);
				1044	rb_link_node(&rmap_item->node, parent, new);
				1045	rb_insert_color(&rmap_item->node, &root_unstable_tree);
				1046
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1047	ksm_pages_unshared++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1048	return NULL;
				1049	}
				1050
				1051	/*
				1052	* stable_tree_append - add another rmap_item to the linked list of
				1053	* rmap_items hanging off a given node of the stable tree, all sharing
				1054	* the same ksm page.
				1055	*/
				1056	static void stable_tree_append(struct rmap_item *rmap_item,
				1057	struct rmap_item *tree_rmap_item)
				1058	{
				1059	rmap_item->next = tree_rmap_item->next;
				1060	rmap_item->prev = tree_rmap_item;
				1061
				1062	if (tree_rmap_item->next)
				1063	tree_rmap_item->next->prev = rmap_item;
				1064
				1065	tree_rmap_item->next = rmap_item;
				1066	rmap_item->address \|= STABLE_FLAG;
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1067
				1068	ksm_pages_sharing++;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1069	}
				1070
				1071	/*
Hugh Dickins	81464e30	2009-09-21 17:02:15 -0700	[diff] [blame]	1072	* cmp_and_merge_page - first see if page can be merged into the stable tree;
				1073	* if not, compare checksum to previous and if it's the same, see if page can
				1074	* be inserted into the unstable tree, or merged with a page already there and
				1075	* both transferred to the stable tree.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1076	*
				1077	* @page: the page that we are searching identical page to.
				1078	* @rmap_item: the reverse mapping into the virtual address of this page
				1079	*/
				1080	static void cmp_and_merge_page(struct page page, struct rmap_item rmap_item)
				1081	{
				1082	struct page *page2[1];
				1083	struct rmap_item *tree_rmap_item;
				1084	unsigned int checksum;
				1085	int err;
				1086
				1087	if (in_stable_tree(rmap_item))
				1088	remove_rmap_item_from_tree(rmap_item);
				1089
				1090	/* We first start with searching the page inside the stable tree */
				1091	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
				1092	if (tree_rmap_item) {
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1093	if (page == page2[0]) /* forked */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1094	err = 0;
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1095	else
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1096	err = try_to_merge_with_ksm_page(rmap_item->mm,
				1097	rmap_item->address,
				1098	page, page2[0]);
				1099	put_page(page2[0]);
				1100
				1101	if (!err) {
				1102	/*
				1103	* The page was successfully merged:
				1104	* add its rmap_item to the stable tree.
				1105	*/
				1106	stable_tree_append(rmap_item, tree_rmap_item);
				1107	}
				1108	return;
				1109	}
				1110
				1111	/*
				1112	* A ksm page might have got here by fork, but its other
				1113	* references have already been removed from the stable tree.
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1114	* Or it might be left over from a break_ksm which failed
				1115	* when the mem_cgroup had reached its limit: try again now.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1116	*/
				1117	if (PageKsm(page))
				1118	break_cow(rmap_item->mm, rmap_item->address);
				1119
				1120	/*
				1121	* In case the hash value of the page was changed from the last time we
				1122	* have calculated it, this page to be changed frequely, therefore we
				1123	* don't want to insert it to the unstable tree, and we don't want to
				1124	* waste our time to search if there is something identical to it there.
				1125	*/
				1126	checksum = calc_checksum(page);
				1127	if (rmap_item->oldchecksum != checksum) {
				1128	rmap_item->oldchecksum = checksum;
				1129	return;
				1130	}
				1131
				1132	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
				1133	if (tree_rmap_item) {
				1134	err = try_to_merge_two_pages(rmap_item->mm,
				1135	rmap_item->address, page,
				1136	tree_rmap_item->mm,
				1137	tree_rmap_item->address, page2[0]);
				1138	/*
				1139	* As soon as we merge this page, we want to remove the
				1140	* rmap_item of the page we have merged with from the unstable
				1141	* tree, and insert it instead as new node in the stable tree.
				1142	*/
				1143	if (!err) {
				1144	rb_erase(&tree_rmap_item->node, &root_unstable_tree);
				1145	tree_rmap_item->address &= ~NODE_FLAG;
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1146	ksm_pages_unshared--;
				1147
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1148	/*
				1149	* If we fail to insert the page into the stable tree,
				1150	* we will have 2 virtual addresses that are pointing
				1151	* to a ksm page left outside the stable tree,
				1152	* in which case we need to break_cow on both.
				1153	*/
				1154	if (stable_tree_insert(page2[0], tree_rmap_item))
				1155	stable_tree_append(rmap_item, tree_rmap_item);
				1156	else {
				1157	break_cow(tree_rmap_item->mm,
				1158	tree_rmap_item->address);
				1159	break_cow(rmap_item->mm, rmap_item->address);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1160	}
				1161	}
				1162
				1163	put_page(page2[0]);
				1164	}
				1165	}
				1166
				1167	static struct rmap_item get_next_rmap_item(struct mm_slot mm_slot,
				1168	struct list_head *cur,
				1169	unsigned long addr)
				1170	{
				1171	struct rmap_item *rmap_item;
				1172
				1173	while (cur != &mm_slot->rmap_list) {
				1174	rmap_item = list_entry(cur, struct rmap_item, link);
				1175	if ((rmap_item->address & PAGE_MASK) == addr) {
				1176	if (!in_stable_tree(rmap_item))
				1177	remove_rmap_item_from_tree(rmap_item);
				1178	return rmap_item;
				1179	}
				1180	if (rmap_item->address > addr)
				1181	break;
				1182	cur = cur->next;
				1183	remove_rmap_item_from_tree(rmap_item);
				1184	list_del(&rmap_item->link);
				1185	free_rmap_item(rmap_item);
				1186	}
				1187
				1188	rmap_item = alloc_rmap_item();
				1189	if (rmap_item) {
				1190	/* It has already been zeroed */
				1191	rmap_item->mm = mm_slot->mm;
				1192	rmap_item->address = addr;
				1193	list_add_tail(&rmap_item->link, cur);
				1194	}
				1195	return rmap_item;
				1196	}
				1197
				1198	static struct rmap_item scan_get_next_rmap_item(struct page *page)
				1199	{
				1200	struct mm_struct *mm;
				1201	struct mm_slot *slot;
				1202	struct vm_area_struct *vma;
				1203	struct rmap_item *rmap_item;
				1204
				1205	if (list_empty(&ksm_mm_head.mm_list))
				1206	return NULL;
				1207
				1208	slot = ksm_scan.mm_slot;
				1209	if (slot == &ksm_mm_head) {
				1210	root_unstable_tree = RB_ROOT;
				1211
				1212	spin_lock(&ksm_mmlist_lock);
				1213	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1214	ksm_scan.mm_slot = slot;
				1215	spin_unlock(&ksm_mmlist_lock);
				1216	next_mm:
				1217	ksm_scan.address = 0;
				1218	ksm_scan.rmap_item = list_entry(&slot->rmap_list,
				1219	struct rmap_item, link);
				1220	}
				1221
				1222	mm = slot->mm;
				1223	down_read(&mm->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1224	if (ksm_test_exit(mm))
				1225	vma = NULL;
				1226	else
				1227	vma = find_vma(mm, ksm_scan.address);
				1228
				1229	for (; vma; vma = vma->vm_next) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1230	if (!(vma->vm_flags & VM_MERGEABLE))
				1231	continue;
				1232	if (ksm_scan.address < vma->vm_start)
				1233	ksm_scan.address = vma->vm_start;
				1234	if (!vma->anon_vma)
				1235	ksm_scan.address = vma->vm_end;
				1236
				1237	while (ksm_scan.address < vma->vm_end) {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1238	if (ksm_test_exit(mm))
				1239	break;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1240	*page = follow_page(vma, ksm_scan.address, FOLL_GET);
				1241	if (page && PageAnon(page)) {
				1242	flush_anon_page(vma, *page, ksm_scan.address);
				1243	flush_dcache_page(*page);
				1244	rmap_item = get_next_rmap_item(slot,
				1245	ksm_scan.rmap_item->link.next,
				1246	ksm_scan.address);
				1247	if (rmap_item) {
				1248	ksm_scan.rmap_item = rmap_item;
				1249	ksm_scan.address += PAGE_SIZE;
				1250	} else
				1251	put_page(*page);
				1252	up_read(&mm->mmap_sem);
				1253	return rmap_item;
				1254	}
				1255	if (*page)
				1256	put_page(*page);
				1257	ksm_scan.address += PAGE_SIZE;
				1258	cond_resched();
				1259	}
				1260	}
				1261
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1262	if (ksm_test_exit(mm)) {
				1263	ksm_scan.address = 0;
				1264	ksm_scan.rmap_item = list_entry(&slot->rmap_list,
				1265	struct rmap_item, link);
				1266	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1267	/*
				1268	* Nuke all the rmap_items that are above this current rmap:
				1269	* because there were no VM_MERGEABLE vmas with such addresses.
				1270	*/
				1271	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1272
				1273	spin_lock(&ksm_mmlist_lock);
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1274	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
				1275	struct mm_slot, mm_list);
				1276	if (ksm_scan.address == 0) {
				1277	/*
				1278	* We've completed a full scan of all vmas, holding mmap_sem
				1279	* throughout, and found no VM_MERGEABLE: so do the same as
				1280	* __ksm_exit does to remove this mm from all our lists now.
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1281	* This applies either when cleaning up after __ksm_exit
				1282	* (but beware: we can reach here even before __ksm_exit),
				1283	* or when all VM_MERGEABLE areas have been unmapped (and
				1284	* mmap_sem then protects against race with MADV_MERGEABLE).
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1285	*/
				1286	hlist_del(&slot->link);
				1287	list_del(&slot->mm_list);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1288	spin_unlock(&ksm_mmlist_lock);
				1289
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1290	free_mm_slot(slot);
				1291	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1292	up_read(&mm->mmap_sem);
				1293	mmdrop(mm);
				1294	} else {
				1295	spin_unlock(&ksm_mmlist_lock);
				1296	up_read(&mm->mmap_sem);
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1297	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1298
				1299	/* Repeat until we've completed scanning the whole list */
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1300	slot = ksm_scan.mm_slot;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1301	if (slot != &ksm_mm_head)
				1302	goto next_mm;
				1303
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1304	ksm_scan.seqnr++;
				1305	return NULL;
				1306	}
				1307
				1308	/**
				1309	* ksm_do_scan - the ksm scanner main worker function.
				1310	* @scan_npages - number of pages we want to scan before we return.
				1311	*/
				1312	static void ksm_do_scan(unsigned int scan_npages)
				1313	{
				1314	struct rmap_item *rmap_item;
				1315	struct page *page;
				1316
				1317	while (scan_npages--) {
				1318	cond_resched();
				1319	rmap_item = scan_get_next_rmap_item(&page);
				1320	if (!rmap_item)
				1321	return;
				1322	if (!PageKsm(page) \|\| !in_stable_tree(rmap_item))
				1323	cmp_and_merge_page(page, rmap_item);
Hugh Dickins	26465d3	2009-09-21 17:02:12 -0700	[diff] [blame]	1324	else if (page_mapcount(page) == 1) {
				1325	/*
				1326	* Replace now-unshared ksm page by ordinary page.
				1327	*/
				1328	break_cow(rmap_item->mm, rmap_item->address);
				1329	remove_rmap_item_from_tree(rmap_item);
				1330	rmap_item->oldchecksum = calc_checksum(page);
				1331	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1332	put_page(page);
				1333	}
				1334	}
				1335
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1336	static int ksmd_should_run(void)
				1337	{
				1338	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
				1339	}
				1340
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1341	static int ksm_scan_thread(void *nothing)
				1342	{
Izik Eidus	339aa62	2009-09-21 17:02:07 -0700	[diff] [blame]	1343	set_user_nice(current, 5);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1344
				1345	while (!kthread_should_stop()) {
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1346	mutex_lock(&ksm_thread_mutex);
				1347	if (ksmd_should_run())
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1348	ksm_do_scan(ksm_thread_pages_to_scan);
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1349	mutex_unlock(&ksm_thread_mutex);
				1350
				1351	if (ksmd_should_run()) {
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1352	schedule_timeout_interruptible(
				1353	msecs_to_jiffies(ksm_thread_sleep_millisecs));
				1354	} else {
				1355	wait_event_interruptible(ksm_thread_wait,
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1356	ksmd_should_run() \|\| kthread_should_stop());
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1357	}
				1358	}
				1359	return 0;
				1360	}
				1361
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1362	int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
				1363	unsigned long end, int advice, unsigned long *vm_flags)
				1364	{
				1365	struct mm_struct *mm = vma->vm_mm;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1366	int err;
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1367
				1368	switch (advice) {
				1369	case MADV_MERGEABLE:
				1370	/*
				1371	* Be somewhat over-protective for now!
				1372	*/
				1373	if (*vm_flags & (VM_MERGEABLE \| VM_SHARED \| VM_MAYSHARE \|
				1374	VM_PFNMAP \| VM_IO \| VM_DONTEXPAND \|
				1375	VM_RESERVED \| VM_HUGETLB \| VM_INSERTPAGE \|
				1376	VM_MIXEDMAP \| VM_SAO))
				1377	return 0; /* just ignore the advice */
				1378
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1379	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
				1380	err = __ksm_enter(mm);
				1381	if (err)
				1382	return err;
				1383	}
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1384
				1385	*vm_flags \|= VM_MERGEABLE;
				1386	break;
				1387
				1388	case MADV_UNMERGEABLE:
				1389	if (!(*vm_flags & VM_MERGEABLE))
				1390	return 0; /* just ignore the advice */
				1391
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1392	if (vma->anon_vma) {
				1393	err = unmerge_ksm_pages(vma, start, end);
				1394	if (err)
				1395	return err;
				1396	}
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1397
				1398	*vm_flags &= ~VM_MERGEABLE;
				1399	break;
				1400	}
				1401
				1402	return 0;
				1403	}
				1404
				1405	int __ksm_enter(struct mm_struct *mm)
				1406	{
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1407	struct mm_slot *mm_slot;
				1408	int needs_wakeup;
				1409
				1410	mm_slot = alloc_mm_slot();
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1411	if (!mm_slot)
				1412	return -ENOMEM;
				1413
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1414	/* Check ksm_run too? Would need tighter locking */
				1415	needs_wakeup = list_empty(&ksm_mm_head.mm_list);
				1416
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1417	spin_lock(&ksm_mmlist_lock);
				1418	insert_to_mm_slots_hash(mm, mm_slot);
				1419	/*
				1420	* Insert just behind the scanning cursor, to let the area settle
				1421	* down a little; when fork is followed by immediate exec, we don't
				1422	* want ksmd to waste time setting up and tearing down an rmap_list.
				1423	*/
				1424	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
				1425	spin_unlock(&ksm_mmlist_lock);
				1426
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1427	set_bit(MMF_VM_MERGEABLE, &mm->flags);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1428	atomic_inc(&mm->mm_count);
Hugh Dickins	6e158384	2009-09-21 17:02:14 -0700	[diff] [blame]	1429
				1430	if (needs_wakeup)
				1431	wake_up_interruptible(&ksm_thread_wait);
				1432
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1433	return 0;
				1434	}
				1435
Andrea Arcangeli	1c2fb7a	2009-09-21 17:02:22 -0700	[diff] [blame]	1436	void __ksm_exit(struct mm_struct *mm)
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1437	{
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1438	struct mm_slot *mm_slot;
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1439	int easy_to_free = 0;
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1440
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1441	/*
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1442	* This process is exiting: if it's straightforward (as is the
				1443	* case when ksmd was never running), free mm_slot immediately.
				1444	* But if it's at the cursor or has rmap_items linked to it, use
				1445	* mmap_sem to synchronize with any break_cows before pagetables
				1446	* are freed, and leave the mm_slot on the list for ksmd to free.
				1447	* Beware: ksm may already have noticed it exiting and freed the slot.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1448	*/
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1449
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1450	spin_lock(&ksm_mmlist_lock);
				1451	mm_slot = get_mm_slot(mm);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1452	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
				1453	if (list_empty(&mm_slot->rmap_list)) {
				1454	hlist_del(&mm_slot->link);
				1455	list_del(&mm_slot->mm_list);
				1456	easy_to_free = 1;
				1457	} else {
				1458	list_move(&mm_slot->mm_list,
				1459	&ksm_scan.mm_slot->mm_list);
				1460	}
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1461	}
Hugh Dickins	cd551f9	2009-09-21 17:02:17 -0700	[diff] [blame]	1462	spin_unlock(&ksm_mmlist_lock);
				1463
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1464	if (easy_to_free) {
				1465	free_mm_slot(mm_slot);
				1466	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				1467	mmdrop(mm);
				1468	} else if (mm_slot) {
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1469	down_write(&mm->mmap_sem);
				1470	up_write(&mm->mmap_sem);
Hugh Dickins	9ba6929	2009-09-21 17:02:20 -0700	[diff] [blame]	1471	}
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1472	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1473
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	1474	#ifdef CONFIG_SYSFS
				1475	/*
				1476	* This all compiles without CONFIG_SYSFS, but is a waste of space.
				1477	*/
				1478
/* Declare a read-only sysfs attribute <_name>_attr backed by <_name>_show */
#define KSM_ATTR_RO(_name)						\
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/* Declare a 0644 sysfs attribute <_name>_attr backed by <_name>_show/_store */
#define KSM_ATTR(_name)							\
	static struct kobj_attribute _name##_attr =			\
		__ATTR(_name, 0644, _name##_show, _name##_store)
				1484
				1485	static ssize_t sleep_millisecs_show(struct kobject *kobj,
				1486	struct kobj_attribute attr, char buf)
				1487	{
				1488	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
				1489	}
				1490
				1491	static ssize_t sleep_millisecs_store(struct kobject *kobj,
				1492	struct kobj_attribute *attr,
				1493	const char *buf, size_t count)
				1494	{
				1495	unsigned long msecs;
				1496	int err;
				1497
				1498	err = strict_strtoul(buf, 10, &msecs);
				1499	if (err \|\| msecs > UINT_MAX)
				1500	return -EINVAL;
				1501
				1502	ksm_thread_sleep_millisecs = msecs;
				1503
				1504	return count;
				1505	}
				1506	KSM_ATTR(sleep_millisecs);
				1507
				1508	static ssize_t pages_to_scan_show(struct kobject *kobj,
				1509	struct kobj_attribute attr, char buf)
				1510	{
				1511	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
				1512	}
				1513
				1514	static ssize_t pages_to_scan_store(struct kobject *kobj,
				1515	struct kobj_attribute *attr,
				1516	const char *buf, size_t count)
				1517	{
				1518	int err;
				1519	unsigned long nr_pages;
				1520
				1521	err = strict_strtoul(buf, 10, &nr_pages);
				1522	if (err \|\| nr_pages > UINT_MAX)
				1523	return -EINVAL;
				1524
				1525	ksm_thread_pages_to_scan = nr_pages;
				1526
				1527	return count;
				1528	}
				1529	KSM_ATTR(pages_to_scan);
				1530
				1531	static ssize_t run_show(struct kobject kobj, struct kobj_attribute attr,
				1532	char *buf)
				1533	{
				1534	return sprintf(buf, "%u\n", ksm_run);
				1535	}
				1536
				1537	static ssize_t run_store(struct kobject kobj, struct kobj_attribute attr,
				1538	const char *buf, size_t count)
				1539	{
				1540	int err;
				1541	unsigned long flags;
				1542
				1543	err = strict_strtoul(buf, 10, &flags);
				1544	if (err \|\| flags > UINT_MAX)
				1545	return -EINVAL;
				1546	if (flags > KSM_RUN_UNMERGE)
				1547	return -EINVAL;
				1548
				1549	/*
				1550	* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
				1551	* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1552	* breaking COW to free the unswappable pages_shared (but leaves
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1553	* mm_slots on the list for when ksmd may be set running again).
				1554	*/
				1555
				1556	mutex_lock(&ksm_thread_mutex);
				1557	if (ksm_run != flags) {
				1558	ksm_run = flags;
Hugh Dickins	d952b79	2009-09-21 17:02:16 -0700	[diff] [blame]	1559	if (flags & KSM_RUN_UNMERGE) {
				1560	err = unmerge_and_remove_all_rmap_items();
				1561	if (err) {
				1562	ksm_run = KSM_RUN_STOP;
				1563	count = err;
				1564	}
				1565	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1566	}
				1567	mutex_unlock(&ksm_thread_mutex);
				1568
				1569	if (flags & KSM_RUN_MERGE)
				1570	wake_up_interruptible(&ksm_thread_wait);
				1571
				1572	return count;
				1573	}
				1574	KSM_ATTR(run);
				1575
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1576	static ssize_t max_kernel_pages_store(struct kobject *kobj,
				1577	struct kobj_attribute *attr,
				1578	const char *buf, size_t count)
				1579	{
				1580	int err;
				1581	unsigned long nr_pages;
				1582
				1583	err = strict_strtoul(buf, 10, &nr_pages);
				1584	if (err)
				1585	return -EINVAL;
				1586
				1587	ksm_max_kernel_pages = nr_pages;
				1588
				1589	return count;
				1590	}
				1591
				1592	static ssize_t max_kernel_pages_show(struct kobject *kobj,
				1593	struct kobj_attribute attr, char buf)
				1594	{
				1595	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
				1596	}
				1597	KSM_ATTR(max_kernel_pages);
				1598
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1599	static ssize_t pages_shared_show(struct kobject *kobj,
				1600	struct kobj_attribute attr, char buf)
				1601	{
				1602	return sprintf(buf, "%lu\n", ksm_pages_shared);
				1603	}
				1604	KSM_ATTR_RO(pages_shared);
				1605
				1606	static ssize_t pages_sharing_show(struct kobject *kobj,
				1607	struct kobj_attribute attr, char buf)
				1608	{
Hugh Dickins	e178dfd	2009-09-21 17:02:10 -0700	[diff] [blame]	1609	return sprintf(buf, "%lu\n", ksm_pages_sharing);
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1610	}
				1611	KSM_ATTR_RO(pages_sharing);
				1612
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1613	static ssize_t pages_unshared_show(struct kobject *kobj,
				1614	struct kobj_attribute attr, char buf)
				1615	{
				1616	return sprintf(buf, "%lu\n", ksm_pages_unshared);
				1617	}
				1618	KSM_ATTR_RO(pages_unshared);
				1619
				1620	static ssize_t pages_volatile_show(struct kobject *kobj,
				1621	struct kobj_attribute attr, char buf)
				1622	{
				1623	long ksm_pages_volatile;
				1624
				1625	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				1626	- ksm_pages_sharing - ksm_pages_unshared;
				1627	/*
				1628	* It was not worth any locking to calculate that statistic,
				1629	* but it might therefore sometimes be negative: conceal that.
				1630	*/
				1631	if (ksm_pages_volatile < 0)
				1632	ksm_pages_volatile = 0;
				1633	return sprintf(buf, "%ld\n", ksm_pages_volatile);
				1634	}
				1635	KSM_ATTR_RO(pages_volatile);
				1636
				1637	static ssize_t full_scans_show(struct kobject *kobj,
				1638	struct kobj_attribute attr, char buf)
				1639	{
				1640	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
				1641	}
				1642	KSM_ATTR_RO(full_scans);
				1643
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1644	static struct attribute *ksm_attrs[] = {
				1645	&sleep_millisecs_attr.attr,
				1646	&pages_to_scan_attr.attr,
				1647	&run_attr.attr,
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1648	&max_kernel_pages_attr.attr,
Hugh Dickins	b402826	2009-09-21 17:02:09 -0700	[diff] [blame]	1649	&pages_shared_attr.attr,
				1650	&pages_sharing_attr.attr,
Hugh Dickins	473b0ce	2009-09-21 17:02:11 -0700	[diff] [blame]	1651	&pages_unshared_attr.attr,
				1652	&pages_volatile_attr.attr,
				1653	&full_scans_attr.attr,
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1654	NULL,
				1655	};
				1656
				1657	static struct attribute_group ksm_attr_group = {
				1658	.attrs = ksm_attrs,
				1659	.name = "ksm",
				1660	};
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	1661	#endif /* CONFIG_SYSFS */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1662
				1663	static int __init ksm_init(void)
				1664	{
				1665	struct task_struct *ksm_thread;
				1666	int err;
				1667
				1668	err = ksm_slab_init();
				1669	if (err)
				1670	goto out;
				1671
				1672	err = mm_slots_hash_init();
				1673	if (err)
				1674	goto out_free1;
				1675
				1676	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
				1677	if (IS_ERR(ksm_thread)) {
				1678	printk(KERN_ERR "ksm: creating kthread failed\n");
				1679	err = PTR_ERR(ksm_thread);
				1680	goto out_free2;
				1681	}
				1682
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	1683	#ifdef CONFIG_SYSFS
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1684	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
				1685	if (err) {
				1686	printk(KERN_ERR "ksm: register sysfs failed\n");
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	1687	kthread_stop(ksm_thread);
				1688	goto out_free2;
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1689	}
Hugh Dickins	2ffd867	2009-09-21 17:02:23 -0700	[diff] [blame]	1690	#endif /* CONFIG_SYSFS */
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1691
				1692	return 0;
				1693
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1694	out_free2:
				1695	mm_slots_hash_free();
				1696	out_free1:
				1697	ksm_slab_free();
				1698	out:
				1699	return err;
				1700	}
				1701	module_init(ksm_init)