/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *	Izik Eidus
 *	Andrea Arcangeli
 *	Chris Wright
 *	Hugh Dickins
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/jhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>

#include <asm/tlbflush.h>

/*
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a red-black tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 */
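
/*
 * For illustration, the per-page decision flow that results is roughly
 * the following sketch (see cmp_and_merge_page() below for the real
 * thing):
 *
 *	if (page matches a node in the stable tree)
 *		merge this page into that node
 *	else if (page's checksum changed since the previous scan)
 *		just record the new checksum: too volatile to merge
 *	else if (page matches a node in the unstable tree)
 *		merge the two pages, moving the node to the stable tree
 *	else
 *		insert page as a new node in the unstable tree
 */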

/**
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
 * @rmap_list: head for this mm_slot's list of rmap_items
 * @mm: the mm that this information is valid for
 */
struct mm_slot {
	struct hlist_node link;
	struct list_head mm_list;
	struct list_head rmap_list;
	struct mm_struct *mm;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_item: the current rmap that we are scanning inside the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;
	unsigned long address;
	struct rmap_item *rmap_item;
	unsigned long seqnr;
};

/**
 * struct rmap_item - reverse mapping item for virtual addresses
 * @link: link into mm_slot's rmap_list (rmap_list is per mm)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb_node of this rmap_item in either unstable or stable tree
 * @next: next rmap_item hanging off the same node of the stable tree
 * @prev: previous rmap_item hanging off the same node of the stable tree
 */
struct rmap_item {
	struct list_head link;
	struct mm_struct *mm;
	unsigned long address;		/* + low bits used for flags below */
	union {
		unsigned int oldchecksum;	/* when unstable */
		struct rmap_item *next;		/* when stable */
	};
	union {
		struct rb_node node;		/* when tree node */
		struct rmap_item *prev;		/* in stable list */
	};
};

#define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
#define NODE_FLAG	0x100	/* is a node of unstable or stable tree */
#define STABLE_FLAG	0x200	/* is a node or list item of stable tree */

/* The stable and unstable tree heads */
static struct rb_root root_stable_tree = RB_ROOT;
static struct rb_root root_unstable_tree = RB_ROOT;

#define MM_SLOTS_HASH_HEADS 1024
static struct hlist_head *mm_slots_hash;

static struct mm_slot ksm_mm_head = {
	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
};
static struct ksm_scan ksm_scan = {
	.mm_slot = &ksm_mm_head,
};

static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *mm_slot_cache;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* Limit on the number of unswappable pages used */
static unsigned long ksm_max_kernel_pages;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = 100;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

#define KSM_RUN_STOP	0
#define KSM_RUN_MERGE	1
#define KSM_RUN_UNMERGE	2
static unsigned int ksm_run = KSM_RUN_STOP;

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);

#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
		sizeof(struct __struct), __alignof__(struct __struct),\
		(__flags), NULL)
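
/*
 * For example, KSM_KMEM_CACHE(rmap_item, 0) expands to
 * kmem_cache_create("ksm_rmap_item", sizeof(struct rmap_item),
 *		     __alignof__(struct rmap_item), 0, NULL).
 */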

static void __init ksm_init_max_kernel_pages(void)
{
	ksm_max_kernel_pages = nr_free_buffer_pages() / 4;
}

static int __init ksm_slab_init(void)
{
	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
	if (!rmap_item_cache)
		goto out;

	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
	if (!mm_slot_cache)
		goto out_free;

	return 0;

out_free:
	kmem_cache_destroy(rmap_item_cache);
out:
	return -ENOMEM;
}

static void __init ksm_slab_free(void)
{
	kmem_cache_destroy(mm_slot_cache);
	kmem_cache_destroy(rmap_item_cache);
	mm_slot_cache = NULL;
}

static inline struct rmap_item *alloc_rmap_item(void)
{
	struct rmap_item *rmap_item;

	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
	if (rmap_item)
		ksm_rmap_items++;
	return rmap_item;
}

static inline void free_rmap_item(struct rmap_item *rmap_item)
{
	ksm_rmap_items--;
	rmap_item->mm = NULL;	/* debug safety */
	kmem_cache_free(rmap_item_cache, rmap_item);
}

static inline struct mm_slot *alloc_mm_slot(void)
{
	if (!mm_slot_cache)	/* initialization failed */
		return NULL;
	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
}

static inline void free_mm_slot(struct mm_slot *mm_slot)
{
	kmem_cache_free(mm_slot_cache, mm_slot);
}

static int __init mm_slots_hash_init(void)
{
	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				GFP_KERNEL);
	if (!mm_slots_hash)
		return -ENOMEM;
	return 0;
}

static void __init mm_slots_hash_free(void)
{
	kfree(mm_slots_hash);
}

static struct mm_slot *get_mm_slot(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	struct hlist_head *bucket;
	struct hlist_node *node;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	hlist_for_each_entry(mm_slot, node, bucket, link) {
		if (mm == mm_slot->mm)
			return mm_slot;
	}
	return NULL;
}

static void insert_to_mm_slots_hash(struct mm_struct *mm,
				    struct mm_slot *mm_slot)
{
	struct hlist_head *bucket;

	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				% MM_SLOTS_HASH_HEADS];
	mm_slot->mm = mm;
	INIT_LIST_HEAD(&mm_slot->rmap_list);
	hlist_add_head(&mm_slot->link, bucket);
}

static inline int in_stable_tree(struct rmap_item *rmap_item)
{
	return rmap_item->address & STABLE_FLAG;
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_sem briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
	return atomic_read(&mm->mm_users) == 0;
}

/*
 * We use break_ksm to break COW on a ksm page: it's a stripped down
 *
 *	if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
 *		put_page(page);
 *
 * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
	int ret = 0;

	do {
		cond_resched();
		page = follow_page(vma, addr, FOLL_GET);
		if (!page)
			break;
		if (PageKsm(page))
			ret = handle_mm_fault(vma->vm_mm, vma, addr,
							FAULT_FLAG_WRITE);
		else
			ret = VM_FAULT_WRITE;
		put_page(page);
	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
	/*
	 * We must loop because handle_mm_fault() may back out if there's
	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
	 *
	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
	 * COW has been broken, even if the vma does not permit VM_WRITE;
	 * but note that a concurrent fault might break PageKsm for us.
	 *
	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
	 * backing file, which also invalidates anonymous pages: that's
	 * okay, that truncation will have unmapped the PageKsm for us.
	 *
	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
	 * current task has TIF_MEMDIE set, and will be OOM killed on return
	 * to user; and ksmd, having no mm, would never be chosen for that.
	 *
	 * But if the mm is in a limited mem_cgroup, then the fault may fail
	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
	 * even ksmd can fail in this way - though it's usually breaking ksm
	 * just to undo a merge it made a moment before, so unlikely to oom.
	 *
	 * That's a pity: we might therefore have more kernel pages allocated
	 * than we're counting as nodes in the stable tree; but ksm_do_scan
	 * will retry to break_cow on each pass, so should recover the page
	 * in due course.  The important thing is to not let VM_MERGEABLE
	 * be cleared while any such pages might remain in the area.
	 */
	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

static void break_cow(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;
	break_ksm(vma, addr);
out:
	up_read(&mm->mmap_sem);
}

static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
	struct mm_struct *mm = rmap_item->mm;
	unsigned long addr = rmap_item->address;
	struct vm_area_struct *vma;
	struct page *page;

	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		goto out;
	vma = find_vma(mm, addr);
	if (!vma || vma->vm_start > addr)
		goto out;
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		goto out;

	page = follow_page(vma, addr, FOLL_GET);
	if (!page)
		goto out;
	if (PageAnon(page)) {
		flush_anon_page(vma, page, addr);
		flush_dcache_page(page);
	} else {
		put_page(page);
out:		page = NULL;
	}
	up_read(&mm->mmap_sem);
	return page;
}

/*
 * get_ksm_page: checks if the page at the virtual address in rmap_item
 * is still PageKsm, in which case we can trust the content of the page,
 * and it returns the gotten page; but NULL if the page has been zapped.
 */
static struct page *get_ksm_page(struct rmap_item *rmap_item)
{
	struct page *page;

	page = get_mergeable_page(rmap_item);
	if (page && !PageKsm(page)) {
		put_page(page);
		page = NULL;
	}
	return page;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
{
	if (in_stable_tree(rmap_item)) {
		struct rmap_item *next_item = rmap_item->next;

		if (rmap_item->address & NODE_FLAG) {
			if (next_item) {
				rb_replace_node(&rmap_item->node,
						&next_item->node,
						&root_stable_tree);
				next_item->address |= NODE_FLAG;
				ksm_pages_sharing--;
			} else {
				rb_erase(&rmap_item->node, &root_stable_tree);
				ksm_pages_shared--;
			}
		} else {
			struct rmap_item *prev_item = rmap_item->prev;

			BUG_ON(prev_item->next != rmap_item);
			prev_item->next = next_item;
			if (next_item) {
				BUG_ON(next_item->prev != rmap_item);
				next_item->prev = rmap_item->prev;
			}
			ksm_pages_sharing--;
		}

		rmap_item->next = NULL;

	} else if (rmap_item->address & NODE_FLAG) {
		unsigned char age;
		/*
		 * Usually ksmd can and must skip the rb_erase, because
		 * root_unstable_tree was already reset to RB_ROOT.
		 * But be careful when an mm is exiting: do the rb_erase
		 * if this rmap_item was inserted by this scan, rather
		 * than left over from before.
		 */
		age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
		BUG_ON(age > 1);
		if (!age)
			rb_erase(&rmap_item->node, &root_unstable_tree);
		ksm_pages_unshared--;
	}

	rmap_item->address &= PAGE_MASK;

	cond_resched();		/* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				       struct list_head *cur)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}
}

/*
 * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	unsigned long addr;
	int err = 0;

	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
		if (ksm_test_exit(vma->vm_mm))
			break;
		if (signal_pending(current))
			err = -ERESTARTSYS;
		else
			err = break_ksm(vma, addr);
	}
	return err;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
static int unmerge_and_remove_all_rmap_items(void)
{
	struct mm_slot *mm_slot;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	int err = 0;

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next,
						struct mm_slot, mm_list);
	spin_unlock(&ksm_mmlist_lock);

	for (mm_slot = ksm_scan.mm_slot;
			mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
		mm = mm_slot->mm;
		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (ksm_test_exit(mm))
				break;
			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
				continue;
			err = unmerge_ksm_pages(vma,
						vma->vm_start, vma->vm_end);
			if (err)
				goto error;
		}

		remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);

		spin_lock(&ksm_mmlist_lock);
		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
						struct mm_slot, mm_list);
		if (ksm_test_exit(mm)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			spin_unlock(&ksm_mmlist_lock);

			free_mm_slot(mm_slot);
			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
			up_read(&mm->mmap_sem);
			mmdrop(mm);
		} else {
			spin_unlock(&ksm_mmlist_lock);
			up_read(&mm->mmap_sem);
		}
	}

	ksm_scan.seqnr = 0;
	return 0;

error:
	up_read(&mm->mmap_sem);
	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = &ksm_mm_head;
	spin_unlock(&ksm_mmlist_lock);
	return err;
}
#endif /* CONFIG_SYSFS */

static u32 calc_checksum(struct page *page)
{
	u32 checksum;
	void *addr = kmap_atomic(page, KM_USER0);
	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
	kunmap_atomic(addr, KM_USER0);
	return checksum;
}

static int memcmp_pages(struct page *page1, struct page *page2)
{
	char *addr1, *addr2;
	int ret;

	addr1 = kmap_atomic(page1, KM_USER0);
	addr2 = kmap_atomic(page2, KM_USER1);
	ret = memcmp(addr1, addr2, PAGE_SIZE);
	kunmap_atomic(addr2, KM_USER1);
	kunmap_atomic(addr1, KM_USER0);
	return ret;
}

static inline int pages_identical(struct page *page1, struct page *page2)
{
	return !memcmp_pages(page1, page2);
}

static int write_protect_page(struct vm_area_struct *vma, struct page *page,
			      pte_t *orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long addr;
	pte_t *ptep;
	spinlock_t *ptl;
	int swapped;
	int err = -EFAULT;

	addr = page_address_in_vma(page, vma);
	if (addr == -EFAULT)
		goto out;

	ptep = page_check_address(page, mm, addr, &ptl, 0);
	if (!ptep)
		goto out;

	if (pte_write(*ptep)) {
		pte_t entry;

		swapped = PageSwapCache(page);
		flush_cache_page(vma, addr, page_to_pfn(page));
		/*
		 * Ok this is tricky: when get_user_pages_fast() runs it
		 * doesn't take any lock, therefore the check we are about
		 * to make on the page count against the map count is racy,
		 * and O_DIRECT can happen right after the check.
		 * So we clear the pte and flush the TLB before the check;
		 * this assures us that no O_DIRECT can start after the
		 * check or in the middle of the check.
		 */
		entry = ptep_clear_flush(vma, addr, ptep);
		/*
		 * Check that no O_DIRECT or similar I/O is in progress on
		 * the page.
		 */
		if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
			set_pte_at_notify(mm, addr, ptep, entry);
			goto out_unlock;
		}
		entry = pte_wrprotect(entry);
		set_pte_at_notify(mm, addr, ptep, entry);
	}
	*orig_pte = *ptep;
	err = 0;

out_unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma: vma that holds the pte pointing to oldpage
 * @oldpage: the page we are replacing by newpage
 * @newpage: the ksm page we replace oldpage by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
			struct page *newpage, pte_t orig_pte)
{
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long addr;
	pgprot_t prot;
	int err = -EFAULT;

	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);

	addr = page_address_in_vma(oldpage, vma);
	if (addr == -EFAULT)
		goto out;

	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		goto out;

	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
	if (!pte_same(*ptep, orig_pte)) {
		pte_unmap_unlock(ptep, ptl);
		goto out;
	}

	get_page(newpage);
	page_add_ksm_rmap(newpage);

	flush_cache_page(vma, addr, pte_pfn(*ptep));
	ptep_clear_flush(vma, addr, ptep);
	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));

	page_remove_rmap(oldpage);
	put_page(oldpage);

	pte_unmap_unlock(ptep, ptl);
	err = 0;
out:
	return err;
}

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing into oldpage
 * @oldpage: the page that we want to replace with newpage
 * @newpage: the page that we want to map instead of oldpage
 *
 * Note:
 * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
 * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *oldpage,
				 struct page *newpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (!(vma->vm_flags & VM_MERGEABLE))
		goto out;

	if (!PageAnon(oldpage))
		goto out;

	get_page(newpage);
	get_page(oldpage);

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().  We use trylock_page() instead of
	 * lock_page() because we don't want to wait here - we
	 * prefer to continue scanning and merging different pages,
	 * then come back to this page when it is unlocked.
	 */
	if (!trylock_page(oldpage))
		goto out_putpage;
	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected.  If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected.  But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, oldpage, &orig_pte)) {
		unlock_page(oldpage);
		goto out_putpage;
	}
	unlock_page(oldpage);

	if (pages_identical(oldpage, newpage))
		err = replace_page(vma, oldpage, newpage, orig_pte);

out_putpage:
	put_page(oldpage);
	put_page(newpage);
out:
	return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 */
static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				      unsigned long addr1,
				      struct page *page1,
				      struct page *kpage)
{
	struct vm_area_struct *vma;
	int err = -EFAULT;

	down_read(&mm1->mmap_sem);
	if (ksm_test_exit(mm1))
		goto out;

	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1)
		goto out;

	err = try_to_merge_one_page(vma, page1, kpage);
out:
	up_read(&mm1->mmap_sem);
	return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * This function returns 0 if we successfully mapped two identical pages
 * into one page, -EFAULT otherwise.
 *
 * Note that this function allocates a new kernel page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				  struct page *page1, struct mm_struct *mm2,
				  unsigned long addr2, struct page *page2)
{
	struct vm_area_struct *vma;
	struct page *kpage;
	int err = -EFAULT;

	/*
	 * The number of nodes in the stable tree
	 * is the number of kernel pages that we hold.
	 */
	if (ksm_max_kernel_pages &&
	    ksm_max_kernel_pages <= ksm_pages_shared)
		return err;

	kpage = alloc_page(GFP_HIGHUSER);
	if (!kpage)
		return err;

	down_read(&mm1->mmap_sem);
	if (ksm_test_exit(mm1)) {
		up_read(&mm1->mmap_sem);
		goto out;
	}
	vma = find_vma(mm1, addr1);
	if (!vma || vma->vm_start > addr1) {
		up_read(&mm1->mmap_sem);
		goto out;
	}

	copy_user_highpage(kpage, page1, addr1, vma);
	err = try_to_merge_one_page(vma, page1, kpage);
	up_read(&mm1->mmap_sem);

	if (!err) {
		err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
		/*
		 * If that fails, we have a ksm page with only one pte
		 * pointing to it: so break it.
		 */
		if (err)
			break_cow(mm1, addr1);
	}
out:
	put_page(kpage);
	return err;
}

/*
 * stable_tree_search - search page inside the stable tree
 * @page: the page that we are searching for identical pages to.
 * @page2: pointer into identical page that we are holding inside the stable
 *	   tree that we have found.
 * @rmap_item: the reverse mapping item
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * This function returns a pointer to the identical rmap_item if found,
 * NULL otherwise.
 */
static struct rmap_item *stable_tree_search(struct page *page,
					    struct page **page2,
					    struct rmap_item *rmap_item)
{
	struct rb_node *node = root_stable_tree.rb_node;

	while (node) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(node, struct rmap_item, node);
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			page2[0] = get_ksm_page(tree_rmap_item);
			if (page2[0])
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, page2[0]);

		if (ret < 0) {
			put_page(page2[0]);
			node = node->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			node = node->rb_right;
		} else {
			return tree_rmap_item;
		}
	}

	return NULL;
}

/*
 * stable_tree_insert - insert rmap_item pointing to new ksm page
 * into the stable tree.
 *
 * @page: the page that we are searching identical page to inside the stable
 *	  tree.
 * @rmap_item: pointer to the reverse mapping item.
 *
 * This function returns rmap_item on success, NULL otherwise.
 */
static struct rmap_item *stable_tree_insert(struct page *page,
					    struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_stable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item, *next_rmap_item;
		struct page *tree_page;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		while (tree_rmap_item) {
			BUG_ON(!in_stable_tree(tree_rmap_item));
			cond_resched();
			tree_page = get_ksm_page(tree_rmap_item);
			if (tree_page)
				break;
			next_rmap_item = tree_rmap_item->next;
			remove_rmap_item_from_tree(tree_rmap_item);
			tree_rmap_item = next_rmap_item;
		}
		if (!tree_rmap_item)
			return NULL;

		ret = memcmp_pages(page, tree_page);
		put_page(tree_page);

		parent = *new;
		if (ret < 0)
			new = &parent->rb_left;
		else if (ret > 0)
			new = &parent->rb_right;
		else {
			/*
			 * It is not a bug that stable_tree_search() didn't
			 * find this node: because at that time our page was
			 * not yet write-protected, so may have changed since.
			 */
			return NULL;
		}
	}

	rmap_item->address |= NODE_FLAG | STABLE_FLAG;
	rmap_item->next = NULL;
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_stable_tree);

	ksm_pages_shared++;
	return rmap_item;
}

/*
 * unstable_tree_search_insert - search and insert items into the unstable tree.
 *
 * @page: the page that we are going to search for identical page or to insert
 *	  into the unstable tree
 * @page2: pointer into identical page that was found inside the unstable tree
 * @rmap_item: the reverse mapping item of page
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
 * tree, we insert rmap_item as a new object into the unstable tree.
 *
 * This function returns a pointer to the rmap_item found to be identical
 * to the currently scanned page, NULL otherwise.
 *
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
static struct rmap_item *unstable_tree_search_insert(struct page *page,
						     struct page **page2,
						     struct rmap_item *rmap_item)
{
	struct rb_node **new = &root_unstable_tree.rb_node;
	struct rb_node *parent = NULL;

	while (*new) {
		struct rmap_item *tree_rmap_item;
		int ret;

		tree_rmap_item = rb_entry(*new, struct rmap_item, node);
		page2[0] = get_mergeable_page(tree_rmap_item);
		if (!page2[0])
			return NULL;

		/*
		 * Don't substitute an unswappable ksm page
		 * just for one good swappable forked page.
		 */
		if (page == page2[0]) {
			put_page(page2[0]);
			return NULL;
		}

		ret = memcmp_pages(page, page2[0]);

		parent = *new;
		if (ret < 0) {
			put_page(page2[0]);
			new = &parent->rb_left;
		} else if (ret > 0) {
			put_page(page2[0]);
			new = &parent->rb_right;
		} else {
			return tree_rmap_item;
		}
	}

	rmap_item->address |= NODE_FLAG;
	rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
	rb_link_node(&rmap_item->node, parent, new);
	rb_insert_color(&rmap_item->node, &root_unstable_tree);

	ksm_pages_unshared++;
	return NULL;
}

/*
 * stable_tree_append - add another rmap_item to the linked list of
 * rmap_items hanging off a given node of the stable tree, all sharing
 * the same ksm page.
 */
static void stable_tree_append(struct rmap_item *rmap_item,
			       struct rmap_item *tree_rmap_item)
{
	rmap_item->next = tree_rmap_item->next;
	rmap_item->prev = tree_rmap_item;

	if (tree_rmap_item->next)
		tree_rmap_item->next->prev = rmap_item;

	tree_rmap_item->next = rmap_item;
	rmap_item->address |= STABLE_FLAG;

	ksm_pages_sharing++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching an identical page for.
 * @rmap_item: the reverse mapping into the virtual address of this page
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct page *page2[1];
	struct rmap_item *tree_rmap_item;
	unsigned int checksum;
	int err;

	if (in_stable_tree(rmap_item))
		remove_rmap_item_from_tree(rmap_item);

	/* We first start by searching for the page inside the stable tree */
	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
	if (tree_rmap_item) {
		if (page == page2[0])			/* forked */
			err = 0;
		else
			err = try_to_merge_with_ksm_page(rmap_item->mm,
							 rmap_item->address,
							 page, page2[0]);
		put_page(page2[0]);

		if (!err) {
			/*
			 * The page was successfully merged:
			 * add its rmap_item to the stable tree.
			 */
			stable_tree_append(rmap_item, tree_rmap_item);
		}
		return;
	}

	/*
	 * A ksm page might have got here by fork, but its other
	 * references have already been removed from the stable tree.
	 * Or it might be left over from a break_ksm which failed
	 * when the mem_cgroup had reached its limit: try again now.
	 */
	if (PageKsm(page))
		break_cow(rmap_item->mm, rmap_item->address);

	/*
	 * If the hash value of the page has changed since the last time
	 * we calculated it, this page is changing frequently: therefore we
	 * don't want to insert it into the unstable tree, and we don't want
	 * to waste our time searching for something identical to it there.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
	if (tree_rmap_item) {
		err = try_to_merge_two_pages(rmap_item->mm,
					     rmap_item->address, page,
					     tree_rmap_item->mm,
					     tree_rmap_item->address, page2[0]);
		/*
		 * As soon as we merge this page, we want to remove the
		 * rmap_item of the page we have merged with from the unstable
		 * tree, and insert it instead as new node in the stable tree.
		 */
		if (!err) {
			rb_erase(&tree_rmap_item->node, &root_unstable_tree);
			tree_rmap_item->address &= ~NODE_FLAG;
			ksm_pages_unshared--;

			/*
			 * If we fail to insert the page into the stable tree,
			 * we will have 2 virtual addresses that are pointing
			 * to a ksm page left outside the stable tree,
			 * in which case we need to break_cow on both.
			 */
			if (stable_tree_insert(page2[0], tree_rmap_item))
				stable_tree_append(rmap_item, tree_rmap_item);
			else {
				break_cow(tree_rmap_item->mm,
						tree_rmap_item->address);
				break_cow(rmap_item->mm, rmap_item->address);
			}
		}

		put_page(page2[0]);
	}
}

static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
					    struct list_head *cur,
					    unsigned long addr)
{
	struct rmap_item *rmap_item;

	while (cur != &mm_slot->rmap_list) {
		rmap_item = list_entry(cur, struct rmap_item, link);
		if ((rmap_item->address & PAGE_MASK) == addr) {
			if (!in_stable_tree(rmap_item))
				remove_rmap_item_from_tree(rmap_item);
			return rmap_item;
		}
		if (rmap_item->address > addr)
			break;
		cur = cur->next;
		remove_rmap_item_from_tree(rmap_item);
		list_del(&rmap_item->link);
		free_rmap_item(rmap_item);
	}

	rmap_item = alloc_rmap_item();
	if (rmap_item) {
		/* It has already been zeroed */
		rmap_item->mm = mm_slot->mm;
		rmap_item->address = addr;
		list_add_tail(&rmap_item->link, cur);
	}
	return rmap_item;
}

static struct rmap_item *scan_get_next_rmap_item(struct page **page)
{
	struct mm_struct *mm;
	struct mm_slot *slot;
	struct vm_area_struct *vma;
	struct rmap_item *rmap_item;

	if (list_empty(&ksm_mm_head.mm_list))
		return NULL;

	slot = ksm_scan.mm_slot;
	if (slot == &ksm_mm_head) {
		root_unstable_tree = RB_ROOT;

		spin_lock(&ksm_mmlist_lock);
		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
		ksm_scan.mm_slot = slot;
		spin_unlock(&ksm_mmlist_lock);
next_mm:
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}

	mm = slot->mm;
	down_read(&mm->mmap_sem);
	if (ksm_test_exit(mm))
		vma = NULL;
	else
		vma = find_vma(mm, ksm_scan.address);

	for (; vma; vma = vma->vm_next) {
		if (!(vma->vm_flags & VM_MERGEABLE))
			continue;
		if (ksm_scan.address < vma->vm_start)
			ksm_scan.address = vma->vm_start;
		if (!vma->anon_vma)
			ksm_scan.address = vma->vm_end;

		while (ksm_scan.address < vma->vm_end) {
			if (ksm_test_exit(mm))
				break;
			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
			if (*page && PageAnon(*page)) {
				flush_anon_page(vma, *page, ksm_scan.address);
				flush_dcache_page(*page);
				rmap_item = get_next_rmap_item(slot,
					ksm_scan.rmap_item->link.next,
					ksm_scan.address);
				if (rmap_item) {
					ksm_scan.rmap_item = rmap_item;
					ksm_scan.address += PAGE_SIZE;
				} else
					put_page(*page);
				up_read(&mm->mmap_sem);
				return rmap_item;
			}
			if (*page)
				put_page(*page);
			ksm_scan.address += PAGE_SIZE;
			cond_resched();
		}
	}

	if (ksm_test_exit(mm)) {
		ksm_scan.address = 0;
		ksm_scan.rmap_item = list_entry(&slot->rmap_list,
						struct rmap_item, link);
	}
	/*
	 * Nuke all the rmap_items that are above this current rmap:
	 * because there were no VM_MERGEABLE vmas with such addresses.
	 */
	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);

	spin_lock(&ksm_mmlist_lock);
	ksm_scan.mm_slot = list_entry(slot->mm_list.next,
						struct mm_slot, mm_list);
	if (ksm_scan.address == 0) {
		/*
		 * We've completed a full scan of all vmas, holding mmap_sem
		 * throughout, and found no VM_MERGEABLE: so do the same as
		 * __ksm_exit does to remove this mm from all our lists now.
		 * This applies either when cleaning up after __ksm_exit
		 * (but beware: we can reach here even before __ksm_exit),
		 * or when all VM_MERGEABLE areas have been unmapped (and
		 * mmap_sem then protects against race with MADV_MERGEABLE).
		 */
		hlist_del(&slot->link);
		list_del(&slot->mm_list);
		spin_unlock(&ksm_mmlist_lock);

		free_mm_slot(slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		up_read(&mm->mmap_sem);
		mmdrop(mm);
	} else {
		spin_unlock(&ksm_mmlist_lock);
		up_read(&mm->mmap_sem);
	}

	/* Repeat until we've completed scanning the whole list */
	slot = ksm_scan.mm_slot;
	if (slot != &ksm_mm_head)
		goto next_mm;

	ksm_scan.seqnr++;
	return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages: number of pages we want to scan before we return.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *page;

	while (scan_npages--) {
		cond_resched();
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		if (!PageKsm(page) || !in_stable_tree(rmap_item))
			cmp_and_merge_page(page, rmap_item);
		else if (page_mapcount(page) == 1) {
			/*
			 * Replace now-unshared ksm page by ordinary page.
			 */
			break_cow(rmap_item->mm, rmap_item->address);
			remove_rmap_item_from_tree(rmap_item);
			rmap_item->oldchecksum = calc_checksum(page);
		}
		put_page(page);
	}
}

static int ksmd_should_run(void)
{
	return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list);
}

static int ksm_scan_thread(void *nothing)
{
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		if (ksmd_should_run())
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_interruptible(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		/*
		 * Be somewhat over-protective for now!
		 */
		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
				 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
				 VM_MIXEDMAP  | VM_SAO))
			return 0;		/* just ignore the advice */

		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;		/* just ignore the advice */

		if (vma->anon_vma) {
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}
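
/*
 * For illustration only (not part of this file): userspace reaches
 * ksm_madvise() above through the madvise(2) system call -- a minimal
 * sketch, assuming <sys/mman.h> defines MADV_MERGEABLE on this kernel:
 *
 *	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	if (madvise(buf, len, MADV_MERGEABLE) != 0)
 *		perror("madvise");
 *
 * madvise() applies the advice to each vma in [buf, buf+len), setting
 * VM_MERGEABLE and registering the mm with ksmd via __ksm_enter().
 */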

int __ksm_enter(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int needs_wakeup;

	mm_slot = alloc_mm_slot();
	if (!mm_slot)
		return -ENOMEM;

	/* Check ksm_run too?  Would need tighter locking */
	needs_wakeup = list_empty(&ksm_mm_head.mm_list);

	spin_lock(&ksm_mmlist_lock);
	insert_to_mm_slots_hash(mm, mm_slot);
	/*
	 * Insert just behind the scanning cursor, to let the area settle
	 * down a little; when fork is followed by immediate exec, we don't
	 * want ksmd to waste time setting up and tearing down an rmap_list.
	 */
	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
	spin_unlock(&ksm_mmlist_lock);

	set_bit(MMF_VM_MERGEABLE, &mm->flags);
	atomic_inc(&mm->mm_count);

	if (needs_wakeup)
		wake_up_interruptible(&ksm_thread_wait);

	return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
	struct mm_slot *mm_slot;
	int easy_to_free = 0;

	/*
	 * This process is exiting: if it's straightforward (as is the
	 * case when ksmd was never running), free mm_slot immediately.
	 * But if it's at the cursor or has rmap_items linked to it, use
	 * mmap_sem to synchronize with any break_cows before pagetables
	 * are freed, and leave the mm_slot on the list for ksmd to free.
	 * Beware: ksm may already have noticed it exiting and freed the slot.
	 */

	spin_lock(&ksm_mmlist_lock);
	mm_slot = get_mm_slot(mm);
	if (mm_slot && ksm_scan.mm_slot != mm_slot) {
		if (list_empty(&mm_slot->rmap_list)) {
			hlist_del(&mm_slot->link);
			list_del(&mm_slot->mm_list);
			easy_to_free = 1;
		} else {
			list_move(&mm_slot->mm_list,
				  &ksm_scan.mm_slot->mm_list);
		}
	}
	spin_unlock(&ksm_mmlist_lock);

	if (easy_to_free) {
		free_mm_slot(mm_slot);
		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
		mmdrop(mm);
	} else if (mm_slot) {
		down_write(&mm->mmap_sem);
		up_write(&mm->mmap_sem);
	}
}

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

#define KSM_ATTR_RO(_name) \
	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
	static struct kobj_attribute _name##_attr = \
		__ATTR(_name, 0644, _name##_show, _name##_store)

static ssize_t sleep_millisecs_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
}

static ssize_t sleep_millisecs_store(struct kobject *kobj,
				     struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	unsigned long msecs;
	int err;

	err = strict_strtoul(buf, 10, &msecs);
	if (err || msecs > UINT_MAX)
		return -EINVAL;

	ksm_thread_sleep_millisecs = msecs;

	return count;
}
KSM_ATTR(sleep_millisecs);

static ssize_t pages_to_scan_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
}

static ssize_t pages_to_scan_store(struct kobject *kobj,
				   struct kobj_attribute *attr,
				   const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err || nr_pages > UINT_MAX)
		return -EINVAL;

	ksm_thread_pages_to_scan = nr_pages;

	return count;
}
KSM_ATTR(pages_to_scan);

static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
			char *buf)
{
	return sprintf(buf, "%u\n", ksm_run);
}

static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
			 const char *buf, size_t count)
{
	int err;
	unsigned long flags;

	err = strict_strtoul(buf, 10, &flags);
	if (err || flags > UINT_MAX)
		return -EINVAL;
	if (flags > KSM_RUN_UNMERGE)
		return -EINVAL;

	/*
	 * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
	 * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
	 * breaking COW to free the unswappable pages_shared (but leaves
	 * mm_slots on the list for when ksmd may be set running again).
	 */

	mutex_lock(&ksm_thread_mutex);
	if (ksm_run != flags) {
		ksm_run = flags;
		if (flags & KSM_RUN_UNMERGE) {
			current->flags |= PF_OOM_ORIGIN;
			err = unmerge_and_remove_all_rmap_items();
			current->flags &= ~PF_OOM_ORIGIN;
			if (err) {
				ksm_run = KSM_RUN_STOP;
				count = err;
			}
		}
	}
	mutex_unlock(&ksm_thread_mutex);

	if (flags & KSM_RUN_MERGE)
		wake_up_interruptible(&ksm_thread_wait);

	return count;
}
KSM_ATTR(run);

static ssize_t max_kernel_pages_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int err;
	unsigned long nr_pages;

	err = strict_strtoul(buf, 10, &nr_pages);
	if (err)
		return -EINVAL;

	ksm_max_kernel_pages = nr_pages;

	return count;
}

static ssize_t max_kernel_pages_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
}
KSM_ATTR(max_kernel_pages);

static ssize_t pages_shared_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);

static ssize_t pages_sharing_show(struct kobject *kobj,
				  struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);

static ssize_t pages_unshared_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
				   struct kobj_attribute *attr, char *buf)
{
	long ksm_pages_volatile;

	ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
				- ksm_pages_sharing - ksm_pages_unshared;
	/*
	 * It was not worth any locking to calculate that statistic,
	 * but it might therefore sometimes be negative: conceal that.
	 */
	if (ksm_pages_volatile < 0)
		ksm_pages_volatile = 0;
	return sprintf(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

static ssize_t full_scans_show(struct kobject *kobj,
			       struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);

static struct attribute *ksm_attrs[] = {
	&sleep_millisecs_attr.attr,
	&pages_to_scan_attr.attr,
	&run_attr.attr,
	&max_kernel_pages_attr.attr,
	&pages_shared_attr.attr,
	&pages_sharing_attr.attr,
	&pages_unshared_attr.attr,
	&pages_volatile_attr.attr,
	&full_scans_attr.attr,
	NULL,
};

static struct attribute_group ksm_attr_group = {
	.attrs = ksm_attrs,
	.name = "ksm",
};
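
/*
 * Example usage from userspace once this group is registered on mm_kobj
 * (a sketch; the files then appear under /sys/kernel/mm/ksm/):
 *
 *	echo 100 > /sys/kernel/mm/ksm/pages_to_scan
 *	echo 1 > /sys/kernel/mm/ksm/run
 *	cat /sys/kernel/mm/ksm/pages_sharing
 */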
#endif /* CONFIG_SYSFS */

static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	ksm_init_max_kernel_pages();

	err = ksm_slab_init();
	if (err)
		goto out;

	err = mm_slots_hash_init();
	if (err)
		goto out_free1;

	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		printk(KERN_ERR "ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free2;
	}

#ifdef CONFIG_SYSFS
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		printk(KERN_ERR "ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free2;
	}
#endif /* CONFIG_SYSFS */

	return 0;

out_free2:
	mm_slots_hash_free();
out_free1:
	ksm_slab_free();
out:
	return err;
}
module_init(ksm_init)