Blame - mm/ksm.c - kernel/msm-4.9

blob: cf072c54df3297dbe956921a92d27a82543cbd86 [file] [log] [blame]

Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1	/*
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	2	* Memory merging support.
				3	*
				4	* This code enables dynamic sharing of identical pages found in different
				5	* memory areas, even if they are not shared by fork()
				6	*
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	7	* Copyright (C) 2008-2009 Red Hat, Inc.
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	8	* Authors:
				9	* Izik Eidus
				10	* Andrea Arcangeli
				11	* Chris Wright
Izik Eidus	36b2528	2009-09-21 17:02:06 -0700	[diff] [blame]	12	* Hugh Dickins
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	13	*
				14	* This work is licensed under the terms of the GNU GPL, version 2.
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	15	*/
				16
				17	#include <linux/errno.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	18	#include <linux/mm.h>
				19	#include <linux/fs.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	20	#include <linux/mman.h>
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	21	#include <linux/sched.h>
				22	#include <linux/rwsem.h>
				23	#include <linux/pagemap.h>
				24	#include <linux/rmap.h>
				25	#include <linux/spinlock.h>
				26	#include <linux/jhash.h>
				27	#include <linux/delay.h>
				28	#include <linux/kthread.h>
				29	#include <linux/wait.h>
				30	#include <linux/slab.h>
				31	#include <linux/rbtree.h>
				32	#include <linux/mmu_notifier.h>
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	33	#include <linux/ksm.h>
				34
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	35	#include <asm/tlbflush.h>
				36
				37	/*
				38	* A few notes about the KSM scanning process,
				39	* to make it easier to understand the data structures below:
				40	*
				41	* In order to reduce excessive scanning, KSM sorts the memory pages by their
				42	* contents into a data structure that holds pointers to the pages' locations.
				43	*
				44	* Since the contents of the pages may change at any moment, KSM cannot just
				45	* insert the pages into a normal sorted tree and expect it to find anything.
				46	* Therefore KSM uses two data structures - the stable and the unstable tree.
				47	*
				48	* The stable tree holds pointers to all the merged pages (ksm pages), sorted
				49	* by their contents. Because each such page is write-protected, searching on
				50	* this tree is fully assured to be working (except when pages are unmapped),
				51	* and therefore this tree is called the stable tree.
				52	*
				53	* In addition to the stable tree, KSM uses a second data structure called the
				54	* unstable tree: this tree holds pointers to pages which have been found to
				55	* be "unchanged for a period of time". The unstable tree sorts these pages
				56	* by their contents, but since they are not write-protected, KSM cannot rely
				57	* upon the unstable tree to work correctly - the unstable tree is liable to
				58	* be corrupted as its contents are modified, and so it is called unstable.
				59	*
				60	* KSM solves this problem by several techniques:
				61	*
				62	* 1) The unstable tree is flushed every time KSM completes scanning all
				63	* memory areas, and then the tree is rebuilt again from the beginning.
				64	* 2) KSM will only insert into the unstable tree, pages whose hash value
				65	* has not changed since the previous scan of all memory areas.
				66	* 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
				67	* colors of the nodes and not on their contents, assuring that even when
				68	* the tree gets "corrupted" it won't get out of balance, so scanning time
				69	* remains the same (also, searching and inserting nodes in an rbtree uses
				70	* the same algorithm, so we have no overhead when we flush and rebuild).
				71	* 4) KSM never flushes the stable tree, which means that even if it were to
				72	* take 10 attempts to find a page in the unstable tree, once it is found,
				73	* it is secured in the stable tree. (When we scan a new page, we first
				74	* compare it against the stable tree, and then against the unstable tree.)
				75	*/
				76
				77	/**
				78	* struct mm_slot - ksm information per mm that is being scanned
				79	* @link: link to the mm_slots hash list
				80	* @mm_list: link into the mm_slots list, rooted in ksm_mm_head
				81	* @rmap_list: head for this mm_slot's list of rmap_items
				82	* @mm: the mm that this information is valid for
				83	*/
				84	struct mm_slot {
				85	struct hlist_node link;
				86	struct list_head mm_list;
				87	struct list_head rmap_list;
				88	struct mm_struct *mm;
				89	};
				90
				91	/**
				92	* struct ksm_scan - cursor for scanning
				93	* @mm_slot: the current mm_slot we are scanning
				94	* @address: the next address inside that to be scanned
				95	* @rmap_item: the current rmap that we are scanning inside the rmap_list
				96	* @seqnr: count of completed full scans (needed when removing unstable node)
				97	*
				98	* There is only the one ksm_scan instance of this cursor structure.
				99	*/
				100	struct ksm_scan {
				101	struct mm_slot *mm_slot;
				102	unsigned long address;
				103	struct rmap_item *rmap_item;
				104	unsigned long seqnr;
				105	};
				106
				107	/**
				108	* struct rmap_item - reverse mapping item for virtual addresses
				109	* @link: link into mm_slot's rmap_list (rmap_list is per mm)
				110	* @mm: the memory structure this rmap_item is pointing into
				111	* @address: the virtual address this rmap_item tracks (+ flags in low bits)
				112	* @oldchecksum: previous checksum of the page at that virtual address
				113	* @node: rb_node of this rmap_item in either unstable or stable tree
				114	* @next: next rmap_item hanging off the same node of the stable tree
				115	* @prev: previous rmap_item hanging off the same node of the stable tree
				116	*/
				117	struct rmap_item {
				118	struct list_head link;
				119	struct mm_struct *mm;
				120	unsigned long address; /* + low bits used for flags below */
				121	union {
				122	unsigned int oldchecksum; /* when unstable */
				123	struct rmap_item next; / when stable */
				124	};
				125	union {
				126	struct rb_node node; /* when tree node */
				127	struct rmap_item prev; / in stable list */
				128	};
				129	};
				130
				131	#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */
				132	#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */
				133	#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */
				134
				135	/* The stable and unstable tree heads */
				136	static struct rb_root root_stable_tree = RB_ROOT;
				137	static struct rb_root root_unstable_tree = RB_ROOT;
				138
				139	#define MM_SLOTS_HASH_HEADS 1024
				140	static struct hlist_head *mm_slots_hash;
				141
				142	static struct mm_slot ksm_mm_head = {
				143	.mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
				144	};
				145	static struct ksm_scan ksm_scan = {
				146	.mm_slot = &ksm_mm_head,
				147	};
				148
				149	static struct kmem_cache *rmap_item_cache;
				150	static struct kmem_cache *mm_slot_cache;
				151
				152	/* The number of nodes in the stable tree */
				153	static unsigned long ksm_kernel_pages_allocated;
				154
				155	/* The number of page slots sharing those nodes */
				156	static unsigned long ksm_pages_shared;
				157
				158	/* Limit on the number of unswappable pages used */
				159	static unsigned long ksm_max_kernel_pages;
				160
				161	/* Number of pages ksmd should scan in one batch */
				162	static unsigned int ksm_thread_pages_to_scan;
				163
				164	/* Milliseconds ksmd should sleep between batches */
				165	static unsigned int ksm_thread_sleep_millisecs;
				166
				167	#define KSM_RUN_STOP 0
				168	#define KSM_RUN_MERGE 1
				169	#define KSM_RUN_UNMERGE 2
				170	static unsigned int ksm_run;
				171
				172	static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
				173	static DEFINE_MUTEX(ksm_thread_mutex);
				174	static DEFINE_SPINLOCK(ksm_mmlist_lock);
				175
				176	#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\
				177	sizeof(struct __struct), __alignof__(struct __struct),\
				178	(__flags), NULL)
				179
				180	static int __init ksm_slab_init(void)
				181	{
				182	rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0);
				183	if (!rmap_item_cache)
				184	goto out;
				185
				186	mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
				187	if (!mm_slot_cache)
				188	goto out_free;
				189
				190	return 0;
				191
				192	out_free:
				193	kmem_cache_destroy(rmap_item_cache);
				194	out:
				195	return -ENOMEM;
				196	}
				197
				198	static void __init ksm_slab_free(void)
				199	{
				200	kmem_cache_destroy(mm_slot_cache);
				201	kmem_cache_destroy(rmap_item_cache);
				202	mm_slot_cache = NULL;
				203	}
				204
				205	static inline struct rmap_item *alloc_rmap_item(void)
				206	{
				207	return kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
				208	}
				209
				210	static inline void free_rmap_item(struct rmap_item *rmap_item)
				211	{
				212	rmap_item->mm = NULL; /* debug safety */
				213	kmem_cache_free(rmap_item_cache, rmap_item);
				214	}
				215
				216	static inline struct mm_slot *alloc_mm_slot(void)
				217	{
				218	if (!mm_slot_cache) /* initialization failed */
				219	return NULL;
				220	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
				221	}
				222
				223	static inline void free_mm_slot(struct mm_slot *mm_slot)
				224	{
				225	kmem_cache_free(mm_slot_cache, mm_slot);
				226	}
				227
				228	static int __init mm_slots_hash_init(void)
				229	{
				230	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
				231	GFP_KERNEL);
				232	if (!mm_slots_hash)
				233	return -ENOMEM;
				234	return 0;
				235	}
				236
				237	static void __init mm_slots_hash_free(void)
				238	{
				239	kfree(mm_slots_hash);
				240	}
				241
				242	static struct mm_slot get_mm_slot(struct mm_struct mm)
				243	{
				244	struct mm_slot *mm_slot;
				245	struct hlist_head *bucket;
				246	struct hlist_node *node;
				247
				248	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				249	% MM_SLOTS_HASH_HEADS];
				250	hlist_for_each_entry(mm_slot, node, bucket, link) {
				251	if (mm == mm_slot->mm)
				252	return mm_slot;
				253	}
				254	return NULL;
				255	}
				256
				257	static void insert_to_mm_slots_hash(struct mm_struct *mm,
				258	struct mm_slot *mm_slot)
				259	{
				260	struct hlist_head *bucket;
				261
				262	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
				263	% MM_SLOTS_HASH_HEADS];
				264	mm_slot->mm = mm;
				265	INIT_LIST_HEAD(&mm_slot->rmap_list);
				266	hlist_add_head(&mm_slot->link, bucket);
				267	}
				268
				269	static inline int in_stable_tree(struct rmap_item *rmap_item)
				270	{
				271	return rmap_item->address & STABLE_FLAG;
				272	}
				273
				274	/*
				275	* We use break_ksm to break COW on a ksm page: it's a stripped down
				276	*
				277	* if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
				278	* put_page(page);
				279	*
				280	* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
				281	* in case the application has unmapped and remapped mm,addr meanwhile.
				282	* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
				283	* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
				284	*/
				285	static void break_ksm(struct vm_area_struct *vma, unsigned long addr)
				286	{
				287	struct page *page;
				288	int ret;
				289
				290	do {
				291	cond_resched();
				292	page = follow_page(vma, addr, FOLL_GET);
				293	if (!page)
				294	break;
				295	if (PageKsm(page))
				296	ret = handle_mm_fault(vma->vm_mm, vma, addr,
				297	FAULT_FLAG_WRITE);
				298	else
				299	ret = VM_FAULT_WRITE;
				300	put_page(page);
				301	} while (!(ret & (VM_FAULT_WRITE \| VM_FAULT_SIGBUS)));
				302
				303	/* Which leaves us looping there if VM_FAULT_OOM: hmmm... */
				304	}
				305
				306	static void __break_cow(struct mm_struct *mm, unsigned long addr)
				307	{
				308	struct vm_area_struct *vma;
				309
				310	vma = find_vma(mm, addr);
				311	if (!vma \|\| vma->vm_start > addr)
				312	return;
				313	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
				314	return;
				315	break_ksm(vma, addr);
				316	}
				317
				318	static void break_cow(struct mm_struct *mm, unsigned long addr)
				319	{
				320	down_read(&mm->mmap_sem);
				321	__break_cow(mm, addr);
				322	up_read(&mm->mmap_sem);
				323	}
				324
				325	static struct page get_mergeable_page(struct rmap_item rmap_item)
				326	{
				327	struct mm_struct *mm = rmap_item->mm;
				328	unsigned long addr = rmap_item->address;
				329	struct vm_area_struct *vma;
				330	struct page *page;
				331
				332	down_read(&mm->mmap_sem);
				333	vma = find_vma(mm, addr);
				334	if (!vma \|\| vma->vm_start > addr)
				335	goto out;
				336	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
				337	goto out;
				338
				339	page = follow_page(vma, addr, FOLL_GET);
				340	if (!page)
				341	goto out;
				342	if (PageAnon(page)) {
				343	flush_anon_page(vma, page, addr);
				344	flush_dcache_page(page);
				345	} else {
				346	put_page(page);
				347	out: page = NULL;
				348	}
				349	up_read(&mm->mmap_sem);
				350	return page;
				351	}
				352
				353	/*
				354	* get_ksm_page: checks if the page at the virtual address in rmap_item
				355	* is still PageKsm, in which case we can trust the content of the page,
				356	* and it returns the gotten page; but NULL if the page has been zapped.
				357	*/
				358	static struct page get_ksm_page(struct rmap_item rmap_item)
				359	{
				360	struct page *page;
				361
				362	page = get_mergeable_page(rmap_item);
				363	if (page && !PageKsm(page)) {
				364	put_page(page);
				365	page = NULL;
				366	}
				367	return page;
				368	}
				369
				370	/*
				371	* Removing rmap_item from stable or unstable tree.
				372	* This function will clean the information from the stable/unstable tree.
				373	*/
				374	static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
				375	{
				376	if (in_stable_tree(rmap_item)) {
				377	struct rmap_item *next_item = rmap_item->next;
				378
				379	if (rmap_item->address & NODE_FLAG) {
				380	if (next_item) {
				381	rb_replace_node(&rmap_item->node,
				382	&next_item->node,
				383	&root_stable_tree);
				384	next_item->address \|= NODE_FLAG;
				385	} else {
				386	rb_erase(&rmap_item->node, &root_stable_tree);
				387	ksm_kernel_pages_allocated--;
				388	}
				389	} else {
				390	struct rmap_item *prev_item = rmap_item->prev;
				391
				392	BUG_ON(prev_item->next != rmap_item);
				393	prev_item->next = next_item;
				394	if (next_item) {
				395	BUG_ON(next_item->prev != rmap_item);
				396	next_item->prev = rmap_item->prev;
				397	}
				398	}
				399
				400	rmap_item->next = NULL;
				401	ksm_pages_shared--;
				402
				403	} else if (rmap_item->address & NODE_FLAG) {
				404	unsigned char age;
				405	/*
				406	* ksm_thread can and must skip the rb_erase, because
				407	* root_unstable_tree was already reset to RB_ROOT.
				408	* But __ksm_exit has to be careful: do the rb_erase
				409	* if it's interrupting a scan, and this rmap_item was
				410	* inserted by this scan rather than left from before.
				411	*
				412	* Because of the case in which remove_mm_from_lists
				413	* increments seqnr before removing rmaps, unstable_nr
				414	* may even be 2 behind seqnr, but should never be
				415	* further behind. Yes, I did have trouble with this!
				416	*/
				417	age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
				418	BUG_ON(age > 2);
				419	if (!age)
				420	rb_erase(&rmap_item->node, &root_unstable_tree);
				421	}
				422
				423	rmap_item->address &= PAGE_MASK;
				424
				425	cond_resched(); /* we're called from many long loops */
				426	}
				427
				428	static void remove_all_slot_rmap_items(struct mm_slot *mm_slot)
				429	{
				430	struct rmap_item rmap_item, node;
				431
				432	list_for_each_entry_safe(rmap_item, node, &mm_slot->rmap_list, link) {
				433	remove_rmap_item_from_tree(rmap_item);
				434	list_del(&rmap_item->link);
				435	free_rmap_item(rmap_item);
				436	}
				437	}
				438
				439	static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
				440	struct list_head *cur)
				441	{
				442	struct rmap_item *rmap_item;
				443
				444	while (cur != &mm_slot->rmap_list) {
				445	rmap_item = list_entry(cur, struct rmap_item, link);
				446	cur = cur->next;
				447	remove_rmap_item_from_tree(rmap_item);
				448	list_del(&rmap_item->link);
				449	free_rmap_item(rmap_item);
				450	}
				451	}
				452
				453	/*
				454	* Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
				455	* than check every pte of a given vma, the locking doesn't quite work for
				456	* that - an rmap_item is assigned to the stable tree after inserting ksm
				457	* page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
				458	* rmap_items from parent to child at fork time (so as not to waste time
				459	* if exit comes before the next scan reaches it).
				460	*/
				461	static void unmerge_ksm_pages(struct vm_area_struct *vma,
				462	unsigned long start, unsigned long end)
				463	{
				464	unsigned long addr;
				465
				466	for (addr = start; addr < end; addr += PAGE_SIZE)
				467	break_ksm(vma, addr);
				468	}
				469
				470	static void unmerge_and_remove_all_rmap_items(void)
				471	{
				472	struct mm_slot *mm_slot;
				473	struct mm_struct *mm;
				474	struct vm_area_struct *vma;
				475
				476	list_for_each_entry(mm_slot, &ksm_mm_head.mm_list, mm_list) {
				477	mm = mm_slot->mm;
				478	down_read(&mm->mmap_sem);
				479	for (vma = mm->mmap; vma; vma = vma->vm_next) {
				480	if (!(vma->vm_flags & VM_MERGEABLE) \|\| !vma->anon_vma)
				481	continue;
				482	unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
				483	}
				484	remove_all_slot_rmap_items(mm_slot);
				485	up_read(&mm->mmap_sem);
				486	}
				487
				488	spin_lock(&ksm_mmlist_lock);
				489	if (ksm_scan.mm_slot != &ksm_mm_head) {
				490	ksm_scan.mm_slot = &ksm_mm_head;
				491	ksm_scan.seqnr++;
				492	}
				493	spin_unlock(&ksm_mmlist_lock);
				494	}
				495
				496	static void remove_mm_from_lists(struct mm_struct *mm)
				497	{
				498	struct mm_slot *mm_slot;
				499
				500	spin_lock(&ksm_mmlist_lock);
				501	mm_slot = get_mm_slot(mm);
				502
				503	/*
				504	* This mm_slot is always at the scanning cursor when we're
				505	* called from scan_get_next_rmap_item; but it's a special
				506	* case when we're called from __ksm_exit.
				507	*/
				508	if (ksm_scan.mm_slot == mm_slot) {
				509	ksm_scan.mm_slot = list_entry(
				510	mm_slot->mm_list.next, struct mm_slot, mm_list);
				511	ksm_scan.address = 0;
				512	ksm_scan.rmap_item = list_entry(
				513	&ksm_scan.mm_slot->rmap_list, struct rmap_item, link);
				514	if (ksm_scan.mm_slot == &ksm_mm_head)
				515	ksm_scan.seqnr++;
				516	}
				517
				518	hlist_del(&mm_slot->link);
				519	list_del(&mm_slot->mm_list);
				520	spin_unlock(&ksm_mmlist_lock);
				521
				522	remove_all_slot_rmap_items(mm_slot);
				523	free_mm_slot(mm_slot);
				524	clear_bit(MMF_VM_MERGEABLE, &mm->flags);
				525	}
				526
				527	static u32 calc_checksum(struct page *page)
				528	{
				529	u32 checksum;
				530	void *addr = kmap_atomic(page, KM_USER0);
				531	checksum = jhash2(addr, PAGE_SIZE / 4, 17);
				532	kunmap_atomic(addr, KM_USER0);
				533	return checksum;
				534	}
				535
				536	static int memcmp_pages(struct page page1, struct page page2)
				537	{
				538	char addr1, addr2;
				539	int ret;
				540
				541	addr1 = kmap_atomic(page1, KM_USER0);
				542	addr2 = kmap_atomic(page2, KM_USER1);
				543	ret = memcmp(addr1, addr2, PAGE_SIZE);
				544	kunmap_atomic(addr2, KM_USER1);
				545	kunmap_atomic(addr1, KM_USER0);
				546	return ret;
				547	}
				548
				549	static inline int pages_identical(struct page page1, struct page page2)
				550	{
				551	return !memcmp_pages(page1, page2);
				552	}
				553
				554	static int write_protect_page(struct vm_area_struct vma, struct page page,
				555	pte_t *orig_pte)
				556	{
				557	struct mm_struct *mm = vma->vm_mm;
				558	unsigned long addr;
				559	pte_t *ptep;
				560	spinlock_t *ptl;
				561	int swapped;
				562	int err = -EFAULT;
				563
				564	addr = page_address_in_vma(page, vma);
				565	if (addr == -EFAULT)
				566	goto out;
				567
				568	ptep = page_check_address(page, mm, addr, &ptl, 0);
				569	if (!ptep)
				570	goto out;
				571
				572	if (pte_write(*ptep)) {
				573	pte_t entry;
				574
				575	swapped = PageSwapCache(page);
				576	flush_cache_page(vma, addr, page_to_pfn(page));
				577	/*
				578	* Ok this is tricky, when get_user_pages_fast() run it doesnt
				579	* take any lock, therefore the check that we are going to make
				580	* with the pagecount against the mapcount is racey and
				581	* O_DIRECT can happen right after the check.
				582	* So we clear the pte and flush the tlb before the check
				583	* this assure us that no O_DIRECT can happen after the check
				584	* or in the middle of the check.
				585	*/
				586	entry = ptep_clear_flush(vma, addr, ptep);
				587	/*
				588	* Check that no O_DIRECT or similar I/O is in progress on the
				589	* page
				590	*/
				591	if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
				592	set_pte_at_notify(mm, addr, ptep, entry);
				593	goto out_unlock;
				594	}
				595	entry = pte_wrprotect(entry);
				596	set_pte_at_notify(mm, addr, ptep, entry);
				597	}
				598	orig_pte = ptep;
				599	err = 0;
				600
				601	out_unlock:
				602	pte_unmap_unlock(ptep, ptl);
				603	out:
				604	return err;
				605	}
				606
				607	/**
				608	* replace_page - replace page in vma by new ksm page
				609	* @vma: vma that holds the pte pointing to oldpage
				610	* @oldpage: the page we are replacing by newpage
				611	* @newpage: the ksm page we replace oldpage by
				612	* @orig_pte: the original value of the pte
				613	*
				614	* Returns 0 on success, -EFAULT on failure.
				615	*/
				616	static int replace_page(struct vm_area_struct vma, struct page oldpage,
				617	struct page *newpage, pte_t orig_pte)
				618	{
				619	struct mm_struct *mm = vma->vm_mm;
				620	pgd_t *pgd;
				621	pud_t *pud;
				622	pmd_t *pmd;
				623	pte_t *ptep;
				624	spinlock_t *ptl;
				625	unsigned long addr;
				626	pgprot_t prot;
				627	int err = -EFAULT;
				628
				629	prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
				630
				631	addr = page_address_in_vma(oldpage, vma);
				632	if (addr == -EFAULT)
				633	goto out;
				634
				635	pgd = pgd_offset(mm, addr);
				636	if (!pgd_present(*pgd))
				637	goto out;
				638
				639	pud = pud_offset(pgd, addr);
				640	if (!pud_present(*pud))
				641	goto out;
				642
				643	pmd = pmd_offset(pud, addr);
				644	if (!pmd_present(*pmd))
				645	goto out;
				646
				647	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
				648	if (!pte_same(*ptep, orig_pte)) {
				649	pte_unmap_unlock(ptep, ptl);
				650	goto out;
				651	}
				652
				653	get_page(newpage);
				654	page_add_ksm_rmap(newpage);
				655
				656	flush_cache_page(vma, addr, pte_pfn(*ptep));
				657	ptep_clear_flush(vma, addr, ptep);
				658	set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
				659
				660	page_remove_rmap(oldpage);
				661	put_page(oldpage);
				662
				663	pte_unmap_unlock(ptep, ptl);
				664	err = 0;
				665	out:
				666	return err;
				667	}
				668
				669	/*
				670	* try_to_merge_one_page - take two pages and merge them into one
				671	* @vma: the vma that hold the pte pointing into oldpage
				672	* @oldpage: the page that we want to replace with newpage
				673	* @newpage: the page that we want to map instead of oldpage
				674	*
				675	* Note:
				676	* oldpage should be a PageAnon page, while newpage should be a PageKsm page,
				677	* or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
				678	*
				679	* This function returns 0 if the pages were merged, -EFAULT otherwise.
				680	*/
				681	static int try_to_merge_one_page(struct vm_area_struct *vma,
				682	struct page *oldpage,
				683	struct page *newpage)
				684	{
				685	pte_t orig_pte = __pte(0);
				686	int err = -EFAULT;
				687
				688	if (!(vma->vm_flags & VM_MERGEABLE))
				689	goto out;
				690
				691	if (!PageAnon(oldpage))
				692	goto out;
				693
				694	get_page(newpage);
				695	get_page(oldpage);
				696
				697	/*
				698	* We need the page lock to read a stable PageSwapCache in
				699	* write_protect_page(). We use trylock_page() instead of
				700	* lock_page() because we don't want to wait here - we
				701	* prefer to continue scanning and merging different pages,
				702	* then come back to this page when it is unlocked.
				703	*/
				704	if (!trylock_page(oldpage))
				705	goto out_putpage;
				706	/*
				707	* If this anonymous page is mapped only here, its pte may need
				708	* to be write-protected. If it's mapped elsewhere, all of its
				709	* ptes are necessarily already write-protected. But in either
				710	* case, we need to lock and check page_count is not raised.
				711	*/
				712	if (write_protect_page(vma, oldpage, &orig_pte)) {
				713	unlock_page(oldpage);
				714	goto out_putpage;
				715	}
				716	unlock_page(oldpage);
				717
				718	if (pages_identical(oldpage, newpage))
				719	err = replace_page(vma, oldpage, newpage, orig_pte);
				720
				721	out_putpage:
				722	put_page(oldpage);
				723	put_page(newpage);
				724	out:
				725	return err;
				726	}
				727
				728	/*
				729	* try_to_merge_two_pages - take two identical pages and prepare them
				730	* to be merged into one page.
				731	*
				732	* This function returns 0 if we successfully mapped two identical pages
				733	* into one page, -EFAULT otherwise.
				734	*
				735	* Note that this function allocates a new kernel page: if one of the pages
				736	* is already a ksm page, try_to_merge_with_ksm_page should be used.
				737	*/
				738	static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
				739	struct page page1, struct mm_struct mm2,
				740	unsigned long addr2, struct page *page2)
				741	{
				742	struct vm_area_struct *vma;
				743	struct page *kpage;
				744	int err = -EFAULT;
				745
				746	/*
				747	* The number of nodes in the stable tree
				748	* is the number of kernel pages that we hold.
				749	*/
				750	if (ksm_max_kernel_pages &&
				751	ksm_max_kernel_pages <= ksm_kernel_pages_allocated)
				752	return err;
				753
				754	kpage = alloc_page(GFP_HIGHUSER);
				755	if (!kpage)
				756	return err;
				757
				758	down_read(&mm1->mmap_sem);
				759	vma = find_vma(mm1, addr1);
				760	if (!vma \|\| vma->vm_start > addr1) {
				761	put_page(kpage);
				762	up_read(&mm1->mmap_sem);
				763	return err;
				764	}
				765
				766	copy_user_highpage(kpage, page1, addr1, vma);
				767	err = try_to_merge_one_page(vma, page1, kpage);
				768	up_read(&mm1->mmap_sem);
				769
				770	if (!err) {
				771	down_read(&mm2->mmap_sem);
				772	vma = find_vma(mm2, addr2);
				773	if (!vma \|\| vma->vm_start > addr2) {
				774	put_page(kpage);
				775	up_read(&mm2->mmap_sem);
				776	break_cow(mm1, addr1);
				777	return -EFAULT;
				778	}
				779
				780	err = try_to_merge_one_page(vma, page2, kpage);
				781	up_read(&mm2->mmap_sem);
				782
				783	/*
				784	* If the second try_to_merge_one_page failed, we have a
				785	* ksm page with just one pte pointing to it, so break it.
				786	*/
				787	if (err)
				788	break_cow(mm1, addr1);
				789	else
				790	ksm_pages_shared += 2;
				791	}
				792
				793	put_page(kpage);
				794	return err;
				795	}
				796
				797	/*
				798	* try_to_merge_with_ksm_page - like try_to_merge_two_pages,
				799	* but no new kernel page is allocated: kpage must already be a ksm page.
				800	*/
				801	static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
				802	unsigned long addr1,
				803	struct page *page1,
				804	struct page *kpage)
				805	{
				806	struct vm_area_struct *vma;
				807	int err = -EFAULT;
				808
				809	down_read(&mm1->mmap_sem);
				810	vma = find_vma(mm1, addr1);
				811	if (!vma \|\| vma->vm_start > addr1) {
				812	up_read(&mm1->mmap_sem);
				813	return err;
				814	}
				815
				816	err = try_to_merge_one_page(vma, page1, kpage);
				817	up_read(&mm1->mmap_sem);
				818
				819	if (!err)
				820	ksm_pages_shared++;
				821
				822	return err;
				823	}
				824
				825	/*
				826	* stable_tree_search - search page inside the stable tree
				827	* @page: the page that we are searching identical pages to.
				828	* @page2: pointer into identical page that we are holding inside the stable
				829	* tree that we have found.
				830	* @rmap_item: the reverse mapping item
				831	*
				832	* This function checks if there is a page inside the stable tree
				833	* with identical content to the page that we are scanning right now.
				834	*
				835	* This function return rmap_item pointer to the identical item if found,
				836	* NULL otherwise.
				837	*/
				838	static struct rmap_item stable_tree_search(struct page page,
				839	struct page **page2,
				840	struct rmap_item *rmap_item)
				841	{
				842	struct rb_node *node = root_stable_tree.rb_node;
				843
				844	while (node) {
				845	struct rmap_item tree_rmap_item, next_rmap_item;
				846	int ret;
				847
				848	tree_rmap_item = rb_entry(node, struct rmap_item, node);
				849	while (tree_rmap_item) {
				850	BUG_ON(!in_stable_tree(tree_rmap_item));
				851	cond_resched();
				852	page2[0] = get_ksm_page(tree_rmap_item);
				853	if (page2[0])
				854	break;
				855	next_rmap_item = tree_rmap_item->next;
				856	remove_rmap_item_from_tree(tree_rmap_item);
				857	tree_rmap_item = next_rmap_item;
				858	}
				859	if (!tree_rmap_item)
				860	return NULL;
				861
				862	ret = memcmp_pages(page, page2[0]);
				863
				864	if (ret < 0) {
				865	put_page(page2[0]);
				866	node = node->rb_left;
				867	} else if (ret > 0) {
				868	put_page(page2[0]);
				869	node = node->rb_right;
				870	} else {
				871	return tree_rmap_item;
				872	}
				873	}
				874
				875	return NULL;
				876	}
				877
				878	/*
				879	* stable_tree_insert - insert rmap_item pointing to new ksm page
				880	* into the stable tree.
				881	*
				882	* @page: the page that we are searching identical page to inside the stable
				883	* tree.
				884	* @rmap_item: pointer to the reverse mapping item.
				885	*
				886	* This function returns rmap_item if success, NULL otherwise.
				887	*/
				888	static struct rmap_item stable_tree_insert(struct page page,
				889	struct rmap_item *rmap_item)
				890	{
				891	struct rb_node **new = &root_stable_tree.rb_node;
				892	struct rb_node *parent = NULL;
				893
				894	while (*new) {
				895	struct rmap_item tree_rmap_item, next_rmap_item;
				896	struct page *tree_page;
				897	int ret;
				898
				899	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				900	while (tree_rmap_item) {
				901	BUG_ON(!in_stable_tree(tree_rmap_item));
				902	cond_resched();
				903	tree_page = get_ksm_page(tree_rmap_item);
				904	if (tree_page)
				905	break;
				906	next_rmap_item = tree_rmap_item->next;
				907	remove_rmap_item_from_tree(tree_rmap_item);
				908	tree_rmap_item = next_rmap_item;
				909	}
				910	if (!tree_rmap_item)
				911	return NULL;
				912
				913	ret = memcmp_pages(page, tree_page);
				914	put_page(tree_page);
				915
				916	parent = *new;
				917	if (ret < 0)
				918	new = &parent->rb_left;
				919	else if (ret > 0)
				920	new = &parent->rb_right;
				921	else {
				922	/*
				923	* It is not a bug that stable_tree_search() didn't
				924	* find this node: because at that time our page was
				925	* not yet write-protected, so may have changed since.
				926	*/
				927	return NULL;
				928	}
				929	}
				930
				931	ksm_kernel_pages_allocated++;
				932
				933	rmap_item->address \|= NODE_FLAG \| STABLE_FLAG;
				934	rmap_item->next = NULL;
				935	rb_link_node(&rmap_item->node, parent, new);
				936	rb_insert_color(&rmap_item->node, &root_stable_tree);
				937
				938	return rmap_item;
				939	}
				940
				941	/*
				942	* unstable_tree_search_insert - search and insert items into the unstable tree.
				943	*
				944	* @page: the page that we are going to search for identical page or to insert
				945	* into the unstable tree
				946	* @page2: pointer into identical page that was found inside the unstable tree
				947	* @rmap_item: the reverse mapping item of page
				948	*
				949	* This function searches for a page in the unstable tree identical to the
				950	* page currently being scanned; and if no identical page is found in the
				951	* tree, we insert rmap_item as a new object into the unstable tree.
				952	*
				953	* This function returns pointer to rmap_item found to be identical
				954	* to the currently scanned page, NULL otherwise.
				955	*
				956	* This function does both searching and inserting, because they share
				957	* the same walking algorithm in an rbtree.
				958	*/
				959	static struct rmap_item unstable_tree_search_insert(struct page page,
				960	struct page **page2,
				961	struct rmap_item *rmap_item)
				962	{
				963	struct rb_node **new = &root_unstable_tree.rb_node;
				964	struct rb_node *parent = NULL;
				965
				966	while (*new) {
				967	struct rmap_item *tree_rmap_item;
				968	int ret;
				969
				970	tree_rmap_item = rb_entry(*new, struct rmap_item, node);
				971	page2[0] = get_mergeable_page(tree_rmap_item);
				972	if (!page2[0])
				973	return NULL;
				974
				975	/*
				976	* Don't substitute an unswappable ksm page
				977	* just for one good swappable forked page.
				978	*/
				979	if (page == page2[0]) {
				980	put_page(page2[0]);
				981	return NULL;
				982	}
				983
				984	ret = memcmp_pages(page, page2[0]);
				985
				986	parent = *new;
				987	if (ret < 0) {
				988	put_page(page2[0]);
				989	new = &parent->rb_left;
				990	} else if (ret > 0) {
				991	put_page(page2[0]);
				992	new = &parent->rb_right;
				993	} else {
				994	return tree_rmap_item;
				995	}
				996	}
				997
				998	rmap_item->address \|= NODE_FLAG;
				999	rmap_item->address \|= (ksm_scan.seqnr & SEQNR_MASK);
				1000	rb_link_node(&rmap_item->node, parent, new);
				1001	rb_insert_color(&rmap_item->node, &root_unstable_tree);
				1002
				1003	return NULL;
				1004	}
				1005
				1006	/*
				1007	* stable_tree_append - add another rmap_item to the linked list of
				1008	* rmap_items hanging off a given node of the stable tree, all sharing
				1009	* the same ksm page.
				1010	*/
				1011	static void stable_tree_append(struct rmap_item *rmap_item,
				1012	struct rmap_item *tree_rmap_item)
				1013	{
				1014	rmap_item->next = tree_rmap_item->next;
				1015	rmap_item->prev = tree_rmap_item;
				1016
				1017	if (tree_rmap_item->next)
				1018	tree_rmap_item->next->prev = rmap_item;
				1019
				1020	tree_rmap_item->next = rmap_item;
				1021	rmap_item->address \|= STABLE_FLAG;
				1022	}
				1023
				1024	/*
				1025	* cmp_and_merge_page - take a page computes its hash value and check if there
				1026	* is similar hash value to different page,
				1027	* in case we find that there is similar hash to different page we call to
				1028	* try_to_merge_two_pages().
				1029	*
				1030	* @page: the page that we are searching identical page to.
				1031	* @rmap_item: the reverse mapping into the virtual address of this page
				1032	*/
				1033	static void cmp_and_merge_page(struct page page, struct rmap_item rmap_item)
				1034	{
				1035	struct page *page2[1];
				1036	struct rmap_item *tree_rmap_item;
				1037	unsigned int checksum;
				1038	int err;
				1039
				1040	if (in_stable_tree(rmap_item))
				1041	remove_rmap_item_from_tree(rmap_item);
				1042
				1043	/* We first start with searching the page inside the stable tree */
				1044	tree_rmap_item = stable_tree_search(page, page2, rmap_item);
				1045	if (tree_rmap_item) {
				1046	if (page == page2[0]) { /* forked */
				1047	ksm_pages_shared++;
				1048	err = 0;
				1049	} else
				1050	err = try_to_merge_with_ksm_page(rmap_item->mm,
				1051	rmap_item->address,
				1052	page, page2[0]);
				1053	put_page(page2[0]);
				1054
				1055	if (!err) {
				1056	/*
				1057	* The page was successfully merged:
				1058	* add its rmap_item to the stable tree.
				1059	*/
				1060	stable_tree_append(rmap_item, tree_rmap_item);
				1061	}
				1062	return;
				1063	}
				1064
				1065	/*
				1066	* A ksm page might have got here by fork, but its other
				1067	* references have already been removed from the stable tree.
				1068	*/
				1069	if (PageKsm(page))
				1070	break_cow(rmap_item->mm, rmap_item->address);
				1071
				1072	/*
				1073	* In case the hash value of the page was changed from the last time we
				1074	* have calculated it, this page to be changed frequely, therefore we
				1075	* don't want to insert it to the unstable tree, and we don't want to
				1076	* waste our time to search if there is something identical to it there.
				1077	*/
				1078	checksum = calc_checksum(page);
				1079	if (rmap_item->oldchecksum != checksum) {
				1080	rmap_item->oldchecksum = checksum;
				1081	return;
				1082	}
				1083
				1084	tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
				1085	if (tree_rmap_item) {
				1086	err = try_to_merge_two_pages(rmap_item->mm,
				1087	rmap_item->address, page,
				1088	tree_rmap_item->mm,
				1089	tree_rmap_item->address, page2[0]);
				1090	/*
				1091	* As soon as we merge this page, we want to remove the
				1092	* rmap_item of the page we have merged with from the unstable
				1093	* tree, and insert it instead as new node in the stable tree.
				1094	*/
				1095	if (!err) {
				1096	rb_erase(&tree_rmap_item->node, &root_unstable_tree);
				1097	tree_rmap_item->address &= ~NODE_FLAG;
				1098	/*
				1099	* If we fail to insert the page into the stable tree,
				1100	* we will have 2 virtual addresses that are pointing
				1101	* to a ksm page left outside the stable tree,
				1102	* in which case we need to break_cow on both.
				1103	*/
				1104	if (stable_tree_insert(page2[0], tree_rmap_item))
				1105	stable_tree_append(rmap_item, tree_rmap_item);
				1106	else {
				1107	break_cow(tree_rmap_item->mm,
				1108	tree_rmap_item->address);
				1109	break_cow(rmap_item->mm, rmap_item->address);
				1110	ksm_pages_shared -= 2;
				1111	}
				1112	}
				1113
				1114	put_page(page2[0]);
				1115	}
				1116	}
				1117
				1118	static struct rmap_item get_next_rmap_item(struct mm_slot mm_slot,
				1119	struct list_head *cur,
				1120	unsigned long addr)
				1121	{
				1122	struct rmap_item *rmap_item;
				1123
				1124	while (cur != &mm_slot->rmap_list) {
				1125	rmap_item = list_entry(cur, struct rmap_item, link);
				1126	if ((rmap_item->address & PAGE_MASK) == addr) {
				1127	if (!in_stable_tree(rmap_item))
				1128	remove_rmap_item_from_tree(rmap_item);
				1129	return rmap_item;
				1130	}
				1131	if (rmap_item->address > addr)
				1132	break;
				1133	cur = cur->next;
				1134	remove_rmap_item_from_tree(rmap_item);
				1135	list_del(&rmap_item->link);
				1136	free_rmap_item(rmap_item);
				1137	}
				1138
				1139	rmap_item = alloc_rmap_item();
				1140	if (rmap_item) {
				1141	/* It has already been zeroed */
				1142	rmap_item->mm = mm_slot->mm;
				1143	rmap_item->address = addr;
				1144	list_add_tail(&rmap_item->link, cur);
				1145	}
				1146	return rmap_item;
				1147	}
				1148
				1149	static struct rmap_item scan_get_next_rmap_item(struct page *page)
				1150	{
				1151	struct mm_struct *mm;
				1152	struct mm_slot *slot;
				1153	struct vm_area_struct *vma;
				1154	struct rmap_item *rmap_item;
				1155
				1156	if (list_empty(&ksm_mm_head.mm_list))
				1157	return NULL;
				1158
				1159	slot = ksm_scan.mm_slot;
				1160	if (slot == &ksm_mm_head) {
				1161	root_unstable_tree = RB_ROOT;
				1162
				1163	spin_lock(&ksm_mmlist_lock);
				1164	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1165	ksm_scan.mm_slot = slot;
				1166	spin_unlock(&ksm_mmlist_lock);
				1167	next_mm:
				1168	ksm_scan.address = 0;
				1169	ksm_scan.rmap_item = list_entry(&slot->rmap_list,
				1170	struct rmap_item, link);
				1171	}
				1172
				1173	mm = slot->mm;
				1174	down_read(&mm->mmap_sem);
				1175	for (vma = find_vma(mm, ksm_scan.address); vma; vma = vma->vm_next) {
				1176	if (!(vma->vm_flags & VM_MERGEABLE))
				1177	continue;
				1178	if (ksm_scan.address < vma->vm_start)
				1179	ksm_scan.address = vma->vm_start;
				1180	if (!vma->anon_vma)
				1181	ksm_scan.address = vma->vm_end;
				1182
				1183	while (ksm_scan.address < vma->vm_end) {
				1184	*page = follow_page(vma, ksm_scan.address, FOLL_GET);
				1185	if (page && PageAnon(page)) {
				1186	flush_anon_page(vma, *page, ksm_scan.address);
				1187	flush_dcache_page(*page);
				1188	rmap_item = get_next_rmap_item(slot,
				1189	ksm_scan.rmap_item->link.next,
				1190	ksm_scan.address);
				1191	if (rmap_item) {
				1192	ksm_scan.rmap_item = rmap_item;
				1193	ksm_scan.address += PAGE_SIZE;
				1194	} else
				1195	put_page(*page);
				1196	up_read(&mm->mmap_sem);
				1197	return rmap_item;
				1198	}
				1199	if (*page)
				1200	put_page(*page);
				1201	ksm_scan.address += PAGE_SIZE;
				1202	cond_resched();
				1203	}
				1204	}
				1205
				1206	if (!ksm_scan.address) {
				1207	/*
				1208	* We've completed a full scan of all vmas, holding mmap_sem
				1209	* throughout, and found no VM_MERGEABLE: so do the same as
				1210	* __ksm_exit does to remove this mm from all our lists now.
				1211	*/
				1212	remove_mm_from_lists(mm);
				1213	up_read(&mm->mmap_sem);
				1214	slot = ksm_scan.mm_slot;
				1215	if (slot != &ksm_mm_head)
				1216	goto next_mm;
				1217	return NULL;
				1218	}
				1219
				1220	/*
				1221	* Nuke all the rmap_items that are above this current rmap:
				1222	* because there were no VM_MERGEABLE vmas with such addresses.
				1223	*/
				1224	remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
				1225	up_read(&mm->mmap_sem);
				1226
				1227	spin_lock(&ksm_mmlist_lock);
				1228	slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
				1229	ksm_scan.mm_slot = slot;
				1230	spin_unlock(&ksm_mmlist_lock);
				1231
				1232	/* Repeat until we've completed scanning the whole list */
				1233	if (slot != &ksm_mm_head)
				1234	goto next_mm;
				1235
				1236	/*
				1237	* Bump seqnr here rather than at top, so that __ksm_exit
				1238	* can skip rb_erase on unstable tree until we run again.
				1239	*/
				1240	ksm_scan.seqnr++;
				1241	return NULL;
				1242	}
				1243
				/**
				 * ksm_do_scan  - the ksm scanner main worker function.
				 * @scan_npages - number of pages we want to scan before we return.
				 *
				 * Stops early as soon as scan_get_next_rmap_item() returns no item.
				 */
				static void ksm_do_scan(unsigned int scan_npages)
				{
					struct rmap_item *rmap_item;
					struct page *page;

					while (scan_npages--) {
						cond_resched();
						rmap_item = scan_get_next_rmap_item(&page);
						if (!rmap_item)
							return;
						/* A ksm page already in the stable tree needs no work. */
						if (!PageKsm(page) || !in_stable_tree(rmap_item))
							cmp_and_merge_page(page, rmap_item);
						put_page(page);
					}
				}
				1263
				1264	static int ksm_scan_thread(void *nothing)
				1265	{
Izik Eidus	339aa62	2009-09-21 17:02:07 -0700	[diff] [blame^]	1266	set_user_nice(current, 5);
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1267
				1268	while (!kthread_should_stop()) {
				1269	if (ksm_run & KSM_RUN_MERGE) {
				1270	mutex_lock(&ksm_thread_mutex);
				1271	ksm_do_scan(ksm_thread_pages_to_scan);
				1272	mutex_unlock(&ksm_thread_mutex);
				1273	schedule_timeout_interruptible(
				1274	msecs_to_jiffies(ksm_thread_sleep_millisecs));
				1275	} else {
				1276	wait_event_interruptible(ksm_thread_wait,
				1277	(ksm_run & KSM_RUN_MERGE) \|\|
				1278	kthread_should_stop());
				1279	}
				1280	}
				1281	return 0;
				1282	}
				1283
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1284	int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
				1285	unsigned long end, int advice, unsigned long *vm_flags)
				1286	{
				1287	struct mm_struct *mm = vma->vm_mm;
				1288
				1289	switch (advice) {
				1290	case MADV_MERGEABLE:
				1291	/*
				1292	* Be somewhat over-protective for now!
				1293	*/
				1294	if (*vm_flags & (VM_MERGEABLE \| VM_SHARED \| VM_MAYSHARE \|
				1295	VM_PFNMAP \| VM_IO \| VM_DONTEXPAND \|
				1296	VM_RESERVED \| VM_HUGETLB \| VM_INSERTPAGE \|
				1297	VM_MIXEDMAP \| VM_SAO))
				1298	return 0; /* just ignore the advice */
				1299
				1300	if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
				1301	if (__ksm_enter(mm) < 0)
				1302	return -EAGAIN;
				1303
				1304	*vm_flags \|= VM_MERGEABLE;
				1305	break;
				1306
				1307	case MADV_UNMERGEABLE:
				1308	if (!(*vm_flags & VM_MERGEABLE))
				1309	return 0; /* just ignore the advice */
				1310
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1311	if (vma->anon_vma)
				1312	unmerge_ksm_pages(vma, start, end);
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1313
				1314	*vm_flags &= ~VM_MERGEABLE;
				1315	break;
				1316	}
				1317
				1318	return 0;
				1319	}
				1320
				1321	int __ksm_enter(struct mm_struct *mm)
				1322	{
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1323	struct mm_slot *mm_slot = alloc_mm_slot();
				1324	if (!mm_slot)
				1325	return -ENOMEM;
				1326
				1327	spin_lock(&ksm_mmlist_lock);
				1328	insert_to_mm_slots_hash(mm, mm_slot);
				1329	/*
				1330	* Insert just behind the scanning cursor, to let the area settle
				1331	* down a little; when fork is followed by immediate exec, we don't
				1332	* want ksmd to waste time setting up and tearing down an rmap_list.
				1333	*/
				1334	list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list);
				1335	spin_unlock(&ksm_mmlist_lock);
				1336
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1337	set_bit(MMF_VM_MERGEABLE, &mm->flags);
				1338	return 0;
				1339	}
				1340
				1341	void __ksm_exit(struct mm_struct *mm)
				1342	{
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1343	/*
				1344	* This process is exiting: doesn't hold and doesn't need mmap_sem;
				1345	* but we do need to exclude ksmd and other exiters while we modify
				1346	* the various lists and trees.
				1347	*/
				1348	mutex_lock(&ksm_thread_mutex);
				1349	remove_mm_from_lists(mm);
				1350	mutex_unlock(&ksm_thread_mutex);
Hugh Dickins	f8af4da	2009-09-21 17:01:57 -0700	[diff] [blame]	1351	}
Izik Eidus	31dbd01	2009-09-21 17:02:03 -0700	[diff] [blame]	1352
				/*
				 * Helpers declaring the sysfs attribute <name>_attr:
				 * KSM_ATTR_RO expands to __ATTR_RO(<name>);
				 * KSM_ATTR wires up <name>_show and <name>_store with mode 0644.
				 */
				#define KSM_ATTR_RO(_name) \
					static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
				#define KSM_ATTR(_name) \
					static struct kobj_attribute _name##_attr = \
						__ATTR(_name, 0644, _name##_show, _name##_store)
				1358
				1359	static ssize_t sleep_millisecs_show(struct kobject *kobj,
				1360	struct kobj_attribute attr, char buf)
				1361	{
				1362	return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
				1363	}
				1364
				1365	static ssize_t sleep_millisecs_store(struct kobject *kobj,
				1366	struct kobj_attribute *attr,
				1367	const char *buf, size_t count)
				1368	{
				1369	unsigned long msecs;
				1370	int err;
				1371
				1372	err = strict_strtoul(buf, 10, &msecs);
				1373	if (err \|\| msecs > UINT_MAX)
				1374	return -EINVAL;
				1375
				1376	ksm_thread_sleep_millisecs = msecs;
				1377
				1378	return count;
				1379	}
				1380	KSM_ATTR(sleep_millisecs);
				1381
				1382	static ssize_t pages_to_scan_show(struct kobject *kobj,
				1383	struct kobj_attribute attr, char buf)
				1384	{
				1385	return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
				1386	}
				1387
				1388	static ssize_t pages_to_scan_store(struct kobject *kobj,
				1389	struct kobj_attribute *attr,
				1390	const char *buf, size_t count)
				1391	{
				1392	int err;
				1393	unsigned long nr_pages;
				1394
				1395	err = strict_strtoul(buf, 10, &nr_pages);
				1396	if (err \|\| nr_pages > UINT_MAX)
				1397	return -EINVAL;
				1398
				1399	ksm_thread_pages_to_scan = nr_pages;
				1400
				1401	return count;
				1402	}
				1403	KSM_ATTR(pages_to_scan);
				1404
				1405	static ssize_t run_show(struct kobject kobj, struct kobj_attribute attr,
				1406	char *buf)
				1407	{
				1408	return sprintf(buf, "%u\n", ksm_run);
				1409	}
				1410
				1411	static ssize_t run_store(struct kobject kobj, struct kobj_attribute attr,
				1412	const char *buf, size_t count)
				1413	{
				1414	int err;
				1415	unsigned long flags;
				1416
				1417	err = strict_strtoul(buf, 10, &flags);
				1418	if (err \|\| flags > UINT_MAX)
				1419	return -EINVAL;
				1420	if (flags > KSM_RUN_UNMERGE)
				1421	return -EINVAL;
				1422
				1423	/*
				1424	* KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
				1425	* KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
				1426	* breaking COW to free the kernel_pages_allocated (but leaves
				1427	* mm_slots on the list for when ksmd may be set running again).
				1428	*/
				1429
				1430	mutex_lock(&ksm_thread_mutex);
				1431	if (ksm_run != flags) {
				1432	ksm_run = flags;
				1433	if (flags & KSM_RUN_UNMERGE)
				1434	unmerge_and_remove_all_rmap_items();
				1435	}
				1436	mutex_unlock(&ksm_thread_mutex);
				1437
				1438	if (flags & KSM_RUN_MERGE)
				1439	wake_up_interruptible(&ksm_thread_wait);
				1440
				1441	return count;
				1442	}
				1443	KSM_ATTR(run);
				1444
				1445	static ssize_t pages_shared_show(struct kobject *kobj,
				1446	struct kobj_attribute attr, char buf)
				1447	{
				1448	return sprintf(buf, "%lu\n",
				1449	ksm_pages_shared - ksm_kernel_pages_allocated);
				1450	}
				1451	KSM_ATTR_RO(pages_shared);
				1452
				1453	static ssize_t kernel_pages_allocated_show(struct kobject *kobj,
				1454	struct kobj_attribute *attr,
				1455	char *buf)
				1456	{
				1457	return sprintf(buf, "%lu\n", ksm_kernel_pages_allocated);
				1458	}
				1459	KSM_ATTR_RO(kernel_pages_allocated);
				1460
				1461	static ssize_t max_kernel_pages_store(struct kobject *kobj,
				1462	struct kobj_attribute *attr,
				1463	const char *buf, size_t count)
				1464	{
				1465	int err;
				1466	unsigned long nr_pages;
				1467
				1468	err = strict_strtoul(buf, 10, &nr_pages);
				1469	if (err)
				1470	return -EINVAL;
				1471
				1472	ksm_max_kernel_pages = nr_pages;
				1473
				1474	return count;
				1475	}
				1476
				1477	static ssize_t max_kernel_pages_show(struct kobject *kobj,
				1478	struct kobj_attribute attr, char buf)
				1479	{
				1480	return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
				1481	}
				1482	KSM_ATTR(max_kernel_pages);
				1483
				1484	static struct attribute *ksm_attrs[] = {
				1485	&sleep_millisecs_attr.attr,
				1486	&pages_to_scan_attr.attr,
				1487	&run_attr.attr,
				1488	&pages_shared_attr.attr,
				1489	&kernel_pages_allocated_attr.attr,
				1490	&max_kernel_pages_attr.attr,
				1491	NULL,
				1492	};
				1493
				1494	static struct attribute_group ksm_attr_group = {
				1495	.attrs = ksm_attrs,
				1496	.name = "ksm",
				1497	};
				1498
				1499	static int __init ksm_init(void)
				1500	{
				1501	struct task_struct *ksm_thread;
				1502	int err;
				1503
				1504	err = ksm_slab_init();
				1505	if (err)
				1506	goto out;
				1507
				1508	err = mm_slots_hash_init();
				1509	if (err)
				1510	goto out_free1;
				1511
				1512	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
				1513	if (IS_ERR(ksm_thread)) {
				1514	printk(KERN_ERR "ksm: creating kthread failed\n");
				1515	err = PTR_ERR(ksm_thread);
				1516	goto out_free2;
				1517	}
				1518
				1519	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
				1520	if (err) {
				1521	printk(KERN_ERR "ksm: register sysfs failed\n");
				1522	goto out_free3;
				1523	}
				1524
				1525	return 0;
				1526
				1527	out_free3:
				1528	kthread_stop(ksm_thread);
				1529	out_free2:
				1530	mm_slots_hash_free();
				1531	out_free1:
				1532	ksm_slab_free();
				1533	out:
				1534	return err;
				1535	}
				1536	module_init(ksm_init)