/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>

struct cgroup_subsys mem_cgroup_subsys;
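/*
 * Number of times the charge path retries direct reclaim before giving
 * up and failing the charge (see the reclaim loop in mem_cgroup_charge()).
 */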
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 * TODO: Consider making these lists per zone
	 */
	struct list_head active_list;
	struct list_head inactive_list;
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t lru_lock;
};

/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)

/*
 * A page_cgroup page is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup.
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	atomic_t ref_cnt;		/* Helpful when pages move between */
					/* mapped and cached states */
};

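/*
 * Convert a cgroup or a task to the mem_cgroup that owns it, by looking up
 * the memory controller's cgroup_subsys_state and using container_of().
 */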
static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

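/*
 * Cache the owning task's mem_cgroup in the mm_struct and take a reference
 * on its css; the reference is dropped again by mm_free_cgroup() below.
 */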
void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_task(p);
	css_get(&mem->css);
	mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
	css_put(&mm->mem_cgroup->css);
}

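/*
 * Helpers for the bit spin lock kept in the low bit of page->page_cgroup:
 * test the lock, assign the pointer while preserving the lock bit, strip
 * the lock bit on lookup, and take/release the lock itself.
 */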
static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
					&page->page_cgroup);
}

void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	int locked;

	/*
	 * While resetting the page_cgroup we might not hold the
	 * page_cgroup lock. free_hot_cold_page() is an example
	 * of such a scenario
	 */
	if (pc)
		VM_BUG_ON(!page_cgroup_locked(page));
	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
	page->page_cgroup = ((unsigned long)pc | locked);
}

struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *)
		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

void __always_inline lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
	VM_BUG_ON(!page_cgroup_locked(page));
}

void __always_inline unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}

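/*
 * Move a page_cgroup between the active and inactive list of its cgroup.
 * The caller must hold the cgroup's lru_lock; mem_cgroup_move_lists()
 * below is the locked wrapper.
 */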
void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	if (active)
		list_move(&pc->lru, &pc->mem_cgroup->active_list);
	else
		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
}

/*
 * This routine assumes that the appropriate zone's lru lock is already held
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	struct mem_cgroup *mem;
	if (!pc)
		return;

	mem = pc->mem_cgroup;

	spin_lock(&mem->lru_lock);
	__mem_cgroup_move_lists(pc, active);
	spin_unlock(&mem->lru_lock);
}

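/*
 * Scan up to @nr_to_scan pages from the tail of the cgroup's active or
 * inactive list (chosen by @active) and isolate those that belong to
 * zone @z onto @dst. Pages found on the wrong list are moved to the
 * right one without counting against the scan. Returns the number of
 * pages taken and reports the number scanned through @scanned.
 */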
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc;

	if (active)
		src = &mem_cont->active_list;
	else
		src = &mem_cont->inactive_list;

	spin_lock(&mem_cont->lru_lock);
	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		pc = list_entry(src->prev, struct page_cgroup, lru);
		VM_BUG_ON(!pc);
		page = pc->page;

		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			scan--;
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			scan--;
			continue;
		}

		/*
		 * Reclaim, per zone
		 * TODO: make the active/inactive lists per zone
		 */
		if (page_zone(page) != z)
			continue;

		/*
		 * Check if the meta page went away from under us
		 */
		if (!list_empty(&pc->lru))
			list_move(&pc->lru, &pc_list);
		else
			continue;

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mem_cont->lru_lock);

	*scanned = scan;
	return nr_taken;
}

/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc, *race_pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/*
	 * Should page_cgroups go on their own slab?
	 * One could optimize the performance of the charging routine
	 * by saving a bit in the page_flags and using it as a lock
	 * to see if the cgroup page already has a page_cgroup associated
	 * with it.
	 */
retry:
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	/*
	 * The page_cgroup exists and the page has already been accounted.
	 */
	if (pc) {
		if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
			/* is this page in the middle of being uncharged? */
			unlock_page_cgroup(page);
			cpu_relax();
			goto retry;
		} else
			goto done;
	}

	unlock_page_cgroup(page);

	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
	if (pc == NULL)
		goto err;

	rcu_read_lock();
	/*
	 * We always charge the cgroup the mm_struct belongs to;
	 * the mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (!mm)
		mm = &init_mm;

	mem = rcu_dereference(mm->mem_cgroup);
	/*
	 * For every charge from the cgroup, increment reference
	 * count.
	 */
	css_get(&mem->css);
	rcu_read_unlock();

	/*
	 * If we created the page_cgroup, we should free it on exceeding
	 * the cgroup limit.
	 */
	while (res_counter_charge(&mem->res, 1)) {
		if (try_to_free_mem_cgroup_pages(mem))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;
		/*
		 * Since we control both RSS and cache, we end up with a
		 * very interesting scenario where we end up reclaiming
		 * memory (essentially RSS), since the memory is pushed
		 * to swap cache, we eventually end up adding those
		 * pages back to our list. Hence we give ourselves a
		 * few chances before we fail.
		 */
		else if (nr_retries--) {
			congestion_wait(WRITE, HZ/10);
			continue;
		}

		css_put(&mem->css);
		goto free_pc;
	}

	lock_page_cgroup(page);
	/*
	 * Check if somebody else beat us to allocating the page_cgroup
	 */
	race_pc = page_get_page_cgroup(page);
	if (race_pc) {
		kfree(pc);
		pc = race_pc;
		atomic_inc(&pc->ref_cnt);
		res_counter_uncharge(&mem->res, 1);
		css_put(&mem->css);
		goto done;
	}

	atomic_set(&pc->ref_cnt, 1);
	pc->mem_cgroup = mem;
	pc->page = page;
	page_assign_page_cgroup(page, pc);

	spin_lock_irqsave(&mem->lru_lock, flags);
	list_add(&pc->lru, &mem->active_list);
	spin_unlock_irqrestore(&mem->lru_lock, flags);

done:
	unlock_page_cgroup(page);
	return 0;
free_pc:
	kfree(pc);
err:
	return -ENOMEM;
}

/*
 * Uncharging is always a welcome operation; we never complain, we
 * simply uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
	struct mem_cgroup *mem;
	struct page *page;
	unsigned long flags;

	if (!pc)
		return;

	if (atomic_dec_and_test(&pc->ref_cnt)) {
		page = pc->page;
		lock_page_cgroup(page);
		mem = pc->mem_cgroup;
		css_put(&mem->css);
		page_assign_page_cgroup(page, NULL);
		unlock_page_cgroup(page);
		res_counter_uncharge(&mem->res, 1);

		spin_lock_irqsave(&mem->lru_lock, flags);
		list_del_init(&pc->lru);
		spin_unlock_irqrestore(&mem->lru_lock, flags);
		kfree(pc);
	}
}

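/*
 * Handlers for the cgroup control files. Reads and writes are forwarded
 * to the cgroup's res_counter; cft->private selects which member
 * (usage, limit or failcnt) is accessed.
 */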
static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
			struct file *file, char __user *userbuf, size_t nbytes,
			loff_t *ppos)
{
	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos);
}

static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage",
		.private = RES_USAGE,
		.read = mem_cgroup_read,
	},
	{
		.name = "limit",
		.private = RES_LIMIT,
		.write = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.read = mem_cgroup_read,
	},
};

static struct mem_cgroup init_mem_cgroup;

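/*
 * Create callback for the memory controller. The root cgroup uses the
 * statically allocated init_mem_cgroup (and points init_mm at it); all
 * other cgroups get a freshly allocated mem_cgroup.
 */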
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		init_mm.mem_cgroup = mem;
	} else
		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

	if (mem == NULL)
		return NULL;

	res_counter_init(&mem->res);
	INIT_LIST_HEAD(&mem->active_list);
	INIT_LIST_HEAD(&mem->inactive_list);
	spin_lock_init(&mem->lru_lock);
	return &mem->css;
}

static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	kfree(mem_cgroup_from_cont(cont));
}

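/*
 * Register the memory controller's control files when the cgroup
 * directory is populated.
 */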
static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
					ARRAY_SIZE(mem_cgroup_files));
}

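/*
 * Attach callback: when a thread group leader is moved to a different
 * cgroup, repoint its mm->mem_cgroup at the new cgroup and fix up the
 * css reference counts. Non-leader threads are ignored because the mm
 * is owned by the leader.
 */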
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	if (mem == old_mem)
		goto out;

	/*
	 * Only thread group leaders are allowed to migrate; the mm_struct
	 * is in effect owned by the leader.
	 */
	if (p->tgid != p->pid)
		goto out;

	css_get(&mem->css);
	rcu_assign_pointer(mm->mem_cgroup, mem);
	css_put(&old_mem->css);

out:
	mmput(mm);
	return;
}

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 1,
};