Blame - mm/memcontrol.c - kernel/msm-4.9

blob: 31c4f0cefdeef5eb2f71b855a37566c2c603ee4d [file] [log] [blame]

Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	1	/* memcontrol.c - Memory Controller
				2	*
				3	* Copyright IBM Corporation, 2007
				4	* Author Balbir Singh <balbir@linux.vnet.ibm.com>
				5	*
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	6	* Copyright 2007 OpenVZ SWsoft Inc
				7	* Author: Pavel Emelianov <xemul@openvz.org>
				8	*
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	9	* This program is free software; you can redistribute it and/or modify
				10	* it under the terms of the GNU General Public License as published by
				11	* the Free Software Foundation; either version 2 of the License, or
				12	* (at your option) any later version.
				13	*
				14	* This program is distributed in the hope that it will be useful,
				15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				17	* GNU General Public License for more details.
				18	*/
				19
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/err.h>

#include <asm/uaccess.h>
				33
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	34	struct cgroup_subsys mem_cgroup_subsys;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	35	static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	36
				37	/*
				38	* The memory controller data structure. The memory controller controls both
				39	* page cache and RSS per cgroup. We would eventually like to provide
				40	* statistics based on the statistics developed by Rik Van Riel for clock-pro,
				41	* to help the administrator determine what knobs to tune.
				42	*
				43	* TODO: Add a water mark for the memory controller. Reclaim will begin when
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	44	* we hit the water mark. May be even add a low water mark, such that
				45	* no reclaim occurs from a cgroup at it's low water mark, this is
				46	* a feature that will be implemented much later in the future.
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	47	*/
				48	struct mem_cgroup {
				49	struct cgroup_subsys_state css;
				50	/*
				51	* the counter to account for memory usage
				52	*/
				53	struct res_counter res;
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	54	/*
				55	* Per cgroup active and inactive list, similar to the
				56	* per zone LRU lists.
				57	* TODO: Consider making these lists per zone
				58	*/
				59	struct list_head active_list;
				60	struct list_head inactive_list;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	61	/*
				62	* spin_lock to protect the per cgroup LRU
				63	*/
				64	spinlock_t lru_lock;
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	65	unsigned long control_type; /* control RSS or RSS+Pagecache */
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	66	};
				67
				68	/*
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	69	* We use the lower bit of the page->page_cgroup pointer as a bit spin
				70	* lock. We need to ensure that page->page_cgroup is atleast two
				71	* byte aligned (based on comments from Nick Piggin)
				72	*/
				73	#define PAGE_CGROUP_LOCK_BIT 0x0
				74	#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
				75
				76	/*
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	77	* A page_cgroup page is associated with every page descriptor. The
				78	* page_cgroup helps us identify information about the cgroup
				79	*/
				80	struct page_cgroup {
				81	struct list_head lru; /* per cgroup LRU list */
				82	struct page *page;
				83	struct mem_cgroup *mem_cgroup;
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	84	atomic_t ref_cnt; /* Helpful when pages move b/w */
				85	/* mapped and cached states */
KAMEZAWA Hiroyuki	217bc31	2008-02-07 00:14:17 -0800	[diff] [blame]	86	int flags;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	87	};
KAMEZAWA Hiroyuki	217bc31	2008-02-07 00:14:17 -0800	[diff] [blame]	88	#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
KAMEZAWA Hiroyuki	3564c7c	2008-02-07 00:14:23 -0800	[diff] [blame^]	89	#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	90
/*
 * Values for mem_cgroup::control_type.  Only values strictly between
 * MEM_CGROUP_TYPE_UNSPEC and MEM_CGROUP_TYPE_MAX may be set by the user.
 */
enum {
	MEM_CGROUP_TYPE_UNSPEC = 0,
	MEM_CGROUP_TYPE_MAPPED,
	MEM_CGROUP_TYPE_CACHED,
	MEM_CGROUP_TYPE_ALL,
	MEM_CGROUP_TYPE_MAX,
};
				98
/* Tells mem_cgroup_charge_common() what kind of page is being charged. */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,	/* sets PAGE_CGROUP_FLAG_CACHE */
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
};
				103
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	104	static struct mem_cgroup init_mem_cgroup;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	105
				106	static inline
				107	struct mem_cgroup mem_cgroup_from_cont(struct cgroup cont)
				108	{
				109	return container_of(cgroup_subsys_state(cont,
				110	mem_cgroup_subsys_id), struct mem_cgroup,
				111	css);
				112	}
				113
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	114	static inline
				115	struct mem_cgroup mem_cgroup_from_task(struct task_struct p)
				116	{
				117	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				118	struct mem_cgroup, css);
				119	}
				120
				121	void mm_init_cgroup(struct mm_struct mm, struct task_struct p)
				122	{
				123	struct mem_cgroup *mem;
				124
				125	mem = mem_cgroup_from_task(p);
				126	css_get(&mem->css);
				127	mm->mem_cgroup = mem;
				128	}
				129
				130	void mm_free_cgroup(struct mm_struct *mm)
				131	{
				132	css_put(&mm->mem_cgroup->css);
				133	}
				134
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	135	static inline int page_cgroup_locked(struct page *page)
				136	{
				137	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
				138	&page->page_cgroup);
				139	}
				140
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	141	void page_assign_page_cgroup(struct page page, struct page_cgroup pc)
				142	{
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	143	int locked;
				144
				145	/*
				146	* While resetting the page_cgroup we might not hold the
				147	* page_cgroup lock. free_hot_cold_page() is an example
				148	* of such a scenario
				149	*/
				150	if (pc)
				151	VM_BUG_ON(!page_cgroup_locked(page));
				152	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
				153	page->page_cgroup = ((unsigned long)pc \| locked);
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	154	}
				155
				156	struct page_cgroup page_get_page_cgroup(struct page page)
				157	{
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	158	return (struct page_cgroup *)
				159	(page->page_cgroup & ~PAGE_CGROUP_LOCK);
				160	}
				161
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	162	static void __always_inline lock_page_cgroup(struct page *page)
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	163	{
				164	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
				165	VM_BUG_ON(!page_cgroup_locked(page));
				166	}
				167
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	168	static void __always_inline unlock_page_cgroup(struct page *page)
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	169	{
				170	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
				171	}
				172
/*
 * Tie a new page_cgroup to a struct page under lock_page_cgroup().
 *
 * Returns 0 on success, or 1 if the page is already tied to a
 * page_cgroup, in which case @pc is not installed.
 */
static inline int
page_cgroup_assign_new_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	int ret = 0;

	lock_page_cgroup(page);
	if (!page_get_page_cgroup(page))
		page_assign_page_cgroup(page, pc);
	else /* The page is already tied to another pc. */
		ret = 1;
	unlock_page_cgroup(page);
	return ret;
}
				191
				192	/*
				193	* Clear page->page_cgroup member under lock_page_cgroup().
				194	* If given "pc" value is different from one page->page_cgroup,
				195	* page->cgroup is not cleared.
				196	* Returns a value of page->page_cgroup at lock taken.
				197	* A can can detect failure of clearing by following
				198	* clear_page_cgroup(page, pc) == pc
				199	*/
				200
				201	static inline struct page_cgroup *
				202	clear_page_cgroup(struct page page, struct page_cgroup pc)
				203	{
				204	struct page_cgroup *ret;
				205	/* lock and clear */
				206	lock_page_cgroup(page);
				207	ret = page_get_page_cgroup(page);
				208	if (likely(ret == pc))
				209	page_assign_page_cgroup(page, NULL);
				210	unlock_page_cgroup(page);
				211	return ret;
				212	}
				213
				214
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	215	static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	216	{
KAMEZAWA Hiroyuki	3564c7c	2008-02-07 00:14:23 -0800	[diff] [blame^]	217	if (active) {
				218	pc->flags \|= PAGE_CGROUP_FLAG_ACTIVE;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	219	list_move(&pc->lru, &pc->mem_cgroup->active_list);
KAMEZAWA Hiroyuki	3564c7c	2008-02-07 00:14:23 -0800	[diff] [blame^]	220	} else {
				221	pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	222	list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
KAMEZAWA Hiroyuki	3564c7c	2008-02-07 00:14:23 -0800	[diff] [blame^]	223	}
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	224	}
				225
David Rientjes	4c4a221	2008-02-07 00:14:06 -0800	[diff] [blame]	226	int task_in_mem_cgroup(struct task_struct task, const struct mem_cgroup mem)
				227	{
				228	int ret;
				229
				230	task_lock(task);
				231	ret = task->mm && mm_cgroup(task->mm) == mem;
				232	task_unlock(task);
				233	return ret;
				234	}
				235
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	236	/*
				237	* This routine assumes that the appropriate zone's lru lock is already held
				238	*/
				239	void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
				240	{
				241	struct mem_cgroup *mem;
				242	if (!pc)
				243	return;
				244
				245	mem = pc->mem_cgroup;
				246
				247	spin_lock(&mem->lru_lock);
				248	__mem_cgroup_move_lists(pc, active);
				249	spin_unlock(&mem->lru_lock);
				250	}
				251
				252	unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
				253	struct list_head *dst,
				254	unsigned long *scanned, int order,
				255	int mode, struct zone *z,
				256	struct mem_cgroup *mem_cont,
				257	int active)
				258	{
				259	unsigned long nr_taken = 0;
				260	struct page *page;
				261	unsigned long scan;
				262	LIST_HEAD(pc_list);
				263	struct list_head *src;
KAMEZAWA Hiroyuki	ff7283f	2008-02-07 00:14:11 -0800	[diff] [blame]	264	struct page_cgroup pc, tmp;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	265
				266	if (active)
				267	src = &mem_cont->active_list;
				268	else
				269	src = &mem_cont->inactive_list;
				270
				271	spin_lock(&mem_cont->lru_lock);
KAMEZAWA Hiroyuki	ff7283f	2008-02-07 00:14:11 -0800	[diff] [blame]	272	scan = 0;
				273	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
Hugh Dickins	436c6541	2008-02-07 00:14:12 -0800	[diff] [blame]	274	if (scan >= nr_to_scan)
KAMEZAWA Hiroyuki	ff7283f	2008-02-07 00:14:11 -0800	[diff] [blame]	275	break;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	276	page = pc->page;
				277	VM_BUG_ON(!pc);
				278
Hugh Dickins	436c6541	2008-02-07 00:14:12 -0800	[diff] [blame]	279	if (unlikely(!PageLRU(page)))
KAMEZAWA Hiroyuki	ff7283f	2008-02-07 00:14:11 -0800	[diff] [blame]	280	continue;
KAMEZAWA Hiroyuki	ff7283f	2008-02-07 00:14:11 -0800	[diff] [blame]	281
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	282	if (PageActive(page) && !active) {
				283	__mem_cgroup_move_lists(pc, true);
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	284	continue;
				285	}
				286	if (!PageActive(page) && active) {
				287	__mem_cgroup_move_lists(pc, false);
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	288	continue;
				289	}
				290
				291	/*
				292	* Reclaim, per zone
				293	* TODO: make the active/inactive lists per zone
				294	*/
				295	if (page_zone(page) != z)
				296	continue;
				297
Hugh Dickins	436c6541	2008-02-07 00:14:12 -0800	[diff] [blame]	298	scan++;
				299	list_move(&pc->lru, &pc_list);
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	300
				301	if (__isolate_lru_page(page, mode) == 0) {
				302	list_move(&page->lru, dst);
				303	nr_taken++;
				304	}
				305	}
				306
				307	list_splice(&pc_list, src);
				308	spin_unlock(&mem_cont->lru_lock);
				309
				310	*scanned = scan;
				311	return nr_taken;
				312	}
				313
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	314	/*
				315	* Charge the memory controller for page usage.
				316	* Return
				317	* 0 if the charge was successful
				318	* < 0 if the cgroup is over its limit
				319	*/
KAMEZAWA Hiroyuki	217bc31	2008-02-07 00:14:17 -0800	[diff] [blame]	320	static int mem_cgroup_charge_common(struct page page, struct mm_struct mm,
				321	gfp_t gfp_mask, enum charge_type ctype)
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	322	{
				323	struct mem_cgroup *mem;
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	324	struct page_cgroup *pc;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	325	unsigned long flags;
				326	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	327
				328	/*
				329	* Should page_cgroup's go to their own slab?
				330	* One could optimize the performance of the charging routine
				331	* by saving a bit in the page_flags and using it as a lock
				332	* to see if the cgroup page already has a page_cgroup associated
				333	* with it
				334	*/
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	335	retry:
Hugh Dickins	8236955	2008-02-07 00:14:22 -0800	[diff] [blame]	336	if (page) {
				337	lock_page_cgroup(page);
				338	pc = page_get_page_cgroup(page);
				339	/*
				340	* The page_cgroup exists and
				341	* the page has already been accounted.
				342	*/
				343	if (pc) {
				344	if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
				345	/* this page is under being uncharged ? */
				346	unlock_page_cgroup(page);
				347	cpu_relax();
				348	goto retry;
				349	} else {
				350	unlock_page_cgroup(page);
				351	goto done;
				352	}
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	353	}
Hugh Dickins	8236955	2008-02-07 00:14:22 -0800	[diff] [blame]	354	unlock_page_cgroup(page);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	355	}
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	356
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame]	357	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	358	if (pc == NULL)
				359	goto err;
				360
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	361	/*
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	362	* We always charge the cgroup the mm_struct belongs to.
				363	* The mm_struct's mem_cgroup changes on task migration if the
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	364	* thread group leader migrates. It's possible that mm is not
				365	* set, if so charge the init_mm (happens for pagecache usage).
				366	*/
				367	if (!mm)
				368	mm = &init_mm;
				369
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	370	rcu_read_lock();
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	371	mem = rcu_dereference(mm->mem_cgroup);
				372	/*
				373	* For every charge from the cgroup, increment reference
				374	* count
				375	*/
				376	css_get(&mem->css);
				377	rcu_read_unlock();
				378
				379	/*
				380	* If we created the page_cgroup, we should free it on exceeding
				381	* the cgroup limit.
				382	*/
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	383	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	384	if (!(gfp_mask & __GFP_WAIT))
				385	goto out;
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame]	386
				387	if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	388	continue;
				389
				390	/*
				391	* try_to_free_mem_cgroup_pages() might not give us a full
				392	* picture of reclaim. Some pages are reclaimed and might be
				393	* moved to swap cache or just unmapped from the cgroup.
				394	* Check the limit again to see if the reclaim reduced the
				395	* current usage of the cgroup before giving up
				396	*/
				397	if (res_counter_check_under_limit(&mem->res))
				398	continue;
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	399
				400	if (!nr_retries--) {
				401	mem_cgroup_out_of_memory(mem, gfp_mask);
				402	goto out;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	403	}
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	404	congestion_wait(WRITE, HZ/10);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	405	}
				406
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	407	atomic_set(&pc->ref_cnt, 1);
				408	pc->mem_cgroup = mem;
				409	pc->page = page;
KAMEZAWA Hiroyuki	3564c7c	2008-02-07 00:14:23 -0800	[diff] [blame^]	410	pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
KAMEZAWA Hiroyuki	217bc31	2008-02-07 00:14:17 -0800	[diff] [blame]	411	if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
				412	pc->flags \|= PAGE_CGROUP_FLAG_CACHE;
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	413
Hugh Dickins	8236955	2008-02-07 00:14:22 -0800	[diff] [blame]	414	if (!page \|\| page_cgroup_assign_new_page_cgroup(page, pc)) {
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	415	/*
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	416	* Another charge has been added to this page already.
				417	* We take lock_page_cgroup(page) again and read
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	418	* page->cgroup, increment refcnt.... just retry is OK.
				419	*/
				420	res_counter_uncharge(&mem->res, PAGE_SIZE);
				421	css_put(&mem->css);
				422	kfree(pc);
Hugh Dickins	8236955	2008-02-07 00:14:22 -0800	[diff] [blame]	423	if (!page)
				424	goto done;
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	425	goto retry;
				426	}
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	427
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	428	spin_lock_irqsave(&mem->lru_lock, flags);
				429	list_add(&pc->lru, &mem->active_list);
				430	spin_unlock_irqrestore(&mem->lru_lock, flags);
				431
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	432	done:
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	433	return 0;
Hugh Dickins	3be9127	2008-02-07 00:14:19 -0800	[diff] [blame]	434	out:
				435	css_put(&mem->css);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	436	kfree(pc);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	437	err:
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	438	return -ENOMEM;
				439	}
				440
KAMEZAWA Hiroyuki	217bc31	2008-02-07 00:14:17 -0800	[diff] [blame]	441	int mem_cgroup_charge(struct page page, struct mm_struct mm,
				442	gfp_t gfp_mask)
				443	{
				444	return mem_cgroup_charge_common(page, mm, gfp_mask,
				445	MEM_CGROUP_CHARGE_TYPE_MAPPED);
				446	}
				447
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	448	/*
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	449	* See if the cached pages should be charged at all?
				450	*/
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame]	451	int mem_cgroup_cache_charge(struct page page, struct mm_struct mm,
				452	gfp_t gfp_mask)
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	453	{
Balbir Singh	ac44d35	2008-02-07 00:14:18 -0800	[diff] [blame]	454	int ret = 0;
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	455	struct mem_cgroup *mem;
				456	if (!mm)
				457	mm = &init_mm;
				458
Balbir Singh	ac44d35	2008-02-07 00:14:18 -0800	[diff] [blame]	459	rcu_read_lock();
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	460	mem = rcu_dereference(mm->mem_cgroup);
Balbir Singh	ac44d35	2008-02-07 00:14:18 -0800	[diff] [blame]	461	css_get(&mem->css);
				462	rcu_read_unlock();
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	463	if (mem->control_type == MEM_CGROUP_TYPE_ALL)
Balbir Singh	ac44d35	2008-02-07 00:14:18 -0800	[diff] [blame]	464	ret = mem_cgroup_charge_common(page, mm, gfp_mask,
KAMEZAWA Hiroyuki	217bc31	2008-02-07 00:14:17 -0800	[diff] [blame]	465	MEM_CGROUP_CHARGE_TYPE_CACHE);
Balbir Singh	ac44d35	2008-02-07 00:14:18 -0800	[diff] [blame]	466	css_put(&mem->css);
				467	return ret;
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	468	}
				469
				470	/*
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	471	* Uncharging is always a welcome operation, we never complain, simply
				472	* uncharge.
				473	*/
				474	void mem_cgroup_uncharge(struct page_cgroup *pc)
				475	{
				476	struct mem_cgroup *mem;
				477	struct page *page;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	478	unsigned long flags;
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	479
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	480	/*
				481	* This can handle cases when a page is not charged at all and we
				482	* are switching between handling the control_type.
				483	*/
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	484	if (!pc)
				485	return;
				486
				487	if (atomic_dec_and_test(&pc->ref_cnt)) {
				488	page = pc->page;
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	489	/*
				490	* get page->cgroup and clear it under lock.
KAMEZAWA Hiroyuki	cc84758	2008-02-07 00:14:16 -0800	[diff] [blame]	491	* force_empty can drop page->cgroup without checking refcnt.
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	492	*/
				493	if (clear_page_cgroup(page, pc) == pc) {
				494	mem = pc->mem_cgroup;
				495	css_put(&mem->css);
				496	res_counter_uncharge(&mem->res, PAGE_SIZE);
				497	spin_lock_irqsave(&mem->lru_lock, flags);
				498	list_del_init(&pc->lru);
				499	spin_unlock_irqrestore(&mem->lru_lock, flags);
				500	kfree(pc);
KAMEZAWA Hiroyuki	9175e03	2008-02-07 00:14:08 -0800	[diff] [blame]	501	}
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	502	}
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	503	}
KAMEZAWA Hiroyuki	ae41be3	2008-02-07 00:14:10 -0800	[diff] [blame]	504	/*
				505	* Returns non-zero if a page (under migration) has valid page_cgroup member.
				506	* Refcnt of page_cgroup is incremented.
				507	*/
				508
				509	int mem_cgroup_prepare_migration(struct page *page)
				510	{
				511	struct page_cgroup *pc;
				512	int ret = 0;
				513	lock_page_cgroup(page);
				514	pc = page_get_page_cgroup(page);
				515	if (pc && atomic_inc_not_zero(&pc->ref_cnt))
				516	ret = 1;
				517	unlock_page_cgroup(page);
				518	return ret;
				519	}
				520
				521	void mem_cgroup_end_migration(struct page *page)
				522	{
				523	struct page_cgroup *pc = page_get_page_cgroup(page);
				524	mem_cgroup_uncharge(pc);
				525	}
				526	/*
				527	* We know both page and newpage are now not-on-LRU and Pg_locked.
				528	* And no race with uncharge() routines because page_cgroup for page
				529	* has extra one reference by mem_cgroup_prepare_migration.
				530	*/
				531
				532	void mem_cgroup_page_migration(struct page page, struct page newpage)
				533	{
				534	struct page_cgroup *pc;
				535	retry:
				536	pc = page_get_page_cgroup(page);
				537	if (!pc)
				538	return;
				539	if (clear_page_cgroup(page, pc) != pc)
				540	goto retry;
				541	pc->page = newpage;
				542	lock_page_cgroup(newpage);
				543	page_assign_page_cgroup(newpage, pc);
				544	unlock_page_cgroup(newpage);
				545	return;
				546	}
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	547
KAMEZAWA Hiroyuki	cc84758	2008-02-07 00:14:16 -0800	[diff] [blame]	548	/*
				549	* This routine traverse page_cgroup in given list and drop them all.
				550	* This routine ignores page_cgroup->ref_cnt.
				551	* And this routine doesn't reclaim page itself, just removes page_cgroup.
				552	*/
				553	#define FORCE_UNCHARGE_BATCH (128)
				554	static void
				555	mem_cgroup_force_empty_list(struct mem_cgroup mem, struct list_head list)
				556	{
				557	struct page_cgroup *pc;
				558	struct page *page;
				559	int count;
				560	unsigned long flags;
				561
				562	retry:
				563	count = FORCE_UNCHARGE_BATCH;
				564	spin_lock_irqsave(&mem->lru_lock, flags);
				565
				566	while (--count && !list_empty(list)) {
				567	pc = list_entry(list->prev, struct page_cgroup, lru);
				568	page = pc->page;
				569	/* Avoid race with charge */
				570	atomic_set(&pc->ref_cnt, 0);
				571	if (clear_page_cgroup(page, pc) == pc) {
				572	css_put(&mem->css);
				573	res_counter_uncharge(&mem->res, PAGE_SIZE);
				574	list_del_init(&pc->lru);
				575	kfree(pc);
				576	} else /* being uncharged ? ...do relax */
				577	break;
				578	}
				579	spin_unlock_irqrestore(&mem->lru_lock, flags);
				580	if (!list_empty(list)) {
				581	cond_resched();
				582	goto retry;
				583	}
				584	return;
				585	}
				586
				587	/*
				588	* make mem_cgroup's charge to be 0 if there is no task.
				589	* This enables deleting this mem_cgroup.
				590	*/
				591
				592	int mem_cgroup_force_empty(struct mem_cgroup *mem)
				593	{
				594	int ret = -EBUSY;
				595	css_get(&mem->css);
				596	/*
				597	* page reclaim code (kswapd etc..) will move pages between
				598	` * active_list <-> inactive_list while we don't take a lock.
				599	* So, we have to do loop here until all lists are empty.
				600	*/
				601	while (!(list_empty(&mem->active_list) &&
				602	list_empty(&mem->inactive_list))) {
				603	if (atomic_read(&mem->css.cgroup->count) > 0)
				604	goto out;
				605	/* drop all page_cgroup in active_list */
				606	mem_cgroup_force_empty_list(mem, &mem->active_list);
				607	/* drop all page_cgroup in inactive_list */
				608	mem_cgroup_force_empty_list(mem, &mem->inactive_list);
				609	}
				610	ret = 0;
				611	out:
				612	css_put(&mem->css);
				613	return ret;
				614	}
				615
				616
				617
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	618	int mem_cgroup_write_strategy(char buf, unsigned long long tmp)
				619	{
				620	*tmp = memparse(buf, &buf);
				621	if (*buf != '\0')
				622	return -EINVAL;
				623
				624	/*
				625	* Round up the value to the closest page size
				626	*/
				627	tmp = ((tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
				628	return 0;
				629	}
				630
				631	static ssize_t mem_cgroup_read(struct cgroup *cont,
				632	struct cftype cft, struct file file,
				633	char __user userbuf, size_t nbytes, loff_t ppos)
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	634	{
				635	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	636	cft->private, userbuf, nbytes, ppos,
				637	NULL);
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	638	}
				639
				640	static ssize_t mem_cgroup_write(struct cgroup cont, struct cftype cft,
				641	struct file file, const char __user userbuf,
				642	size_t nbytes, loff_t *ppos)
				643	{
				644	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	645	cft->private, userbuf, nbytes, ppos,
				646	mem_cgroup_write_strategy);
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	647	}
				648
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	649	static ssize_t mem_control_type_write(struct cgroup *cont,
				650	struct cftype cft, struct file file,
				651	const char __user *userbuf,
				652	size_t nbytes, loff_t *pos)
				653	{
				654	int ret;
				655	char buf, end;
				656	unsigned long tmp;
				657	struct mem_cgroup *mem;
				658
				659	mem = mem_cgroup_from_cont(cont);
				660	buf = kmalloc(nbytes + 1, GFP_KERNEL);
				661	ret = -ENOMEM;
				662	if (buf == NULL)
				663	goto out;
				664
				665	buf[nbytes] = 0;
				666	ret = -EFAULT;
				667	if (copy_from_user(buf, userbuf, nbytes))
				668	goto out_free;
				669
				670	ret = -EINVAL;
				671	tmp = simple_strtoul(buf, &end, 10);
				672	if (*end != '\0')
				673	goto out_free;
				674
				675	if (tmp <= MEM_CGROUP_TYPE_UNSPEC \|\| tmp >= MEM_CGROUP_TYPE_MAX)
				676	goto out_free;
				677
				678	mem->control_type = tmp;
				679	ret = nbytes;
				680	out_free:
				681	kfree(buf);
				682	out:
				683	return ret;
				684	}
				685
				686	static ssize_t mem_control_type_read(struct cgroup *cont,
				687	struct cftype *cft,
				688	struct file file, char __user userbuf,
				689	size_t nbytes, loff_t *ppos)
				690	{
				691	unsigned long val;
				692	char buf[64], *s;
				693	struct mem_cgroup *mem;
				694
				695	mem = mem_cgroup_from_cont(cont);
				696	s = buf;
				697	val = mem->control_type;
				698	s += sprintf(s, "%lu\n", val);
				699	return simple_read_from_buffer((void __user *)userbuf, nbytes,
				700	ppos, buf, s - buf);
				701	}
				702
KAMEZAWA Hiroyuki	cc84758	2008-02-07 00:14:16 -0800	[diff] [blame]	703
				704	static ssize_t mem_force_empty_write(struct cgroup *cont,
				705	struct cftype cft, struct file file,
				706	const char __user *userbuf,
				707	size_t nbytes, loff_t *ppos)
				708	{
				709	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
				710	int ret;
				711	ret = mem_cgroup_force_empty(mem);
				712	if (!ret)
				713	ret = nbytes;
				714	return ret;
				715	}
				716
				717	/*
				718	* Note: This should be removed if cgroup supports write-only file.
				719	*/
				720
				721	static ssize_t mem_force_empty_read(struct cgroup *cont,
				722	struct cftype *cft,
				723	struct file file, char __user userbuf,
				724	size_t nbytes, loff_t *ppos)
				725	{
				726	return -EINVAL;
				727	}
				728
				729
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	730	static struct cftype mem_cgroup_files[] = {
				731	{
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	732	.name = "usage_in_bytes",
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	733	.private = RES_USAGE,
				734	.read = mem_cgroup_read,
				735	},
				736	{
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	737	.name = "limit_in_bytes",
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	738	.private = RES_LIMIT,
				739	.write = mem_cgroup_write,
				740	.read = mem_cgroup_read,
				741	},
				742	{
				743	.name = "failcnt",
				744	.private = RES_FAILCNT,
				745	.read = mem_cgroup_read,
				746	},
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	747	{
				748	.name = "control_type",
				749	.write = mem_control_type_write,
				750	.read = mem_control_type_read,
				751	},
KAMEZAWA Hiroyuki	cc84758	2008-02-07 00:14:16 -0800	[diff] [blame]	752	{
				753	.name = "force_empty",
				754	.write = mem_force_empty_write,
				755	.read = mem_force_empty_read,
				756	},
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	757	};
				758
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	759	static struct mem_cgroup init_mem_cgroup;
				760
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	761	static struct cgroup_subsys_state *
				762	mem_cgroup_create(struct cgroup_subsys ss, struct cgroup cont)
				763	{
				764	struct mem_cgroup *mem;
				765
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	766	if (unlikely((cont->parent) == NULL)) {
				767	mem = &init_mem_cgroup;
				768	init_mm.mem_cgroup = mem;
				769	} else
				770	mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
				771
				772	if (mem == NULL)
				773	return NULL;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	774
				775	res_counter_init(&mem->res);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	776	INIT_LIST_HEAD(&mem->active_list);
				777	INIT_LIST_HEAD(&mem->inactive_list);
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	778	spin_lock_init(&mem->lru_lock);
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	779	mem->control_type = MEM_CGROUP_TYPE_ALL;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	780	return &mem->css;
				781	}
				782
				783	static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				784	struct cgroup *cont)
				785	{
				786	kfree(mem_cgroup_from_cont(cont));
				787	}
				788
				789	static int mem_cgroup_populate(struct cgroup_subsys *ss,
				790	struct cgroup *cont)
				791	{
				792	return cgroup_add_files(cont, ss, mem_cgroup_files,
				793	ARRAY_SIZE(mem_cgroup_files));
				794	}
				795
Balbir Singh	67e465a	2008-02-07 00:13:54 -0800	[diff] [blame]	796	static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				797	struct cgroup *cont,
				798	struct cgroup *old_cont,
				799	struct task_struct *p)
				800	{
				801	struct mm_struct *mm;
				802	struct mem_cgroup mem, old_mem;
				803
				804	mm = get_task_mm(p);
				805	if (mm == NULL)
				806	return;
				807
				808	mem = mem_cgroup_from_cont(cont);
				809	old_mem = mem_cgroup_from_cont(old_cont);
				810
				811	if (mem == old_mem)
				812	goto out;
				813
				814	/*
				815	* Only thread group leaders are allowed to migrate, the mm_struct is
				816	* in effect owned by the leader
				817	*/
				818	if (p->tgid != p->pid)
				819	goto out;
				820
				821	css_get(&mem->css);
				822	rcu_assign_pointer(mm->mem_cgroup, mem);
				823	css_put(&old_mem->css);
				824
				825	out:
				826	mmput(mm);
				827	return;
				828	}
				829
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	830	struct cgroup_subsys mem_cgroup_subsys = {
				831	.name = "memory",
				832	.subsys_id = mem_cgroup_subsys_id,
				833	.create = mem_cgroup_create,
				834	.destroy = mem_cgroup_destroy,
				835	.populate = mem_cgroup_populate,
Balbir Singh	67e465a	2008-02-07 00:13:54 -0800	[diff] [blame]	836	.attach = mem_cgroup_move_task,
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	837	.early_init = 1,
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	838	};