Blame - mm/memcontrol.c - kernel/msm-4.9

blob: ac8774426fec37095f6868abd715cfd191698d01 [file] [log] [blame]

Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	1	/* memcontrol.c - Memory Controller
				2	*
				3	* Copyright IBM Corporation, 2007
				4	* Author Balbir Singh <balbir@linux.vnet.ibm.com>
				5	*
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	6	* Copyright 2007 OpenVZ SWsoft Inc
				7	* Author: Pavel Emelianov <xemul@openvz.org>
				8	*
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	9	* This program is free software; you can redistribute it and/or modify
				10	* it under the terms of the GNU General Public License as published by
				11	* the Free Software Foundation; either version 2 of the License, or
				12	* (at your option) any later version.
				13	*
				14	* This program is distributed in the hope that it will be useful,
				15	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				16	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				17	* GNU General Public License for more details.
				18	*/
				19
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/err.h>

#include <asm/uaccess.h>
				33
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	34	struct cgroup_subsys mem_cgroup_subsys;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	35	static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	36
				37	/*
				38	* The memory controller data structure. The memory controller controls both
				39	* page cache and RSS per cgroup. We would eventually like to provide
				40	* statistics based on the statistics developed by Rik Van Riel for clock-pro,
				41	* to help the administrator determine what knobs to tune.
				42	*
				43	* TODO: Add a water mark for the memory controller. Reclaim will begin when
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	44	* we hit the water mark. May be even add a low water mark, such that
				45	* no reclaim occurs from a cgroup at it's low water mark, this is
				46	* a feature that will be implemented much later in the future.
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	47	*/
				48	struct mem_cgroup {
				49	struct cgroup_subsys_state css;
				50	/*
				51	* the counter to account for memory usage
				52	*/
				53	struct res_counter res;
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	54	/*
				55	* Per cgroup active and inactive list, similar to the
				56	* per zone LRU lists.
				57	* TODO: Consider making these lists per zone
				58	*/
				59	struct list_head active_list;
				60	struct list_head inactive_list;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	61	/*
				62	* spin_lock to protect the per cgroup LRU
				63	*/
				64	spinlock_t lru_lock;
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	65	unsigned long control_type; /* control RSS or RSS+Pagecache */
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	66	};
				67
				68	/*
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	69	* We use the lower bit of the page->page_cgroup pointer as a bit spin
				70	* lock. We need to ensure that page->page_cgroup is atleast two
				71	* byte aligned (based on comments from Nick Piggin)
				72	*/
				73	#define PAGE_CGROUP_LOCK_BIT 0x0
				74	#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
				75
				76	/*
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	77	* A page_cgroup page is associated with every page descriptor. The
				78	* page_cgroup helps us identify information about the cgroup
				79	*/
				80	struct page_cgroup {
				81	struct list_head lru; /* per cgroup LRU list */
				82	struct page *page;
				83	struct mem_cgroup *mem_cgroup;
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	84	atomic_t ref_cnt; /* Helpful when pages move b/w */
				85	/* mapped and cached states */
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	86	};
				87
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	88	enum {
				89	MEM_CGROUP_TYPE_UNSPEC = 0,
				90	MEM_CGROUP_TYPE_MAPPED,
				91	MEM_CGROUP_TYPE_CACHED,
				92	MEM_CGROUP_TYPE_ALL,
				93	MEM_CGROUP_TYPE_MAX,
				94	};
				95
				96	static struct mem_cgroup init_mem_cgroup;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	97
				98	static inline
				99	struct mem_cgroup mem_cgroup_from_cont(struct cgroup cont)
				100	{
				101	return container_of(cgroup_subsys_state(cont,
				102	mem_cgroup_subsys_id), struct mem_cgroup,
				103	css);
				104	}
				105
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	106	static inline
				107	struct mem_cgroup mem_cgroup_from_task(struct task_struct p)
				108	{
				109	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				110	struct mem_cgroup, css);
				111	}
				112
Balbir Singh	bed7161	2008-02-07 00:14:01 -0800	[diff] [blame]	113	inline struct mem_cgroup mm_cgroup(struct mm_struct mm)
				114	{
				115	return rcu_dereference(mm->mem_cgroup);
				116	}
				117
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	118	void mm_init_cgroup(struct mm_struct mm, struct task_struct p)
				119	{
				120	struct mem_cgroup *mem;
				121
				122	mem = mem_cgroup_from_task(p);
				123	css_get(&mem->css);
				124	mm->mem_cgroup = mem;
				125	}
				126
				127	void mm_free_cgroup(struct mm_struct *mm)
				128	{
				129	css_put(&mm->mem_cgroup->css);
				130	}
				131
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	132	static inline int page_cgroup_locked(struct page *page)
				133	{
				134	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
				135	&page->page_cgroup);
				136	}
				137
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	138	void page_assign_page_cgroup(struct page page, struct page_cgroup pc)
				139	{
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	140	int locked;
				141
				142	/*
				143	* While resetting the page_cgroup we might not hold the
				144	* page_cgroup lock. free_hot_cold_page() is an example
				145	* of such a scenario
				146	*/
				147	if (pc)
				148	VM_BUG_ON(!page_cgroup_locked(page));
				149	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
				150	page->page_cgroup = ((unsigned long)pc \| locked);
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	151	}
				152
				153	struct page_cgroup page_get_page_cgroup(struct page page)
				154	{
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	155	return (struct page_cgroup *)
				156	(page->page_cgroup & ~PAGE_CGROUP_LOCK);
				157	}
				158
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	159	static void __always_inline lock_page_cgroup(struct page *page)
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	160	{
				161	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
				162	VM_BUG_ON(!page_cgroup_locked(page));
				163	}
				164
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	165	static void __always_inline unlock_page_cgroup(struct page *page)
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	166	{
				167	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
				168	}
				169
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	170	static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	171	{
				172	if (active)
				173	list_move(&pc->lru, &pc->mem_cgroup->active_list);
				174	else
				175	list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
				176	}
				177
				178	/*
				179	* This routine assumes that the appropriate zone's lru lock is already held
				180	*/
				181	void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
				182	{
				183	struct mem_cgroup *mem;
				184	if (!pc)
				185	return;
				186
				187	mem = pc->mem_cgroup;
				188
				189	spin_lock(&mem->lru_lock);
				190	__mem_cgroup_move_lists(pc, active);
				191	spin_unlock(&mem->lru_lock);
				192	}
				193
				194	unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
				195	struct list_head *dst,
				196	unsigned long *scanned, int order,
				197	int mode, struct zone *z,
				198	struct mem_cgroup *mem_cont,
				199	int active)
				200	{
				201	unsigned long nr_taken = 0;
				202	struct page *page;
				203	unsigned long scan;
				204	LIST_HEAD(pc_list);
				205	struct list_head *src;
				206	struct page_cgroup *pc;
				207
				208	if (active)
				209	src = &mem_cont->active_list;
				210	else
				211	src = &mem_cont->inactive_list;
				212
				213	spin_lock(&mem_cont->lru_lock);
				214	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
				215	pc = list_entry(src->prev, struct page_cgroup, lru);
				216	page = pc->page;
				217	VM_BUG_ON(!pc);
				218
				219	if (PageActive(page) && !active) {
				220	__mem_cgroup_move_lists(pc, true);
				221	scan--;
				222	continue;
				223	}
				224	if (!PageActive(page) && active) {
				225	__mem_cgroup_move_lists(pc, false);
				226	scan--;
				227	continue;
				228	}
				229
				230	/*
				231	* Reclaim, per zone
				232	* TODO: make the active/inactive lists per zone
				233	*/
				234	if (page_zone(page) != z)
				235	continue;
				236
				237	/*
				238	* Check if the meta page went away from under us
				239	*/
				240	if (!list_empty(&pc->lru))
				241	list_move(&pc->lru, &pc_list);
				242	else
				243	continue;
				244
				245	if (__isolate_lru_page(page, mode) == 0) {
				246	list_move(&page->lru, dst);
				247	nr_taken++;
				248	}
				249	}
				250
				251	list_splice(&pc_list, src);
				252	spin_unlock(&mem_cont->lru_lock);
				253
				254	*scanned = scan;
				255	return nr_taken;
				256	}
				257
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	258	/*
				259	* Charge the memory controller for page usage.
				260	* Return
				261	* 0 if the charge was successful
				262	* < 0 if the cgroup is over its limit
				263	*/
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame^]	264	int mem_cgroup_charge(struct page page, struct mm_struct mm,
				265	gfp_t gfp_mask)
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	266	{
				267	struct mem_cgroup *mem;
				268	struct page_cgroup pc, race_pc;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	269	unsigned long flags;
				270	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	271
				272	/*
				273	* Should page_cgroup's go to their own slab?
				274	* One could optimize the performance of the charging routine
				275	* by saving a bit in the page_flags and using it as a lock
				276	* to see if the cgroup page already has a page_cgroup associated
				277	* with it
				278	*/
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	279	retry:
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	280	lock_page_cgroup(page);
				281	pc = page_get_page_cgroup(page);
				282	/*
				283	* The page_cgroup exists and the page has already been accounted
				284	*/
				285	if (pc) {
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	286	if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
				287	/* this page is under being uncharged ? */
				288	unlock_page_cgroup(page);
				289	cpu_relax();
				290	goto retry;
				291	} else
				292	goto done;
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	293	}
				294
				295	unlock_page_cgroup(page);
				296
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame^]	297	pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	298	if (pc == NULL)
				299	goto err;
				300
				301	rcu_read_lock();
				302	/*
				303	* We always charge the cgroup the mm_struct belongs to
				304	* the mm_struct's mem_cgroup changes on task migration if the
				305	* thread group leader migrates. It's possible that mm is not
				306	* set, if so charge the init_mm (happens for pagecache usage).
				307	*/
				308	if (!mm)
				309	mm = &init_mm;
				310
				311	mem = rcu_dereference(mm->mem_cgroup);
				312	/*
				313	* For every charge from the cgroup, increment reference
				314	* count
				315	*/
				316	css_get(&mem->css);
				317	rcu_read_unlock();
				318
				319	/*
				320	* If we created the page_cgroup, we should free it on exceeding
				321	* the cgroup limit.
				322	*/
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	323	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame^]	324	bool is_atomic = gfp_mask & GFP_ATOMIC;
				325	/*
				326	* We cannot reclaim under GFP_ATOMIC, fail the charge
				327	*/
				328	if (is_atomic)
				329	goto noreclaim;
				330
				331	if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	332	continue;
				333
				334	/*
				335	* try_to_free_mem_cgroup_pages() might not give us a full
				336	* picture of reclaim. Some pages are reclaimed and might be
				337	* moved to swap cache or just unmapped from the cgroup.
				338	* Check the limit again to see if the reclaim reduced the
				339	* current usage of the cgroup before giving up
				340	*/
				341	if (res_counter_check_under_limit(&mem->res))
				342	continue;
				343	/*
				344	* Since we control both RSS and cache, we end up with a
				345	* very interesting scenario where we end up reclaiming
				346	* memory (essentially RSS), since the memory is pushed
				347	* to swap cache, we eventually end up adding those
				348	* pages back to our list. Hence we give ourselves a
				349	* few chances before we fail
				350	*/
				351	else if (nr_retries--) {
				352	congestion_wait(WRITE, HZ/10);
				353	continue;
				354	}
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame^]	355	noreclaim:
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	356	css_put(&mem->css);
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame^]	357	if (!is_atomic)
				358	mem_cgroup_out_of_memory(mem, GFP_KERNEL);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	359	goto free_pc;
				360	}
				361
				362	lock_page_cgroup(page);
				363	/*
				364	* Check if somebody else beat us to allocating the page_cgroup
				365	*/
				366	race_pc = page_get_page_cgroup(page);
				367	if (race_pc) {
				368	kfree(pc);
				369	pc = race_pc;
				370	atomic_inc(&pc->ref_cnt);
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	371	res_counter_uncharge(&mem->res, PAGE_SIZE);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	372	css_put(&mem->css);
				373	goto done;
				374	}
				375
				376	atomic_set(&pc->ref_cnt, 1);
				377	pc->mem_cgroup = mem;
				378	pc->page = page;
				379	page_assign_page_cgroup(page, pc);
				380
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	381	spin_lock_irqsave(&mem->lru_lock, flags);
				382	list_add(&pc->lru, &mem->active_list);
				383	spin_unlock_irqrestore(&mem->lru_lock, flags);
				384
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	385	done:
				386	unlock_page_cgroup(page);
				387	return 0;
				388	free_pc:
				389	kfree(pc);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	390	err:
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	391	return -ENOMEM;
				392	}
				393
				394	/*
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	395	* See if the cached pages should be charged at all?
				396	*/
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame^]	397	int mem_cgroup_cache_charge(struct page page, struct mm_struct mm,
				398	gfp_t gfp_mask)
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	399	{
				400	struct mem_cgroup *mem;
				401	if (!mm)
				402	mm = &init_mm;
				403
				404	mem = rcu_dereference(mm->mem_cgroup);
				405	if (mem->control_type == MEM_CGROUP_TYPE_ALL)
Balbir Singh	e1a1cd5	2008-02-07 00:14:02 -0800	[diff] [blame^]	406	return mem_cgroup_charge(page, mm, gfp_mask);
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	407	else
				408	return 0;
				409	}
				410
				411	/*
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	412	* Uncharging is always a welcome operation, we never complain, simply
				413	* uncharge.
				414	*/
				415	void mem_cgroup_uncharge(struct page_cgroup *pc)
				416	{
				417	struct mem_cgroup *mem;
				418	struct page *page;
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	419	unsigned long flags;
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	420
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	421	/*
				422	* This can handle cases when a page is not charged at all and we
				423	* are switching between handling the control_type.
				424	*/
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	425	if (!pc)
				426	return;
				427
				428	if (atomic_dec_and_test(&pc->ref_cnt)) {
				429	page = pc->page;
				430	lock_page_cgroup(page);
				431	mem = pc->mem_cgroup;
				432	css_put(&mem->css);
				433	page_assign_page_cgroup(page, NULL);
				434	unlock_page_cgroup(page);
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	435	res_counter_uncharge(&mem->res, PAGE_SIZE);
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	436
				437	spin_lock_irqsave(&mem->lru_lock, flags);
				438	list_del_init(&pc->lru);
				439	spin_unlock_irqrestore(&mem->lru_lock, flags);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	440	kfree(pc);
				441	}
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	442	}
				443
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	444	int mem_cgroup_write_strategy(char buf, unsigned long long tmp)
				445	{
				446	*tmp = memparse(buf, &buf);
				447	if (*buf != '\0')
				448	return -EINVAL;
				449
				450	/*
				451	* Round up the value to the closest page size
				452	*/
				453	tmp = ((tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
				454	return 0;
				455	}
				456
				457	static ssize_t mem_cgroup_read(struct cgroup *cont,
				458	struct cftype cft, struct file file,
				459	char __user userbuf, size_t nbytes, loff_t ppos)
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	460	{
				461	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	462	cft->private, userbuf, nbytes, ppos,
				463	NULL);
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	464	}
				465
				466	static ssize_t mem_cgroup_write(struct cgroup cont, struct cftype cft,
				467	struct file file, const char __user userbuf,
				468	size_t nbytes, loff_t *ppos)
				469	{
				470	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	471	cft->private, userbuf, nbytes, ppos,
				472	mem_cgroup_write_strategy);
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	473	}
				474
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	475	static ssize_t mem_control_type_write(struct cgroup *cont,
				476	struct cftype cft, struct file file,
				477	const char __user *userbuf,
				478	size_t nbytes, loff_t *pos)
				479	{
				480	int ret;
				481	char buf, end;
				482	unsigned long tmp;
				483	struct mem_cgroup *mem;
				484
				485	mem = mem_cgroup_from_cont(cont);
				486	buf = kmalloc(nbytes + 1, GFP_KERNEL);
				487	ret = -ENOMEM;
				488	if (buf == NULL)
				489	goto out;
				490
				491	buf[nbytes] = 0;
				492	ret = -EFAULT;
				493	if (copy_from_user(buf, userbuf, nbytes))
				494	goto out_free;
				495
				496	ret = -EINVAL;
				497	tmp = simple_strtoul(buf, &end, 10);
				498	if (*end != '\0')
				499	goto out_free;
				500
				501	if (tmp <= MEM_CGROUP_TYPE_UNSPEC \|\| tmp >= MEM_CGROUP_TYPE_MAX)
				502	goto out_free;
				503
				504	mem->control_type = tmp;
				505	ret = nbytes;
				506	out_free:
				507	kfree(buf);
				508	out:
				509	return ret;
				510	}
				511
				512	static ssize_t mem_control_type_read(struct cgroup *cont,
				513	struct cftype *cft,
				514	struct file file, char __user userbuf,
				515	size_t nbytes, loff_t *ppos)
				516	{
				517	unsigned long val;
				518	char buf[64], *s;
				519	struct mem_cgroup *mem;
				520
				521	mem = mem_cgroup_from_cont(cont);
				522	s = buf;
				523	val = mem->control_type;
				524	s += sprintf(s, "%lu\n", val);
				525	return simple_read_from_buffer((void __user *)userbuf, nbytes,
				526	ppos, buf, s - buf);
				527	}
				528
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	529	static struct cftype mem_cgroup_files[] = {
				530	{
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	531	.name = "usage_in_bytes",
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	532	.private = RES_USAGE,
				533	.read = mem_cgroup_read,
				534	},
				535	{
Balbir Singh	0eea103	2008-02-07 00:13:57 -0800	[diff] [blame]	536	.name = "limit_in_bytes",
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	537	.private = RES_LIMIT,
				538	.write = mem_cgroup_write,
				539	.read = mem_cgroup_read,
				540	},
				541	{
				542	.name = "failcnt",
				543	.private = RES_FAILCNT,
				544	.read = mem_cgroup_read,
				545	},
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	546	{
				547	.name = "control_type",
				548	.write = mem_control_type_write,
				549	.read = mem_control_type_read,
				550	},
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	551	};
				552
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	553	static struct mem_cgroup init_mem_cgroup;
				554
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	555	static struct cgroup_subsys_state *
				556	mem_cgroup_create(struct cgroup_subsys ss, struct cgroup cont)
				557	{
				558	struct mem_cgroup *mem;
				559
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	560	if (unlikely((cont->parent) == NULL)) {
				561	mem = &init_mem_cgroup;
				562	init_mm.mem_cgroup = mem;
				563	} else
				564	mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
				565
				566	if (mem == NULL)
				567	return NULL;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	568
				569	res_counter_init(&mem->res);
Balbir Singh	8a9f3cc	2008-02-07 00:13:53 -0800	[diff] [blame]	570	INIT_LIST_HEAD(&mem->active_list);
				571	INIT_LIST_HEAD(&mem->inactive_list);
Balbir Singh	66e1707	2008-02-07 00:13:56 -0800	[diff] [blame]	572	spin_lock_init(&mem->lru_lock);
Balbir Singh	8697d33	2008-02-07 00:13:59 -0800	[diff] [blame]	573	mem->control_type = MEM_CGROUP_TYPE_ALL;
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	574	return &mem->css;
				575	}
				576
				577	static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				578	struct cgroup *cont)
				579	{
				580	kfree(mem_cgroup_from_cont(cont));
				581	}
				582
				583	static int mem_cgroup_populate(struct cgroup_subsys *ss,
				584	struct cgroup *cont)
				585	{
				586	return cgroup_add_files(cont, ss, mem_cgroup_files,
				587	ARRAY_SIZE(mem_cgroup_files));
				588	}
				589
Balbir Singh	67e465a	2008-02-07 00:13:54 -0800	[diff] [blame]	590	static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				591	struct cgroup *cont,
				592	struct cgroup *old_cont,
				593	struct task_struct *p)
				594	{
				595	struct mm_struct *mm;
				596	struct mem_cgroup mem, old_mem;
				597
				598	mm = get_task_mm(p);
				599	if (mm == NULL)
				600	return;
				601
				602	mem = mem_cgroup_from_cont(cont);
				603	old_mem = mem_cgroup_from_cont(old_cont);
				604
				605	if (mem == old_mem)
				606	goto out;
				607
				608	/*
				609	* Only thread group leaders are allowed to migrate, the mm_struct is
				610	* in effect owned by the leader
				611	*/
				612	if (p->tgid != p->pid)
				613	goto out;
				614
				615	css_get(&mem->css);
				616	rcu_assign_pointer(mm->mem_cgroup, mem);
				617	css_put(&old_mem->css);
				618
				619	out:
				620	mmput(mm);
				621	return;
				622	}
				623
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	624	struct cgroup_subsys mem_cgroup_subsys = {
				625	.name = "memory",
				626	.subsys_id = mem_cgroup_subsys_id,
				627	.create = mem_cgroup_create,
				628	.destroy = mem_cgroup_destroy,
				629	.populate = mem_cgroup_populate,
Balbir Singh	67e465a	2008-02-07 00:13:54 -0800	[diff] [blame]	630	.attach = mem_cgroup_move_task,
Pavel Emelianov	78fb746	2008-02-07 00:13:51 -0800	[diff] [blame]	631	.early_init = 1,
Balbir Singh	8cdea7c	2008-02-07 00:13:50 -0800	[diff] [blame]	632	};