Blame - kernel/cgroup_pids.c - kernel/msm-4.9

blob: 806cd7693ac88b0307173ab7bdaf1a8bc44ab173 [file] [log] [blame]

Aleksa Sarai	49b786e	2015-06-09 21:32:10 +1000	[diff] [blame]	1	/*
				2	* Process number limiting controller for cgroups.
				3	*
				4	* Used to allow a cgroup hierarchy to stop any new processes from fork()ing
				5	* after a certain limit is reached.
				6	*
				7	* Since it is trivial to hit the task limit without hitting any kmemcg limits
				8	* in place, PIDs are a fundamental resource. As such, PID exhaustion must be
				9	* preventable in the scope of a cgroup hierarchy by allowing resource limiting
				10	* of the number of tasks in a cgroup.
				11	*
				12	* In order to use the `pids` controller, set the maximum number of tasks in
				13	* pids.max (this is not available in the root cgroup for obvious reasons). The
				14	* number of processes currently in the cgroup is given by pids.current.
				15	* Organisational operations are not blocked by cgroup policies, so it is
				16	* possible to have pids.current > pids.max. However, it is not possible to
				17	* violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
				18	* would cause a cgroup policy to be violated.
				19	*
				20	* To set a cgroup to have no limit, set pids.max to "max". This is the default
				21	* for all new cgroups (N.B. that PID limits are hierarchical, so the most
				22	* stringent limit in the hierarchy is followed).
				23	*
				24	* pids.current tracks all child cgroup hierarchies, so parent/pids.current is
				25	* a superset of parent/child/pids.current.
				26	*
				27	* Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
				28	*
				29	* This file is subject to the terms and conditions of version 2 of the GNU
				30	* General Public License. See the file COPYING in the main directory of the
				31	* Linux distribution for more details.
				32	*/
				33
				34	#include <linux/kernel.h>
				35	#include <linux/threads.h>
				36	#include <linux/atomic.h>
				37	#include <linux/cgroup.h>
				38	#include <linux/slab.h>
				39
				40	#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
				41	#define PIDS_MAX_STR "max"
				42
				43	struct pids_cgroup {
				44	struct cgroup_subsys_state css;
				45
				46	/*
				47	* Use 64-bit types so that we can safely represent "max" as
				48	* %PIDS_MAX = (%PID_MAX_LIMIT + 1).
				49	*/
				50	atomic64_t counter;
				51	int64_t limit;
				52	};
				53
				54	static struct pids_cgroup css_pids(struct cgroup_subsys_state css)
				55	{
				56	return container_of(css, struct pids_cgroup, css);
				57	}
				58
				59	static struct pids_cgroup parent_pids(struct pids_cgroup pids)
				60	{
				61	return css_pids(pids->css.parent);
				62	}
				63
				64	static struct cgroup_subsys_state *
				65	pids_css_alloc(struct cgroup_subsys_state *parent)
				66	{
				67	struct pids_cgroup *pids;
				68
				69	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
				70	if (!pids)
				71	return ERR_PTR(-ENOMEM);
				72
				73	pids->limit = PIDS_MAX;
				74	atomic64_set(&pids->counter, 0);
				75	return &pids->css;
				76	}
				77
				78	static void pids_css_free(struct cgroup_subsys_state *css)
				79	{
				80	kfree(css_pids(css));
				81	}
				82
				83	/**
				84	* pids_cancel - uncharge the local pid count
				85	* @pids: the pid cgroup state
				86	* @num: the number of pids to cancel
				87	*
				88	* This function will WARN if the pid count goes under 0, because such a case is
				89	* a bug in the pids controller proper.
				90	*/
				91	static void pids_cancel(struct pids_cgroup *pids, int num)
				92	{
				93	/*
				94	* A negative count (or overflow for that matter) is invalid,
				95	* and indicates a bug in the `pids` controller proper.
				96	*/
				97	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
				98	}
				99
				100	/**
				101	* pids_uncharge - hierarchically uncharge the pid count
				102	* @pids: the pid cgroup state
				103	* @num: the number of pids to uncharge
				104	*/
				105	static void pids_uncharge(struct pids_cgroup *pids, int num)
				106	{
				107	struct pids_cgroup *p;
				108
				109	for (p = pids; p; p = parent_pids(p))
				110	pids_cancel(p, num);
				111	}
				112
				113	/**
				114	* pids_charge - hierarchically charge the pid count
				115	* @pids: the pid cgroup state
				116	* @num: the number of pids to charge
				117	*
				118	* This function does not follow the pid limit set. It cannot fail and the new
				119	* pid count may exceed the limit. This is only used for reverting failed
				120	* attaches, where there is no other way out than violating the limit.
				121	*/
				122	static void pids_charge(struct pids_cgroup *pids, int num)
				123	{
				124	struct pids_cgroup *p;
				125
				126	for (p = pids; p; p = parent_pids(p))
				127	atomic64_add(num, &p->counter);
				128	}
				129
				130	/**
				131	* pids_try_charge - hierarchically try to charge the pid count
				132	* @pids: the pid cgroup state
				133	* @num: the number of pids to charge
				134	*
				135	* This function follows the set limit. It will fail if the charge would cause
				136	* the new value to exceed the hierarchical limit. Returns 0 if the charge
				137	* succeded, otherwise -EAGAIN.
				138	*/
				139	static int pids_try_charge(struct pids_cgroup *pids, int num)
				140	{
				141	struct pids_cgroup p, q;
				142
				143	for (p = pids; p; p = parent_pids(p)) {
				144	int64_t new = atomic64_add_return(num, &p->counter);
				145
				146	/*
				147	* Since new is capped to the maximum number of pid_t, if
				148	* p->limit is %PIDS_MAX then we know that this test will never
				149	* fail.
				150	*/
				151	if (new > p->limit)
				152	goto revert;
				153	}
				154
				155	return 0;
				156
				157	revert:
				158	for (q = pids; q != p; q = parent_pids(q))
				159	pids_cancel(q, num);
				160	pids_cancel(p, num);
				161
				162	return -EAGAIN;
				163	}
				164
				165	static int pids_can_attach(struct cgroup_subsys_state *css,
				166	struct cgroup_taskset *tset)
				167	{
				168	struct pids_cgroup *pids = css_pids(css);
				169	struct task_struct *task;
				170
				171	cgroup_taskset_for_each(task, tset) {
				172	struct cgroup_subsys_state *old_css;
				173	struct pids_cgroup *old_pids;
				174
				175	/*
Aleksa Sarai	ce52399	2015-08-25 12:50:44 +1000	[diff] [blame^]	176	* No need to pin @old_css between here and cancel_attach()
				177	* because cgroup core protects it from being freed before
				178	* the migration completes or fails.
Aleksa Sarai	49b786e	2015-06-09 21:32:10 +1000	[diff] [blame]	179	*/
Aleksa Sarai	ce52399	2015-08-25 12:50:44 +1000	[diff] [blame^]	180	old_css = task_css(task, pids_cgrp_id);
Aleksa Sarai	49b786e	2015-06-09 21:32:10 +1000	[diff] [blame]	181	old_pids = css_pids(old_css);
				182
				183	pids_charge(pids, 1);
				184	pids_uncharge(old_pids, 1);
				185	}
				186
				187	return 0;
				188	}
				189
				190	static void pids_cancel_attach(struct cgroup_subsys_state *css,
				191	struct cgroup_taskset *tset)
				192	{
				193	struct pids_cgroup *pids = css_pids(css);
				194	struct task_struct *task;
				195
				196	cgroup_taskset_for_each(task, tset) {
				197	struct cgroup_subsys_state *old_css;
				198	struct pids_cgroup *old_pids;
				199
				200	old_css = task_css(task, pids_cgrp_id);
				201	old_pids = css_pids(old_css);
				202
				203	pids_charge(old_pids, 1);
				204	pids_uncharge(pids, 1);
Aleksa Sarai	49b786e	2015-06-09 21:32:10 +1000	[diff] [blame]	205	}
				206	}
				207
Aleksa Sarai	49b786e	2015-06-09 21:32:10 +1000	[diff] [blame]	208	static int pids_can_fork(struct task_struct task, void *priv_p)
				209	{
				210	struct cgroup_subsys_state *css;
				211	struct pids_cgroup *pids;
				212	int err;
				213
				214	/*
				215	* Use the "current" task_css for the pids subsystem as the tentative
				216	* css. It is possible we will charge the wrong hierarchy, in which
				217	* case we will forcefully revert/reapply the charge on the right
				218	* hierarchy after it is committed to the task proper.
				219	*/
				220	css = task_get_css(current, pids_cgrp_id);
				221	pids = css_pids(css);
				222
				223	err = pids_try_charge(pids, 1);
				224	if (err)
				225	goto err_css_put;
				226
				227	*priv_p = css;
				228	return 0;
				229
				230	err_css_put:
				231	css_put(css);
				232	return err;
				233	}
				234
				235	static void pids_cancel_fork(struct task_struct task, void priv)
				236	{
				237	struct cgroup_subsys_state *css = priv;
				238	struct pids_cgroup *pids = css_pids(css);
				239
				240	pids_uncharge(pids, 1);
				241	css_put(css);
				242	}
				243
				244	static void pids_fork(struct task_struct task, void priv)
				245	{
				246	struct cgroup_subsys_state *css;
				247	struct cgroup_subsys_state *old_css = priv;
				248	struct pids_cgroup *pids;
				249	struct pids_cgroup *old_pids = css_pids(old_css);
				250
				251	css = task_get_css(task, pids_cgrp_id);
				252	pids = css_pids(css);
				253
				254	/*
				255	* If the association has changed, we have to revert and reapply the
				256	* charge/uncharge on the wrong hierarchy to the current one. Since
				257	* the association can only change due to an organisation event, its
				258	* okay for us to ignore the limit in this case.
				259	*/
				260	if (pids != old_pids) {
				261	pids_uncharge(old_pids, 1);
				262	pids_charge(pids, 1);
				263	}
				264
				265	css_put(css);
				266	css_put(old_css);
				267	}
				268
				269	static void pids_exit(struct cgroup_subsys_state *css,
				270	struct cgroup_subsys_state *old_css,
				271	struct task_struct *task)
				272	{
				273	struct pids_cgroup *pids = css_pids(old_css);
				274
				275	pids_uncharge(pids, 1);
				276	}
				277
				278	static ssize_t pids_max_write(struct kernfs_open_file of, char buf,
				279	size_t nbytes, loff_t off)
				280	{
				281	struct cgroup_subsys_state *css = of_css(of);
				282	struct pids_cgroup *pids = css_pids(css);
				283	int64_t limit;
				284	int err;
				285
				286	buf = strstrip(buf);
				287	if (!strcmp(buf, PIDS_MAX_STR)) {
				288	limit = PIDS_MAX;
				289	goto set_limit;
				290	}
				291
				292	err = kstrtoll(buf, 0, &limit);
				293	if (err)
				294	return err;
				295
				296	if (limit < 0 \|\| limit >= PIDS_MAX)
				297	return -EINVAL;
				298
				299	set_limit:
				300	/*
				301	* Limit updates don't need to be mutex'd, since it isn't
				302	* critical that any racing fork()s follow the new limit.
				303	*/
				304	pids->limit = limit;
				305	return nbytes;
				306	}
				307
				308	static int pids_max_show(struct seq_file sf, void v)
				309	{
				310	struct cgroup_subsys_state *css = seq_css(sf);
				311	struct pids_cgroup *pids = css_pids(css);
				312	int64_t limit = pids->limit;
				313
				314	if (limit >= PIDS_MAX)
				315	seq_printf(sf, "%s\n", PIDS_MAX_STR);
				316	else
				317	seq_printf(sf, "%lld\n", limit);
				318
				319	return 0;
				320	}
				321
				322	static s64 pids_current_read(struct cgroup_subsys_state *css,
				323	struct cftype *cft)
				324	{
				325	struct pids_cgroup *pids = css_pids(css);
				326
				327	return atomic64_read(&pids->counter);
				328	}
				329
				330	static struct cftype pids_files[] = {
				331	{
				332	.name = "max",
				333	.write = pids_max_write,
				334	.seq_show = pids_max_show,
				335	.flags = CFTYPE_NOT_ON_ROOT,
				336	},
				337	{
				338	.name = "current",
				339	.read_s64 = pids_current_read,
				340	},
				341	{ } /* terminate */
				342	};
				343
				344	struct cgroup_subsys pids_cgrp_subsys = {
				345	.css_alloc = pids_css_alloc,
				346	.css_free = pids_css_free,
Aleksa Sarai	49b786e	2015-06-09 21:32:10 +1000	[diff] [blame]	347	.can_attach = pids_can_attach,
				348	.cancel_attach = pids_cancel_attach,
				349	.can_fork = pids_can_fork,
				350	.cancel_fork = pids_cancel_fork,
				351	.fork = pids_fork,
				352	.exit = pids_exit,
				353	.legacy_cftypes = pids_files,
				354	.dfl_cftypes = pids_files,
				355	};