Blame - kernel/cgroup.c - kernel/msm-4.9

blob: 7bb520aaf0a39bf630bcfc09ca22fde582a3acb3 [file] [log] [blame]

Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1	/*
				2	* kernel/cgroup.c
				3	*
				4	* Generic process-grouping system.
				5	*
				6	* Based originally on the cpuset system, extracted by Paul Menage
				7	* Copyright (C) 2006 Google, Inc
				8	*
				9	* Copyright notices from the original cpuset code:
				10	* --------------------------------------------------
				11	* Copyright (C) 2003 BULL SA.
				12	* Copyright (C) 2004-2006 Silicon Graphics, Inc.
				13	*
				14	* Portions derived from Patrick Mochel's sysfs code.
				15	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				16	*
				17	* 2003-10-10 Written by Simon Derr.
				18	* 2003-10-22 Updates by Stephen Hemminger.
				19	* 2004 May-July Rework by Paul Jackson.
				20	* ---------------------------------------------------
				21	*
				22	* This file is subject to the terms and conditions of the GNU General Public
				23	* License. See the file COPYING in the main directory of the Linux
				24	* distribution for more details.
				25	*/
				26
				27	#include <linux/cgroup.h>
				28	#include <linux/errno.h>
				29	#include <linux/fs.h>
				30	#include <linux/kernel.h>
				31	#include <linux/list.h>
				32	#include <linux/mm.h>
				33	#include <linux/mutex.h>
				34	#include <linux/mount.h>
				35	#include <linux/pagemap.h>
				36	#include <linux/rcupdate.h>
				37	#include <linux/sched.h>
				38	#include <linux/seq_file.h>
				39	#include <linux/slab.h>
				40	#include <linux/magic.h>
				41	#include <linux/spinlock.h>
				42	#include <linux/string.h>
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	43	#include <linux/sort.h>
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	44	#include <asm/atomic.h>
				45
				46	/* Generate an array of cgroup subsystem pointers */
				47	#define SUBSYS(_x) &_x ## _subsys,
				48
				49	static struct cgroup_subsys *subsys[] = {
				50	#include <linux/cgroup_subsys.h>
				51	};
				52
				53	/*
				54	* A cgroupfs_root represents the root of a cgroup hierarchy,
				55	* and may be associated with a superblock to form an active
				56	* hierarchy
				57	*/
				58	struct cgroupfs_root {
				59	struct super_block *sb;
				60
				61	/*
				62	* The bitmask of subsystems intended to be attached to this
				63	* hierarchy
				64	*/
				65	unsigned long subsys_bits;
				66
				67	/* The bitmask of subsystems currently attached to this hierarchy */
				68	unsigned long actual_subsys_bits;
				69
				70	/* A list running through the attached subsystems */
				71	struct list_head subsys_list;
				72
				73	/* The root cgroup for this hierarchy */
				74	struct cgroup top_cgroup;
				75
				76	/* Tracks how many cgroups are currently defined in hierarchy.*/
				77	int number_of_cgroups;
				78
				79	/* A list running through the mounted hierarchies */
				80	struct list_head root_list;
				81
				82	/* Hierarchy-specific flags */
				83	unsigned long flags;
				84	};
				85
				86
				87	/*
				88	* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
				89	* subsystems that are otherwise unattached - it never has more than a
				90	* single cgroup, and all tasks are part of that cgroup.
				91	*/
				92	static struct cgroupfs_root rootnode;
				93
				94	/* The list of hierarchy roots */
				95
				96	static LIST_HEAD(roots);
				97
				98	/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
				99	#define dummytop (&rootnode.top_cgroup)
				100
				101	/* This flag indicates whether tasks in the fork and exit paths should
				102	* take callback_mutex and check for fork/exit handlers to call. This
				103	* avoids us having to do extra work in the fork/exit path if none of the
				104	* subsystems need to be called.
				105	*/
				106	static int need_forkexit_callback;
				107
				108	/* bits in struct cgroup flags field */
				109	enum {
				110	CONT_REMOVED,
				111	};
				112
				113	/* convenient tests for these bits */
				114	inline int cgroup_is_removed(const struct cgroup *cont)
				115	{
				116	return test_bit(CONT_REMOVED, &cont->flags);
				117	}
				118
				119	/* bits in struct cgroupfs_root flags field */
				120	enum {
				121	ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
				122	};
				123
				124	/*
				125	* for_each_subsys() allows you to iterate on each subsystem attached to
				126	* an active hierarchy
				127	*/
				128	#define for_each_subsys(_root, _ss) \
				129	list_for_each_entry(_ss, &_root->subsys_list, sibling)
				130
				131	/* for_each_root() allows you to iterate across the active hierarchies */
				132	#define for_each_root(_root) \
				133	list_for_each_entry(_root, &roots, root_list)
				134
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame^]	135	/* Each task_struct has an embedded css_set, so the get/put
				136	* operation simply takes a reference count on all the cgroups
				137	* referenced by subsystems in this css_set. This can end up
				138	* multiple-counting some cgroups, but that's OK - the ref-count is
				139	* just a busy/not-busy indicator; ensuring that we only count each
				140	* cgroup once would require taking a global lock to ensure that no
				141	* subsystems moved between hierarchies while we were doing so.
				142	*
				143	* Possible TODO: decide at boot time based on the number of
				144	* registered subsystems and the number of CPUs or NUMA nodes whether
				145	* it's better for performance to ref-count every subsystem, or to
				146	* take a global lock and only add one ref count to each hierarchy.
				147	*/
				148	static void get_css_set(struct css_set *cg)
				149	{
				150	int i;
				151	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
				152	atomic_inc(&cg->subsys[i]->cgroup->count);
				153	}
				154
				155	static void put_css_set(struct css_set *cg)
				156	{
				157	int i;
				158	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
				159	atomic_dec(&cg->subsys[i]->cgroup->count);
				160	}
				161
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	162	/*
				163	* There is one global cgroup mutex. We also require taking
				164	* task_lock() when dereferencing a task's cgroup subsys pointers.
				165	* See "The task_lock() exception", at the end of this comment.
				166	*
				167	* A task must hold cgroup_mutex to modify cgroups.
				168	*
				169	* Any task can increment and decrement the count field without lock.
				170	* So in general, code holding cgroup_mutex can't rely on the count
				171	* field not changing. However, if the count goes to zero, then only
				172	* attach_task() can increment it again. Because a count of zero
				173	* means that no tasks are currently attached, therefore there is no
				174	* way a task attached to that cgroup can fork (the other way to
				175	* increment the count). So code holding cgroup_mutex can safely
				176	* assume that if the count is zero, it will stay zero. Similarly, if
				177	* a task holds cgroup_mutex on a cgroup with zero count, it
				178	* knows that the cgroup won't be removed, as cgroup_rmdir()
				179	* needs that mutex.
				180	*
				181	* The cgroup_common_file_write handler for operations that modify
				182	* the cgroup hierarchy holds cgroup_mutex across the entire operation,
				183	* single threading all such cgroup modifications across the system.
				184	*
				185	* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
				186	* (usually) take cgroup_mutex. These are the two most performance
				187	* critical pieces of code here. The exception occurs on cgroup_exit(),
				188	* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
				189	* is taken, and if the cgroup count is zero, a usermode call made
				190	* to /sbin/cgroup_release_agent with the name of the cgroup (path
				191	* relative to the root of cgroup file system) as the argument.
				192	*
				193	* A cgroup can only be deleted if both its 'count' of using tasks
				194	* is zero, and its list of 'children' cgroups is empty. Since all
				195	* tasks in the system use _some_ cgroup, and since there is always at
				196	* least one task in the system (init, pid == 1), therefore, top_cgroup
				197	* always has either children cgroups and/or using tasks. So we don't
				198	* need a special hack to ensure that top_cgroup cannot be deleted.
				199	*
				200	* The task_lock() exception
				201	*
				202	* The need for this exception arises from the action of
				203	* attach_task(), which overwrites one task's cgroup pointer with
				204	* another. It does so using cgroup_mutex, however there are
				205	* several performance critical places that need to reference
				206	* task->cgroup without the expense of grabbing a system global
				207	* mutex. Therefore except as noted below, when dereferencing or, as
				208	* in attach_task(), modifying a task's cgroup pointer we use
				209	* task_lock(), which acts on a spinlock (task->alloc_lock) already in
				210	* the task_struct routinely used for such matters.
				211	*
				212	* P.S. One more locking exception. RCU is used to guard the
				213	* update of a task's cgroup pointer by attach_task()
				214	*/
				215
				216	static DEFINE_MUTEX(cgroup_mutex);
				217
				218	/**
				219	* cgroup_lock - lock out any changes to cgroup structures
				220	*
				221	*/
				222
				223	void cgroup_lock(void)
				224	{
				225	mutex_lock(&cgroup_mutex);
				226	}
				227
				228	/**
				229	* cgroup_unlock - release lock on cgroup changes
				230	*
				231	* Undo the lock taken in a previous cgroup_lock() call.
				232	*/
				233
				234	void cgroup_unlock(void)
				235	{
				236	mutex_unlock(&cgroup_mutex);
				237	}
				238
				239	/*
				240	* A couple of forward declarations required, due to cyclic reference loop:
				241	* cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
				242	* cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
				243	* -> cgroup_mkdir.
				244	*/
				245
				246	static int cgroup_mkdir(struct inode dir, struct dentry dentry, int mode);
				247	static int cgroup_rmdir(struct inode unused_dir, struct dentry dentry);
				248	static int cgroup_populate_dir(struct cgroup *cont);
				249	static struct inode_operations cgroup_dir_inode_operations;
				250
				251	static struct inode cgroup_new_inode(mode_t mode, struct super_block sb)
				252	{
				253	struct inode *inode = new_inode(sb);
				254	static struct backing_dev_info cgroup_backing_dev_info = {
				255	.capabilities = BDI_CAP_NO_ACCT_DIRTY \| BDI_CAP_NO_WRITEBACK,
				256	};
				257
				258	if (inode) {
				259	inode->i_mode = mode;
				260	inode->i_uid = current->fsuid;
				261	inode->i_gid = current->fsgid;
				262	inode->i_blocks = 0;
				263	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
				264	inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
				265	}
				266	return inode;
				267	}
				268
				269	static void cgroup_diput(struct dentry dentry, struct inode inode)
				270	{
				271	/* is dentry a directory ? if so, kfree() associated cgroup */
				272	if (S_ISDIR(inode->i_mode)) {
				273	struct cgroup *cont = dentry->d_fsdata;
				274	BUG_ON(!(cgroup_is_removed(cont)));
				275	kfree(cont);
				276	}
				277	iput(inode);
				278	}
				279
				280	static void remove_dir(struct dentry *d)
				281	{
				282	struct dentry *parent = dget(d->d_parent);
				283
				284	d_delete(d);
				285	simple_rmdir(parent->d_inode, d);
				286	dput(parent);
				287	}
				288
				289	static void cgroup_clear_directory(struct dentry *dentry)
				290	{
				291	struct list_head *node;
				292
				293	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
				294	spin_lock(&dcache_lock);
				295	node = dentry->d_subdirs.next;
				296	while (node != &dentry->d_subdirs) {
				297	struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
				298	list_del_init(node);
				299	if (d->d_inode) {
				300	/* This should never be called on a cgroup
				301	* directory with child cgroups */
				302	BUG_ON(d->d_inode->i_mode & S_IFDIR);
				303	d = dget_locked(d);
				304	spin_unlock(&dcache_lock);
				305	d_delete(d);
				306	simple_unlink(dentry->d_inode, d);
				307	dput(d);
				308	spin_lock(&dcache_lock);
				309	}
				310	node = dentry->d_subdirs.next;
				311	}
				312	spin_unlock(&dcache_lock);
				313	}
				314
				315	/*
				316	* NOTE : the dentry must have been dget()'ed
				317	*/
				318	static void cgroup_d_remove_dir(struct dentry *dentry)
				319	{
				320	cgroup_clear_directory(dentry);
				321
				322	spin_lock(&dcache_lock);
				323	list_del_init(&dentry->d_u.d_child);
				324	spin_unlock(&dcache_lock);
				325	remove_dir(dentry);
				326	}
				327
				328	static int rebind_subsystems(struct cgroupfs_root *root,
				329	unsigned long final_bits)
				330	{
				331	unsigned long added_bits, removed_bits;
				332	struct cgroup *cont = &root->top_cgroup;
				333	int i;
				334
				335	removed_bits = root->actual_subsys_bits & ~final_bits;
				336	added_bits = final_bits & ~root->actual_subsys_bits;
				337	/* Check that any added subsystems are currently free */
				338	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				339	unsigned long long bit = 1ull << i;
				340	struct cgroup_subsys *ss = subsys[i];
				341	if (!(bit & added_bits))
				342	continue;
				343	if (ss->root != &rootnode) {
				344	/* Subsystem isn't free */
				345	return -EBUSY;
				346	}
				347	}
				348
				349	/* Currently we don't handle adding/removing subsystems when
				350	* any child cgroups exist. This is theoretically supportable
				351	* but involves complex error handling, so it's being left until
				352	* later */
				353	if (!list_empty(&cont->children))
				354	return -EBUSY;
				355
				356	/* Process each subsystem */
				357	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				358	struct cgroup_subsys *ss = subsys[i];
				359	unsigned long bit = 1UL << i;
				360	if (bit & added_bits) {
				361	/* We're binding this subsystem to this hierarchy */
				362	BUG_ON(cont->subsys[i]);
				363	BUG_ON(!dummytop->subsys[i]);
				364	BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
				365	cont->subsys[i] = dummytop->subsys[i];
				366	cont->subsys[i]->cgroup = cont;
				367	list_add(&ss->sibling, &root->subsys_list);
				368	rcu_assign_pointer(ss->root, root);
				369	if (ss->bind)
				370	ss->bind(ss, cont);
				371
				372	} else if (bit & removed_bits) {
				373	/* We're removing this subsystem */
				374	BUG_ON(cont->subsys[i] != dummytop->subsys[i]);
				375	BUG_ON(cont->subsys[i]->cgroup != cont);
				376	if (ss->bind)
				377	ss->bind(ss, dummytop);
				378	dummytop->subsys[i]->cgroup = dummytop;
				379	cont->subsys[i] = NULL;
				380	rcu_assign_pointer(subsys[i]->root, &rootnode);
				381	list_del(&ss->sibling);
				382	} else if (bit & final_bits) {
				383	/* Subsystem state should already exist */
				384	BUG_ON(!cont->subsys[i]);
				385	} else {
				386	/* Subsystem state shouldn't exist */
				387	BUG_ON(cont->subsys[i]);
				388	}
				389	}
				390	root->subsys_bits = root->actual_subsys_bits = final_bits;
				391	synchronize_rcu();
				392
				393	return 0;
				394	}
				395
				396	static int cgroup_show_options(struct seq_file seq, struct vfsmount vfs)
				397	{
				398	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
				399	struct cgroup_subsys *ss;
				400
				401	mutex_lock(&cgroup_mutex);
				402	for_each_subsys(root, ss)
				403	seq_printf(seq, ",%s", ss->name);
				404	if (test_bit(ROOT_NOPREFIX, &root->flags))
				405	seq_puts(seq, ",noprefix");
				406	mutex_unlock(&cgroup_mutex);
				407	return 0;
				408	}
				409
				410	struct cgroup_sb_opts {
				411	unsigned long subsys_bits;
				412	unsigned long flags;
				413	};
				414
				415	/* Convert a hierarchy specifier into a bitmask of subsystems and
				416	* flags. */
				417	static int parse_cgroupfs_options(char *data,
				418	struct cgroup_sb_opts *opts)
				419	{
				420	char token, o = data ?: "all";
				421
				422	opts->subsys_bits = 0;
				423	opts->flags = 0;
				424
				425	while ((token = strsep(&o, ",")) != NULL) {
				426	if (!*token)
				427	return -EINVAL;
				428	if (!strcmp(token, "all")) {
				429	opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
				430	} else if (!strcmp(token, "noprefix")) {
				431	set_bit(ROOT_NOPREFIX, &opts->flags);
				432	} else {
				433	struct cgroup_subsys *ss;
				434	int i;
				435	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				436	ss = subsys[i];
				437	if (!strcmp(token, ss->name)) {
				438	set_bit(i, &opts->subsys_bits);
				439	break;
				440	}
				441	}
				442	if (i == CGROUP_SUBSYS_COUNT)
				443	return -ENOENT;
				444	}
				445	}
				446
				447	/* We can't have an empty hierarchy */
				448	if (!opts->subsys_bits)
				449	return -EINVAL;
				450
				451	return 0;
				452	}
				453
				454	static int cgroup_remount(struct super_block sb, int flags, char *data)
				455	{
				456	int ret = 0;
				457	struct cgroupfs_root *root = sb->s_fs_info;
				458	struct cgroup *cont = &root->top_cgroup;
				459	struct cgroup_sb_opts opts;
				460
				461	mutex_lock(&cont->dentry->d_inode->i_mutex);
				462	mutex_lock(&cgroup_mutex);
				463
				464	/* See what subsystems are wanted */
				465	ret = parse_cgroupfs_options(data, &opts);
				466	if (ret)
				467	goto out_unlock;
				468
				469	/* Don't allow flags to change at remount */
				470	if (opts.flags != root->flags) {
				471	ret = -EINVAL;
				472	goto out_unlock;
				473	}
				474
				475	ret = rebind_subsystems(root, opts.subsys_bits);
				476
				477	/* (re)populate subsystem files */
				478	if (!ret)
				479	cgroup_populate_dir(cont);
				480
				481	out_unlock:
				482	mutex_unlock(&cgroup_mutex);
				483	mutex_unlock(&cont->dentry->d_inode->i_mutex);
				484	return ret;
				485	}
				486
				487	static struct super_operations cgroup_ops = {
				488	.statfs = simple_statfs,
				489	.drop_inode = generic_delete_inode,
				490	.show_options = cgroup_show_options,
				491	.remount_fs = cgroup_remount,
				492	};
				493
				494	static void init_cgroup_root(struct cgroupfs_root *root)
				495	{
				496	struct cgroup *cont = &root->top_cgroup;
				497	INIT_LIST_HEAD(&root->subsys_list);
				498	INIT_LIST_HEAD(&root->root_list);
				499	root->number_of_cgroups = 1;
				500	cont->root = root;
				501	cont->top_cgroup = cont;
				502	INIT_LIST_HEAD(&cont->sibling);
				503	INIT_LIST_HEAD(&cont->children);
				504	}
				505
				506	static int cgroup_test_super(struct super_block sb, void data)
				507	{
				508	struct cgroupfs_root *new = data;
				509	struct cgroupfs_root *root = sb->s_fs_info;
				510
				511	/* First check subsystems */
				512	if (new->subsys_bits != root->subsys_bits)
				513	return 0;
				514
				515	/* Next check flags */
				516	if (new->flags != root->flags)
				517	return 0;
				518
				519	return 1;
				520	}
				521
				522	static int cgroup_set_super(struct super_block sb, void data)
				523	{
				524	int ret;
				525	struct cgroupfs_root *root = data;
				526
				527	ret = set_anon_super(sb, NULL);
				528	if (ret)
				529	return ret;
				530
				531	sb->s_fs_info = root;
				532	root->sb = sb;
				533
				534	sb->s_blocksize = PAGE_CACHE_SIZE;
				535	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
				536	sb->s_magic = CGROUP_SUPER_MAGIC;
				537	sb->s_op = &cgroup_ops;
				538
				539	return 0;
				540	}
				541
				542	static int cgroup_get_rootdir(struct super_block *sb)
				543	{
				544	struct inode *inode =
				545	cgroup_new_inode(S_IFDIR \| S_IRUGO \| S_IXUGO \| S_IWUSR, sb);
				546	struct dentry *dentry;
				547
				548	if (!inode)
				549	return -ENOMEM;
				550
				551	inode->i_op = &simple_dir_inode_operations;
				552	inode->i_fop = &simple_dir_operations;
				553	inode->i_op = &cgroup_dir_inode_operations;
				554	/* directories start off with i_nlink == 2 (for "." entry) */
				555	inc_nlink(inode);
				556	dentry = d_alloc_root(inode);
				557	if (!dentry) {
				558	iput(inode);
				559	return -ENOMEM;
				560	}
				561	sb->s_root = dentry;
				562	return 0;
				563	}
				564
				565	static int cgroup_get_sb(struct file_system_type *fs_type,
				566	int flags, const char *unused_dev_name,
				567	void data, struct vfsmount mnt)
				568	{
				569	struct cgroup_sb_opts opts;
				570	int ret = 0;
				571	struct super_block *sb;
				572	struct cgroupfs_root *root;
				573
				574	/* First find the desired set of subsystems */
				575	ret = parse_cgroupfs_options(data, &opts);
				576	if (ret)
				577	return ret;
				578
				579	root = kzalloc(sizeof(*root), GFP_KERNEL);
				580	if (!root)
				581	return -ENOMEM;
				582
				583	init_cgroup_root(root);
				584	root->subsys_bits = opts.subsys_bits;
				585	root->flags = opts.flags;
				586
				587	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
				588
				589	if (IS_ERR(sb)) {
				590	kfree(root);
				591	return PTR_ERR(sb);
				592	}
				593
				594	if (sb->s_fs_info != root) {
				595	/* Reusing an existing superblock */
				596	BUG_ON(sb->s_root == NULL);
				597	kfree(root);
				598	root = NULL;
				599	} else {
				600	/* New superblock */
				601	struct cgroup *cont = &root->top_cgroup;
				602
				603	BUG_ON(sb->s_root != NULL);
				604
				605	ret = cgroup_get_rootdir(sb);
				606	if (ret)
				607	goto drop_new_super;
				608
				609	mutex_lock(&cgroup_mutex);
				610
				611	ret = rebind_subsystems(root, root->subsys_bits);
				612	if (ret == -EBUSY) {
				613	mutex_unlock(&cgroup_mutex);
				614	goto drop_new_super;
				615	}
				616
				617	/* EBUSY should be the only error here */
				618	BUG_ON(ret);
				619
				620	list_add(&root->root_list, &roots);
				621
				622	sb->s_root->d_fsdata = &root->top_cgroup;
				623	root->top_cgroup.dentry = sb->s_root;
				624
				625	BUG_ON(!list_empty(&cont->sibling));
				626	BUG_ON(!list_empty(&cont->children));
				627	BUG_ON(root->number_of_cgroups != 1);
				628
				629	/*
				630	* I believe that it's safe to nest i_mutex inside
				631	* cgroup_mutex in this case, since no-one else can
				632	* be accessing this directory yet. But we still need
				633	* to teach lockdep that this is the case - currently
				634	* a cgroupfs remount triggers a lockdep warning
				635	*/
				636	mutex_lock(&cont->dentry->d_inode->i_mutex);
				637	cgroup_populate_dir(cont);
				638	mutex_unlock(&cont->dentry->d_inode->i_mutex);
				639	mutex_unlock(&cgroup_mutex);
				640	}
				641
				642	return simple_set_mnt(mnt, sb);
				643
				644	drop_new_super:
				645	up_write(&sb->s_umount);
				646	deactivate_super(sb);
				647	return ret;
				648	}
				649
				650	static void cgroup_kill_sb(struct super_block *sb) {
				651	struct cgroupfs_root *root = sb->s_fs_info;
				652	struct cgroup *cont = &root->top_cgroup;
				653	int ret;
				654
				655	BUG_ON(!root);
				656
				657	BUG_ON(root->number_of_cgroups != 1);
				658	BUG_ON(!list_empty(&cont->children));
				659	BUG_ON(!list_empty(&cont->sibling));
				660
				661	mutex_lock(&cgroup_mutex);
				662
				663	/* Rebind all subsystems back to the default hierarchy */
				664	ret = rebind_subsystems(root, 0);
				665	/* Shouldn't be able to fail ... */
				666	BUG_ON(ret);
				667
				668	if (!list_empty(&root->root_list))
				669	list_del(&root->root_list);
				670	mutex_unlock(&cgroup_mutex);
				671
				672	kfree(root);
				673	kill_litter_super(sb);
				674	}
				675
				676	static struct file_system_type cgroup_fs_type = {
				677	.name = "cgroup",
				678	.get_sb = cgroup_get_sb,
				679	.kill_sb = cgroup_kill_sb,
				680	};
				681
				682	static inline struct cgroup __d_cont(struct dentry dentry)
				683	{
				684	return dentry->d_fsdata;
				685	}
				686
				687	static inline struct cftype __d_cft(struct dentry dentry)
				688	{
				689	return dentry->d_fsdata;
				690	}
				691
				692	/*
				693	* Called with cgroup_mutex held. Writes path of cgroup into buf.
				694	* Returns 0 on success, -errno on error.
				695	*/
				696	int cgroup_path(const struct cgroup cont, char buf, int buflen)
				697	{
				698	char *start;
				699
				700	if (cont == dummytop) {
				701	/*
				702	* Inactive subsystems have no dentry for their root
				703	* cgroup
				704	*/
				705	strcpy(buf, "/");
				706	return 0;
				707	}
				708
				709	start = buf + buflen;
				710
				711	*--start = '\0';
				712	for (;;) {
				713	int len = cont->dentry->d_name.len;
				714	if ((start -= len) < buf)
				715	return -ENAMETOOLONG;
				716	memcpy(start, cont->dentry->d_name.name, len);
				717	cont = cont->parent;
				718	if (!cont)
				719	break;
				720	if (!cont->parent)
				721	continue;
				722	if (--start < buf)
				723	return -ENAMETOOLONG;
				724	*start = '/';
				725	}
				726	memmove(buf, start, buf + buflen - start);
				727	return 0;
				728	}
				729
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	730	/*
				731	* Return the first subsystem attached to a cgroup's hierarchy, and
				732	* its subsystem id.
				733	*/
				734
				735	static void get_first_subsys(const struct cgroup *cont,
				736	struct cgroup_subsys_state *css, int subsys_id)
				737	{
				738	const struct cgroupfs_root *root = cont->root;
				739	const struct cgroup_subsys *test_ss;
				740	BUG_ON(list_empty(&root->subsys_list));
				741	test_ss = list_entry(root->subsys_list.next,
				742	struct cgroup_subsys, sibling);
				743	if (css) {
				744	*css = cont->subsys[test_ss->subsys_id];
				745	BUG_ON(!*css);
				746	}
				747	if (subsys_id)
				748	*subsys_id = test_ss->subsys_id;
				749	}
				750
				751	/*
				752	* Attach task 'tsk' to cgroup 'cont'
				753	*
				754	* Call holding cgroup_mutex. May take task_lock of
				755	* the task 'pid' during call.
				756	*/
				757	static int attach_task(struct cgroup cont, struct task_struct tsk)
				758	{
				759	int retval = 0;
				760	struct cgroup_subsys *ss;
				761	struct cgroup *oldcont;
				762	struct css_set *cg = &tsk->cgroups;
				763	struct cgroupfs_root *root = cont->root;
				764	int i;
				765	int subsys_id;
				766
				767	get_first_subsys(cont, NULL, &subsys_id);
				768
				769	/* Nothing to do if the task is already in that cgroup */
				770	oldcont = task_cgroup(tsk, subsys_id);
				771	if (cont == oldcont)
				772	return 0;
				773
				774	for_each_subsys(root, ss) {
				775	if (ss->can_attach) {
				776	retval = ss->can_attach(ss, cont, tsk);
				777	if (retval) {
				778	return retval;
				779	}
				780	}
				781	}
				782
				783	task_lock(tsk);
				784	if (tsk->flags & PF_EXITING) {
				785	task_unlock(tsk);
				786	return -ESRCH;
				787	}
				788	/* Update the css_set pointers for the subsystems in this
				789	* hierarchy */
				790	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				791	if (root->subsys_bits & (1ull << i)) {
				792	/* Subsystem is in this hierarchy. So we want
				793	* the subsystem state from the new
				794	* cgroup. Transfer the refcount from the
				795	* old to the new */
				796	atomic_inc(&cont->count);
				797	atomic_dec(&cg->subsys[i]->cgroup->count);
				798	rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
				799	}
				800	}
				801	task_unlock(tsk);
				802
				803	for_each_subsys(root, ss) {
				804	if (ss->attach) {
				805	ss->attach(ss, cont, oldcont, tsk);
				806	}
				807	}
				808
				809	synchronize_rcu();
				810	return 0;
				811	}
				812
				813	/*
				814	* Attach task with pid 'pid' to cgroup 'cont'. Call with
				815	* cgroup_mutex, may take task_lock of task
				816	*/
				817	static int attach_task_by_pid(struct cgroup cont, char pidbuf)
				818	{
				819	pid_t pid;
				820	struct task_struct *tsk;
				821	int ret;
				822
				823	if (sscanf(pidbuf, "%d", &pid) != 1)
				824	return -EIO;
				825
				826	if (pid) {
				827	rcu_read_lock();
				828	tsk = find_task_by_pid(pid);
				829	if (!tsk \|\| tsk->flags & PF_EXITING) {
				830	rcu_read_unlock();
				831	return -ESRCH;
				832	}
				833	get_task_struct(tsk);
				834	rcu_read_unlock();
				835
				836	if ((current->euid) && (current->euid != tsk->uid)
				837	&& (current->euid != tsk->suid)) {
				838	put_task_struct(tsk);
				839	return -EACCES;
				840	}
				841	} else {
				842	tsk = current;
				843	get_task_struct(tsk);
				844	}
				845
				846	ret = attach_task(cont, tsk);
				847	put_task_struct(tsk);
				848	return ret;
				849	}
				850
/* Kinds of files and directories found in a cgroup file system */

enum cgroup_filetype {
	FILE_ROOT = 0,
	FILE_DIR,
	FILE_TASKLIST,
};
				858
Paul Menage	355e0c4	2007-10-18 23:39:33 -0700	[diff] [blame]	859	static ssize_t cgroup_write_uint(struct cgroup cont, struct cftype cft,
				860	struct file *file,
				861	const char __user *userbuf,
				862	size_t nbytes, loff_t *unused_ppos)
				863	{
				864	char buffer[64];
				865	int retval = 0;
				866	u64 val;
				867	char *end;
				868
				869	if (!nbytes)
				870	return -EINVAL;
				871	if (nbytes >= sizeof(buffer))
				872	return -E2BIG;
				873	if (copy_from_user(buffer, userbuf, nbytes))
				874	return -EFAULT;
				875
				876	buffer[nbytes] = 0; /* nul-terminate */
				877
				878	/* strip newline if necessary */
				879	if (nbytes && (buffer[nbytes-1] == '\n'))
				880	buffer[nbytes-1] = 0;
				881	val = simple_strtoull(buffer, &end, 0);
				882	if (*end)
				883	return -EINVAL;
				884
				885	/* Pass to subsystem */
				886	retval = cft->write_uint(cont, cft, val);
				887	if (!retval)
				888	retval = nbytes;
				889	return retval;
				890	}
				891
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	892	static ssize_t cgroup_common_file_write(struct cgroup *cont,
				893	struct cftype *cft,
				894	struct file *file,
				895	const char __user *userbuf,
				896	size_t nbytes, loff_t *unused_ppos)
				897	{
				898	enum cgroup_filetype type = cft->private;
				899	char *buffer;
				900	int retval = 0;
				901
				902	if (nbytes >= PATH_MAX)
				903	return -E2BIG;
				904
				905	/* +1 for nul-terminator */
				906	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
				907	if (buffer == NULL)
				908	return -ENOMEM;
				909
				910	if (copy_from_user(buffer, userbuf, nbytes)) {
				911	retval = -EFAULT;
				912	goto out1;
				913	}
				914	buffer[nbytes] = 0; /* nul-terminate */
				915
				916	mutex_lock(&cgroup_mutex);
				917
				918	if (cgroup_is_removed(cont)) {
				919	retval = -ENODEV;
				920	goto out2;
				921	}
				922
				923	switch (type) {
				924	case FILE_TASKLIST:
				925	retval = attach_task_by_pid(cont, buffer);
				926	break;
				927	default:
				928	retval = -EINVAL;
				929	goto out2;
				930	}
				931
				932	if (retval == 0)
				933	retval = nbytes;
				934	out2:
				935	mutex_unlock(&cgroup_mutex);
				936	out1:
				937	kfree(buffer);
				938	return retval;
				939	}
				940
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	941	static ssize_t cgroup_file_write(struct file file, const char __user buf,
				942	size_t nbytes, loff_t *ppos)
				943	{
				944	struct cftype *cft = __d_cft(file->f_dentry);
				945	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
				946
				947	if (!cft)
				948	return -ENODEV;
Paul Menage	355e0c4	2007-10-18 23:39:33 -0700	[diff] [blame]	949	if (cft->write)
				950	return cft->write(cont, cft, file, buf, nbytes, ppos);
				951	if (cft->write_uint)
				952	return cgroup_write_uint(cont, cft, file, buf, nbytes, ppos);
				953	return -EINVAL;
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	954	}
				955
				956	static ssize_t cgroup_read_uint(struct cgroup cont, struct cftype cft,
				957	struct file *file,
				958	char __user *buf, size_t nbytes,
				959	loff_t *ppos)
				960	{
				961	char tmp[64];
				962	u64 val = cft->read_uint(cont, cft);
				963	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
				964
				965	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
				966	}
				967
				968	static ssize_t cgroup_file_read(struct file file, char __user buf,
				969	size_t nbytes, loff_t *ppos)
				970	{
				971	struct cftype *cft = __d_cft(file->f_dentry);
				972	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
				973
				974	if (!cft)
				975	return -ENODEV;
				976
				977	if (cft->read)
				978	return cft->read(cont, cft, file, buf, nbytes, ppos);
				979	if (cft->read_uint)
				980	return cgroup_read_uint(cont, cft, file, buf, nbytes, ppos);
				981	return -EINVAL;
				982	}
				983
				984	static int cgroup_file_open(struct inode inode, struct file file)
				985	{
				986	int err;
				987	struct cftype *cft;
				988
				989	err = generic_file_open(inode, file);
				990	if (err)
				991	return err;
				992
				993	cft = __d_cft(file->f_dentry);
				994	if (!cft)
				995	return -ENODEV;
				996	if (cft->open)
				997	err = cft->open(inode, file);
				998	else
				999	err = 0;
				1000
				1001	return err;
				1002	}
				1003
				1004	static int cgroup_file_release(struct inode inode, struct file file)
				1005	{
				1006	struct cftype *cft = __d_cft(file->f_dentry);
				1007	if (cft->release)
				1008	return cft->release(inode, file);
				1009	return 0;
				1010	}
				1011
				1012	/*
				1013	* cgroup_rename - Only allow simple rename of directories in place.
				1014	*/
				1015	static int cgroup_rename(struct inode old_dir, struct dentry old_dentry,
				1016	struct inode new_dir, struct dentry new_dentry)
				1017	{
				1018	if (!S_ISDIR(old_dentry->d_inode->i_mode))
				1019	return -ENOTDIR;
				1020	if (new_dentry->d_inode)
				1021	return -EEXIST;
				1022	if (old_dir != new_dir)
				1023	return -EIO;
				1024	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
				1025	}
				1026
				1027	static struct file_operations cgroup_file_operations = {
				1028	.read = cgroup_file_read,
				1029	.write = cgroup_file_write,
				1030	.llseek = generic_file_llseek,
				1031	.open = cgroup_file_open,
				1032	.release = cgroup_file_release,
				1033	};
				1034
				1035	static struct inode_operations cgroup_dir_inode_operations = {
				1036	.lookup = simple_lookup,
				1037	.mkdir = cgroup_mkdir,
				1038	.rmdir = cgroup_rmdir,
				1039	.rename = cgroup_rename,
				1040	};
				1041
				1042	static int cgroup_create_file(struct dentry *dentry, int mode,
				1043	struct super_block *sb)
				1044	{
				1045	static struct dentry_operations cgroup_dops = {
				1046	.d_iput = cgroup_diput,
				1047	};
				1048
				1049	struct inode *inode;
				1050
				1051	if (!dentry)
				1052	return -ENOENT;
				1053	if (dentry->d_inode)
				1054	return -EEXIST;
				1055
				1056	inode = cgroup_new_inode(mode, sb);
				1057	if (!inode)
				1058	return -ENOMEM;
				1059
				1060	if (S_ISDIR(mode)) {
				1061	inode->i_op = &cgroup_dir_inode_operations;
				1062	inode->i_fop = &simple_dir_operations;
				1063
				1064	/* start off with i_nlink == 2 (for "." entry) */
				1065	inc_nlink(inode);
				1066
				1067	/* start with the directory inode held, so that we can
				1068	* populate it without racing with another mkdir */
				1069	mutex_lock(&inode->i_mutex);
				1070	} else if (S_ISREG(mode)) {
				1071	inode->i_size = 0;
				1072	inode->i_fop = &cgroup_file_operations;
				1073	}
				1074	dentry->d_op = &cgroup_dops;
				1075	d_instantiate(dentry, inode);
				1076	dget(dentry); /* Extra count - pin the dentry in core */
				1077	return 0;
				1078	}
				1079
				1080	/*
				1081	* cgroup_create_dir - create a directory for an object.
				1082	* cont: the cgroup we create the directory for.
				1083	* It must have a valid ->parent field
				1084	* And we are going to fill its ->dentry field.
				1085	* dentry: dentry of the new container
				1086	* mode: mode to set on new directory.
				1087	*/
				1088	static int cgroup_create_dir(struct cgroup cont, struct dentry dentry,
				1089	int mode)
				1090	{
				1091	struct dentry *parent;
				1092	int error = 0;
				1093
				1094	parent = cont->parent->dentry;
				1095	error = cgroup_create_file(dentry, S_IFDIR \| mode, cont->root->sb);
				1096	if (!error) {
				1097	dentry->d_fsdata = cont;
				1098	inc_nlink(parent->d_inode);
				1099	cont->dentry = dentry;
				1100	dget(dentry);
				1101	}
				1102	dput(dentry);
				1103
				1104	return error;
				1105	}
				1106
				1107	int cgroup_add_file(struct cgroup *cont,
				1108	struct cgroup_subsys *subsys,
				1109	const struct cftype *cft)
				1110	{
				1111	struct dentry *dir = cont->dentry;
				1112	struct dentry *dentry;
				1113	int error;
				1114
				1115	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
				1116	if (subsys && !test_bit(ROOT_NOPREFIX, &cont->root->flags)) {
				1117	strcpy(name, subsys->name);
				1118	strcat(name, ".");
				1119	}
				1120	strcat(name, cft->name);
				1121	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
				1122	dentry = lookup_one_len(name, dir, strlen(name));
				1123	if (!IS_ERR(dentry)) {
				1124	error = cgroup_create_file(dentry, 0644 \| S_IFREG,
				1125	cont->root->sb);
				1126	if (!error)
				1127	dentry->d_fsdata = (void *)cft;
				1128	dput(dentry);
				1129	} else
				1130	error = PTR_ERR(dentry);
				1131	return error;
				1132	}
				1133
				1134	int cgroup_add_files(struct cgroup *cont,
				1135	struct cgroup_subsys *subsys,
				1136	const struct cftype cft[],
				1137	int count)
				1138	{
				1139	int i, err;
				1140	for (i = 0; i < count; i++) {
				1141	err = cgroup_add_file(cont, subsys, &cft[i]);
				1142	if (err)
				1143	return err;
				1144	}
				1145	return 0;
				1146	}
				1147
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1148	/* Count the number of tasks in a cgroup. Could be made more
				1149	* time-efficient but less space-efficient with more linked lists
				1150	* running through each cgroup and the css_set structures that
				1151	* referenced it. Must be called with tasklist_lock held for read or
				1152	* write or in an rcu critical section.
				1153	*/
				1154	int __cgroup_task_count(const struct cgroup *cont)
				1155	{
				1156	int count = 0;
				1157	struct task_struct g, p;
				1158	struct cgroup_subsys_state *css;
				1159	int subsys_id;
				1160
				1161	get_first_subsys(cont, &css, &subsys_id);
				1162	do_each_thread(g, p) {
				1163	if (task_subsys_state(p, subsys_id) == css)
				1164	count ++;
				1165	} while_each_thread(g, p);
				1166	return count;
				1167	}
				1168
				1169	/*
				1170	* Stuff for reading the 'tasks' file.
				1171	*
				1172	* Reading this file can return large amounts of data if a cgroup has
				1173	* lots of attached tasks. So it may need several calls to read(),
				1174	* but we cannot guarantee that the information we produce is correct
				1175	* unless we produce it entirely atomically.
				1176	*
				1177	* Upon tasks file open(), a struct ctr_struct is allocated, that
				1178	* will have a pointer to an array (also allocated here). The struct
				1179	* ctr_struct * is stored in file->private_data. Its resources will
				1180	* be freed by release() when the file is closed. The array is used
				1181	* to sprintf the PIDs and then used by read().
				1182	*/
				1183	struct ctr_struct {
				1184	char *buf;
				1185	int bufsz;
				1186	};
				1187
				1188	/*
				1189	* Load into 'pidarray' up to 'npids' of the tasks using cgroup
				1190	* 'cont'. Return actual number of pids loaded. No need to
				1191	* task_lock(p) when reading out p->cgroup, since we're in an RCU
				1192	* read section, so the css_set can't go away, and is
				1193	* immutable after creation.
				1194	*/
				1195	static int pid_array_load(pid_t pidarray, int npids, struct cgroup cont)
				1196	{
				1197	int n = 0;
				1198	struct task_struct g, p;
				1199	struct cgroup_subsys_state *css;
				1200	int subsys_id;
				1201
				1202	get_first_subsys(cont, &css, &subsys_id);
				1203	rcu_read_lock();
				1204	do_each_thread(g, p) {
				1205	if (task_subsys_state(p, subsys_id) == css) {
				1206	pidarray[n++] = pid_nr(task_pid(p));
				1207	if (unlikely(n == npids))
				1208	goto array_full;
				1209	}
				1210	} while_each_thread(g, p);
				1211
				1212	array_full:
				1213	rcu_read_unlock();
				1214	return n;
				1215	}
				1216
				1217	static int cmppid(const void a, const void b)
				1218	{
				1219	return (pid_t )a - (pid_t )b;
				1220	}
				1221
				1222	/*
				1223	* Convert array 'a' of 'npids' pid_t's to a string of newline separated
				1224	* decimal pids in 'buf'. Don't write more than 'sz' chars, but return
				1225	* count 'cnt' of how many chars would be written if buf were large enough.
				1226	*/
				1227	static int pid_array_to_buf(char buf, int sz, pid_t a, int npids)
				1228	{
				1229	int cnt = 0;
				1230	int i;
				1231
				1232	for (i = 0; i < npids; i++)
				1233	cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
				1234	return cnt;
				1235	}
				1236
				1237	/*
				1238	* Handle an open on 'tasks' file. Prepare a buffer listing the
				1239	* process id's of tasks currently attached to the cgroup being opened.
				1240	*
				1241	* Does not require any specific cgroup mutexes, and does not take any.
				1242	*/
				1243	static int cgroup_tasks_open(struct inode unused, struct file file)
				1244	{
				1245	struct cgroup *cont = __d_cont(file->f_dentry->d_parent);
				1246	struct ctr_struct *ctr;
				1247	pid_t *pidarray;
				1248	int npids;
				1249	char c;
				1250
				1251	if (!(file->f_mode & FMODE_READ))
				1252	return 0;
				1253
				1254	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
				1255	if (!ctr)
				1256	goto err0;
				1257
				1258	/*
				1259	* If cgroup gets more users after we read count, we won't have
				1260	* enough space - tough. This race is indistinguishable to the
				1261	* caller from the case that the additional cgroup users didn't
				1262	* show up until sometime later on.
				1263	*/
				1264	npids = cgroup_task_count(cont);
				1265	if (npids) {
				1266	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
				1267	if (!pidarray)
				1268	goto err1;
				1269
				1270	npids = pid_array_load(pidarray, npids, cont);
				1271	sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
				1272
				1273	/* Call pid_array_to_buf() twice, first just to get bufsz */
				1274	ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
				1275	ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
				1276	if (!ctr->buf)
				1277	goto err2;
				1278	ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
				1279
				1280	kfree(pidarray);
				1281	} else {
				1282	ctr->buf = 0;
				1283	ctr->bufsz = 0;
				1284	}
				1285	file->private_data = ctr;
				1286	return 0;
				1287
				1288	err2:
				1289	kfree(pidarray);
				1290	err1:
				1291	kfree(ctr);
				1292	err0:
				1293	return -ENOMEM;
				1294	}
				1295
				1296	static ssize_t cgroup_tasks_read(struct cgroup *cont,
				1297	struct cftype *cft,
				1298	struct file file, char __user buf,
				1299	size_t nbytes, loff_t *ppos)
				1300	{
				1301	struct ctr_struct *ctr = file->private_data;
				1302
				1303	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
				1304	}
				1305
				1306	static int cgroup_tasks_release(struct inode *unused_inode,
				1307	struct file *file)
				1308	{
				1309	struct ctr_struct *ctr;
				1310
				1311	if (file->f_mode & FMODE_READ) {
				1312	ctr = file->private_data;
				1313	kfree(ctr->buf);
				1314	kfree(ctr);
				1315	}
				1316	return 0;
				1317	}
				1318
				1319	/*
				1320	* for the common functions, 'private' gives the type of file
				1321	*/
				1322	static struct cftype cft_tasks = {
				1323	.name = "tasks",
				1324	.open = cgroup_tasks_open,
				1325	.read = cgroup_tasks_read,
				1326	.write = cgroup_common_file_write,
				1327	.release = cgroup_tasks_release,
				1328	.private = FILE_TASKLIST,
				1329	};
				1330
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1331	static int cgroup_populate_dir(struct cgroup *cont)
				1332	{
				1333	int err;
				1334	struct cgroup_subsys *ss;
				1335
				1336	/* First clear out any existing files */
				1337	cgroup_clear_directory(cont->dentry);
				1338
Paul Menage	bbcb81d	2007-10-18 23:39:32 -0700	[diff] [blame]	1339	err = cgroup_add_file(cont, NULL, &cft_tasks);
				1340	if (err < 0)
				1341	return err;
				1342
Paul Menage	ddbcc7e	2007-10-18 23:39:30 -0700	[diff] [blame]	1343	for_each_subsys(cont->root, ss) {
				1344	if (ss->populate && (err = ss->populate(ss, cont)) < 0)
				1345	return err;
				1346	}
				1347
				1348	return 0;
				1349	}
				1350
				1351	static void init_cgroup_css(struct cgroup_subsys_state *css,
				1352	struct cgroup_subsys *ss,
				1353	struct cgroup *cont)
				1354	{
				1355	css->cgroup = cont;
				1356	atomic_set(&css->refcnt, 0);
				1357	css->flags = 0;
				1358	if (cont == dummytop)
				1359	set_bit(CSS_ROOT, &css->flags);
				1360	BUG_ON(cont->subsys[ss->subsys_id]);
				1361	cont->subsys[ss->subsys_id] = css;
				1362	}
				1363
				1364	/*
				1365	* cgroup_create - create a cgroup
				1366	* parent: cgroup that will be parent of the new cgroup.
				1367	* name: name of the new cgroup. Will be strcpy'ed.
				1368	* mode: mode to set on new inode
				1369	*
				1370	* Must be called with the mutex on the parent inode held
				1371	*/
				1372
				1373	static long cgroup_create(struct cgroup parent, struct dentry dentry,
				1374	int mode)
				1375	{
				1376	struct cgroup *cont;
				1377	struct cgroupfs_root *root = parent->root;
				1378	int err = 0;
				1379	struct cgroup_subsys *ss;
				1380	struct super_block *sb = root->sb;
				1381
				1382	cont = kzalloc(sizeof(*cont), GFP_KERNEL);
				1383	if (!cont)
				1384	return -ENOMEM;
				1385
				1386	/* Grab a reference on the superblock so the hierarchy doesn't
				1387	* get deleted on unmount if there are child cgroups. This
				1388	* can be done outside cgroup_mutex, since the sb can't
				1389	* disappear while someone has an open control file on the
				1390	* fs */
				1391	atomic_inc(&sb->s_active);
				1392
				1393	mutex_lock(&cgroup_mutex);
				1394
				1395	cont->flags = 0;
				1396	INIT_LIST_HEAD(&cont->sibling);
				1397	INIT_LIST_HEAD(&cont->children);
				1398
				1399	cont->parent = parent;
				1400	cont->root = parent->root;
				1401	cont->top_cgroup = parent->top_cgroup;
				1402
				1403	for_each_subsys(root, ss) {
				1404	struct cgroup_subsys_state *css = ss->create(ss, cont);
				1405	if (IS_ERR(css)) {
				1406	err = PTR_ERR(css);
				1407	goto err_destroy;
				1408	}
				1409	init_cgroup_css(css, ss, cont);
				1410	}
				1411
				1412	list_add(&cont->sibling, &cont->parent->children);
				1413	root->number_of_cgroups++;
				1414
				1415	err = cgroup_create_dir(cont, dentry, mode);
				1416	if (err < 0)
				1417	goto err_remove;
				1418
				1419	/* The cgroup directory was pre-locked for us */
				1420	BUG_ON(!mutex_is_locked(&cont->dentry->d_inode->i_mutex));
				1421
				1422	err = cgroup_populate_dir(cont);
				1423	/* If err < 0, we have a half-filled directory - oh well ;) */
				1424
				1425	mutex_unlock(&cgroup_mutex);
				1426	mutex_unlock(&cont->dentry->d_inode->i_mutex);
				1427
				1428	return 0;
				1429
				1430	err_remove:
				1431
				1432	list_del(&cont->sibling);
				1433	root->number_of_cgroups--;
				1434
				1435	err_destroy:
				1436
				1437	for_each_subsys(root, ss) {
				1438	if (cont->subsys[ss->subsys_id])
				1439	ss->destroy(ss, cont);
				1440	}
				1441
				1442	mutex_unlock(&cgroup_mutex);
				1443
				1444	/* Release the reference count that we took on the superblock */
				1445	deactivate_super(sb);
				1446
				1447	kfree(cont);
				1448	return err;
				1449	}
				1450
				1451	static int cgroup_mkdir(struct inode dir, struct dentry dentry, int mode)
				1452	{
				1453	struct cgroup *c_parent = dentry->d_parent->d_fsdata;
				1454
				1455	/* the vfs holds inode->i_mutex already */
				1456	return cgroup_create(c_parent, dentry, mode \| S_IFDIR);
				1457	}
				1458
				1459	static int cgroup_rmdir(struct inode unused_dir, struct dentry dentry)
				1460	{
				1461	struct cgroup *cont = dentry->d_fsdata;
				1462	struct dentry *d;
				1463	struct cgroup *parent;
				1464	struct cgroup_subsys *ss;
				1465	struct super_block *sb;
				1466	struct cgroupfs_root *root;
				1467	int css_busy = 0;
				1468
				1469	/* the vfs holds both inode->i_mutex already */
				1470
				1471	mutex_lock(&cgroup_mutex);
				1472	if (atomic_read(&cont->count) != 0) {
				1473	mutex_unlock(&cgroup_mutex);
				1474	return -EBUSY;
				1475	}
				1476	if (!list_empty(&cont->children)) {
				1477	mutex_unlock(&cgroup_mutex);
				1478	return -EBUSY;
				1479	}
				1480
				1481	parent = cont->parent;
				1482	root = cont->root;
				1483	sb = root->sb;
				1484
				1485	/* Check the reference count on each subsystem. Since we
				1486	* already established that there are no tasks in the
				1487	* cgroup, if the css refcount is also 0, then there should
				1488	* be no outstanding references, so the subsystem is safe to
				1489	* destroy */
				1490	for_each_subsys(root, ss) {
				1491	struct cgroup_subsys_state *css;
				1492	css = cont->subsys[ss->subsys_id];
				1493	if (atomic_read(&css->refcnt)) {
				1494	css_busy = 1;
				1495	break;
				1496	}
				1497	}
				1498	if (css_busy) {
				1499	mutex_unlock(&cgroup_mutex);
				1500	return -EBUSY;
				1501	}
				1502
				1503	for_each_subsys(root, ss) {
				1504	if (cont->subsys[ss->subsys_id])
				1505	ss->destroy(ss, cont);
				1506	}
				1507
				1508	set_bit(CONT_REMOVED, &cont->flags);
				1509	/* delete my sibling from parent->children */
				1510	list_del(&cont->sibling);
				1511	spin_lock(&cont->dentry->d_lock);
				1512	d = dget(cont->dentry);
				1513	cont->dentry = NULL;
				1514	spin_unlock(&d->d_lock);
				1515
				1516	cgroup_d_remove_dir(d);
				1517	dput(d);
				1518	root->number_of_cgroups--;
				1519
				1520	mutex_unlock(&cgroup_mutex);
				1521	/* Drop the active superblock reference that we took when we
				1522	* created the cgroup */
				1523	deactivate_super(sb);
				1524	return 0;
				1525	}
				1526
				1527	static void cgroup_init_subsys(struct cgroup_subsys *ss)
				1528	{
				1529	struct task_struct g, p;
				1530	struct cgroup_subsys_state *css;
				1531	printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name);
				1532
				1533	/* Create the top cgroup state for this subsystem */
				1534	ss->root = &rootnode;
				1535	css = ss->create(ss, dummytop);
				1536	/* We don't handle early failures gracefully */
				1537	BUG_ON(IS_ERR(css));
				1538	init_cgroup_css(css, ss, dummytop);
				1539
				1540	/* Update all tasks to contain a subsys pointer to this state
				1541	* - since the subsystem is newly registered, all tasks are in
				1542	* the subsystem's top cgroup. */
				1543
				1544	/* If this subsystem requested that it be notified with fork
				1545	* events, we should send it one now for every process in the
				1546	* system */
				1547
				1548	read_lock(&tasklist_lock);
				1549	init_task.cgroups.subsys[ss->subsys_id] = css;
				1550	if (ss->fork)
				1551	ss->fork(ss, &init_task);
				1552
				1553	do_each_thread(g, p) {
				1554	printk(KERN_INFO "Setting task %p css to %p (%d)\n", css, p, p->pid);
				1555	p->cgroups.subsys[ss->subsys_id] = css;
				1556	if (ss->fork)
				1557	ss->fork(ss, p);
				1558	} while_each_thread(g, p);
				1559	read_unlock(&tasklist_lock);
				1560
				1561	need_forkexit_callback \|= ss->fork \|\| ss->exit;
				1562
				1563	ss->active = 1;
				1564	}
				1565
				1566	/**
				1567	* cgroup_init_early - initialize cgroups at system boot, and
				1568	* initialize any subsystems that request early init.
				1569	*/
				1570	int __init cgroup_init_early(void)
				1571	{
				1572	int i;
				1573	init_cgroup_root(&rootnode);
				1574	list_add(&rootnode.root_list, &roots);
				1575
				1576	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				1577	struct cgroup_subsys *ss = subsys[i];
				1578
				1579	BUG_ON(!ss->name);
				1580	BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
				1581	BUG_ON(!ss->create);
				1582	BUG_ON(!ss->destroy);
				1583	if (ss->subsys_id != i) {
				1584	printk(KERN_ERR "Subsys %s id == %d\n",
				1585	ss->name, ss->subsys_id);
				1586	BUG();
				1587	}
				1588
				1589	if (ss->early_init)
				1590	cgroup_init_subsys(ss);
				1591	}
				1592	return 0;
				1593	}
				1594
				1595	/**
				1596	* cgroup_init - register cgroup filesystem and /proc file, and
				1597	* initialize any subsystems that didn't request early init.
				1598	*/
				1599	int __init cgroup_init(void)
				1600	{
				1601	int err;
				1602	int i;
				1603
				1604	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				1605	struct cgroup_subsys *ss = subsys[i];
				1606	if (!ss->early_init)
				1607	cgroup_init_subsys(ss);
				1608	}
				1609
				1610	err = register_filesystem(&cgroup_fs_type);
				1611	if (err < 0)
				1612	goto out;
				1613
				1614	out:
				1615	return err;
				1616	}
Paul Menage	b4f48b6	2007-10-18 23:39:33 -0700	[diff] [blame^]	1617
				1618	/**
				1619	* cgroup_fork - attach newly forked task to its parents cgroup.
				1620	* @tsk: pointer to task_struct of forking parent process.
				1621	*
				1622	* Description: A task inherits its parent's cgroup at fork().
				1623	*
				1624	* A pointer to the shared css_set was automatically copied in
				1625	* fork.c by dup_task_struct(). However, we ignore that copy, since
				1626	* it was not made under the protection of RCU or cgroup_mutex, so
				1627	* might no longer be a valid cgroup pointer. attach_task() might
				1628	* have already changed current->cgroup, allowing the previously
				1629	* referenced cgroup to be removed and freed.
				1630	*
				1631	* At the point that cgroup_fork() is called, 'current' is the parent
				1632	* task, and the passed argument 'child' points to the child task.
				1633	*/
				1634	void cgroup_fork(struct task_struct *child)
				1635	{
				1636	rcu_read_lock();
				1637	child->cgroups = rcu_dereference(current->cgroups);
				1638	get_css_set(&child->cgroups);
				1639	rcu_read_unlock();
				1640	}
				1641
				1642	/**
				1643	* cgroup_fork_callbacks - called on a new task very soon before
				1644	* adding it to the tasklist. No need to take any locks since no-one
				1645	* can be operating on this task
				1646	*/
				1647	void cgroup_fork_callbacks(struct task_struct *child)
				1648	{
				1649	if (need_forkexit_callback) {
				1650	int i;
				1651	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				1652	struct cgroup_subsys *ss = subsys[i];
				1653	if (ss->fork)
				1654	ss->fork(ss, child);
				1655	}
				1656	}
				1657	}
				1658
				1659	/**
				1660	* cgroup_exit - detach cgroup from exiting task
				1661	* @tsk: pointer to task_struct of exiting process
				1662	*
				1663	* Description: Detach cgroup from @tsk and release it.
				1664	*
				1665	* Note that cgroups marked notify_on_release force every task in
				1666	* them to take the global cgroup_mutex mutex when exiting.
				1667	* This could impact scaling on very large systems. Be reluctant to
				1668	* use notify_on_release cgroups where very high task exit scaling
				1669	* is required on large systems.
				1670	*
				1671	* the_top_cgroup_hack:
				1672	*
				1673	* Set the exiting tasks cgroup to the root cgroup (top_cgroup).
				1674	*
				1675	* We call cgroup_exit() while the task is still competent to
				1676	* handle notify_on_release(), then leave the task attached to the
				1677	* root cgroup in each hierarchy for the remainder of its exit.
				1678	*
				1679	* To do this properly, we would increment the reference count on
				1680	* top_cgroup, and near the very end of the kernel/exit.c do_exit()
				1681	* code we would add a second cgroup function call, to drop that
				1682	* reference. This would just create an unnecessary hot spot on
				1683	* the top_cgroup reference count, to no avail.
				1684	*
				1685	* Normally, holding a reference to a cgroup without bumping its
				1686	* count is unsafe. The cgroup could go away, or someone could
				1687	* attach us to a different cgroup, decrementing the count on
				1688	* the first cgroup that we never incremented. But in this case,
				1689	* top_cgroup isn't going away, and either task has PF_EXITING set,
				1690	* which wards off any attach_task() attempts, or task is a failed
				1691	* fork, never visible to attach_task.
				1692	*
				1693	*/
				1694	void cgroup_exit(struct task_struct *tsk, int run_callbacks)
				1695	{
				1696	int i;
				1697
				1698	if (run_callbacks && need_forkexit_callback) {
				1699	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				1700	struct cgroup_subsys *ss = subsys[i];
				1701	if (ss->exit)
				1702	ss->exit(ss, tsk);
				1703	}
				1704	}
				1705	/* Reassign the task to the init_css_set. */
				1706	task_lock(tsk);
				1707	put_css_set(&tsk->cgroups);
				1708	tsk->cgroups = init_task.cgroups;
				1709	task_unlock(tsk);
				1710	}