blob: 4237b19e8fb3c43a0dcb16572a02cea76e221e69 [file] [log] [blame]
Serge E. Hallyn08ce5f12008-04-29 01:00:10 -07001/*
2 * dev_cgroup.c - device cgroup subsystem
3 *
4 * Copyright 2007 IBM Corp
5 */
6
7#include <linux/device_cgroup.h>
8#include <linux/cgroup.h>
9#include <linux/ctype.h>
10#include <linux/list.h>
11#include <linux/uaccess.h>
12
/* Access-right bits stored in dev_whitelist_item.access. */
#define ACC_MKNOD (1 << 0)
#define ACC_READ  (1 << 1)
#define ACC_WRITE (1 << 2)
#define ACC_MASK  (ACC_MKNOD | ACC_READ | ACC_WRITE)
17
/* Device-type bits stored in dev_whitelist_item.type. */
#define DEV_BLOCK (1 << 0)
#define DEV_CHAR  (1 << 1)
#define DEV_ALL   (1 << 2) /* this represents all devices */
21
22/*
23 * whitelist locking rules:
24 * cgroup_lock() cannot be taken under dev_cgroup->lock.
25 * dev_cgroup->lock can be taken with or without cgroup_lock().
26 *
27 * modifications always require cgroup_lock
28 * modifications to a list which is visible require the
29 * dev_cgroup->lock *and* cgroup_lock()
30 * walking the list requires dev_cgroup->lock or cgroup_lock().
31 *
32 * reasoning: dev_whitelist_copy() needs to kmalloc, so needs
33 * a mutex, which the cgroup_lock() is. Since modifying
34 * a visible list requires both locks, either lock can be
35 * taken for walking the list.
36 */
37
38struct dev_whitelist_item {
39 u32 major, minor;
40 short type;
41 short access;
42 struct list_head list;
43};
44
/* Per-cgroup state of the devices subsystem. */
struct dev_cgroup {
	struct cgroup_subsys_state css;
	struct list_head whitelist;	/* list of struct dev_whitelist_item */
	spinlock_t lock;		/* protects whitelist; see locking rules above */
};
50
51static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup)
52{
53 return container_of(cgroup_subsys_state(cgroup, devices_subsys_id),
54 struct dev_cgroup, css);
55}
56
/*
 * Tentative definition so that code below can refer to
 * devices_subsys.subsys_id; the initialized definition is at the
 * end of the hook implementations.
 */
struct cgroup_subsys devices_subsys;
58
59static int devcgroup_can_attach(struct cgroup_subsys *ss,
60 struct cgroup *new_cgroup, struct task_struct *task)
61{
62 if (current != task && !capable(CAP_SYS_ADMIN))
63 return -EPERM;
64
65 return 0;
66}
67
68/*
69 * called under cgroup_lock()
70 */
71static int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
72{
73 struct dev_whitelist_item *wh, *tmp, *new;
74
75 list_for_each_entry(wh, orig, list) {
76 new = kmalloc(sizeof(*wh), GFP_KERNEL);
77 if (!new)
78 goto free_and_exit;
79 new->major = wh->major;
80 new->minor = wh->minor;
81 new->type = wh->type;
82 new->access = wh->access;
83 list_add_tail(&new->list, dest);
84 }
85
86 return 0;
87
88free_and_exit:
89 list_for_each_entry_safe(wh, tmp, dest, list) {
90 list_del(&wh->list);
91 kfree(wh);
92 }
93 return -ENOMEM;
94}
95
96/* Stupid prototype - don't bother combining existing entries */
97/*
98 * called under cgroup_lock()
99 * since the list is visible to other tasks, we need the spinlock also
100 */
101static int dev_whitelist_add(struct dev_cgroup *dev_cgroup,
102 struct dev_whitelist_item *wh)
103{
104 struct dev_whitelist_item *whcopy;
105
106 whcopy = kmalloc(sizeof(*whcopy), GFP_KERNEL);
107 if (!whcopy)
108 return -ENOMEM;
109
110 memcpy(whcopy, wh, sizeof(*whcopy));
111 spin_lock(&dev_cgroup->lock);
112 list_add_tail(&whcopy->list, &dev_cgroup->whitelist);
113 spin_unlock(&dev_cgroup->lock);
114 return 0;
115}
116
117/*
118 * called under cgroup_lock()
119 * since the list is visible to other tasks, we need the spinlock also
120 */
121static void dev_whitelist_rm(struct dev_cgroup *dev_cgroup,
122 struct dev_whitelist_item *wh)
123{
124 struct dev_whitelist_item *walk, *tmp;
125
126 spin_lock(&dev_cgroup->lock);
127 list_for_each_entry_safe(walk, tmp, &dev_cgroup->whitelist, list) {
128 if (walk->type == DEV_ALL)
129 goto remove;
130 if (walk->type != wh->type)
131 continue;
132 if (walk->major != ~0 && walk->major != wh->major)
133 continue;
134 if (walk->minor != ~0 && walk->minor != wh->minor)
135 continue;
136
137remove:
138 walk->access &= ~wh->access;
139 if (!walk->access) {
140 list_del(&walk->list);
141 kfree(walk);
142 }
143 }
144 spin_unlock(&dev_cgroup->lock);
145}
146
147/*
148 * called from kernel/cgroup.c with cgroup_lock() held.
149 */
150static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss,
151 struct cgroup *cgroup)
152{
153 struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
154 struct cgroup *parent_cgroup;
155 int ret;
156
157 dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
158 if (!dev_cgroup)
159 return ERR_PTR(-ENOMEM);
160 INIT_LIST_HEAD(&dev_cgroup->whitelist);
161 parent_cgroup = cgroup->parent;
162
163 if (parent_cgroup == NULL) {
164 struct dev_whitelist_item *wh;
165 wh = kmalloc(sizeof(*wh), GFP_KERNEL);
166 if (!wh) {
167 kfree(dev_cgroup);
168 return ERR_PTR(-ENOMEM);
169 }
170 wh->minor = wh->major = ~0;
171 wh->type = DEV_ALL;
172 wh->access = ACC_MKNOD | ACC_READ | ACC_WRITE;
173 list_add(&wh->list, &dev_cgroup->whitelist);
174 } else {
175 parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup);
176 ret = dev_whitelist_copy(&dev_cgroup->whitelist,
177 &parent_dev_cgroup->whitelist);
178 if (ret) {
179 kfree(dev_cgroup);
180 return ERR_PTR(ret);
181 }
182 }
183
184 spin_lock_init(&dev_cgroup->lock);
185 return &dev_cgroup->css;
186}
187
188static void devcgroup_destroy(struct cgroup_subsys *ss,
189 struct cgroup *cgroup)
190{
191 struct dev_cgroup *dev_cgroup;
192 struct dev_whitelist_item *wh, *tmp;
193
194 dev_cgroup = cgroup_to_devcgroup(cgroup);
195 list_for_each_entry_safe(wh, tmp, &dev_cgroup->whitelist, list) {
196 list_del(&wh->list);
197 kfree(wh);
198 }
199 kfree(dev_cgroup);
200}
201
/* Values stored in cftype.private to tell the "allow" file from "deny". */
#define DEVCG_ALLOW 1
#define DEVCG_DENY 2
204
205static void set_access(char *acc, short access)
206{
207 int idx = 0;
208 memset(acc, 0, 4);
209 if (access & ACC_READ)
210 acc[idx++] = 'r';
211 if (access & ACC_WRITE)
212 acc[idx++] = 'w';
213 if (access & ACC_MKNOD)
214 acc[idx++] = 'm';
215}
216
217static char type_to_char(short type)
218{
219 if (type == DEV_ALL)
220 return 'a';
221 if (type == DEV_CHAR)
222 return 'c';
223 if (type == DEV_BLOCK)
224 return 'b';
225 return 'X';
226}
227
/*
 * Format device number @m into @str, a buffer of @len bytes: "*" when
 * @m is the wildcard ~0, otherwise its unsigned decimal value.  The
 * buffer is zeroed first and output that does not fit is truncated.
 *
 * @m is unsigned, so it is printed with %u; %d would show values
 * >= 2^31 as negative numbers.
 */
static void set_majmin(char *str, int len, unsigned m)
{
	memset(str, 0, len);
	if (m == ~0U)
		snprintf(str, len, "*");
	else
		snprintf(str, len, "%u", m);
}
236
237static char *print_whitelist(struct dev_cgroup *devcgroup, int *len)
238{
239 char *buf, *s, acc[4];
240 struct dev_whitelist_item *wh;
241 int ret;
242 int count = 0;
243 char maj[10], min[10];
244
245 buf = kmalloc(4096, GFP_KERNEL);
246 if (!buf)
247 return ERR_PTR(-ENOMEM);
248 s = buf;
249 *s = '\0';
250 *len = 0;
251
252 spin_lock(&devcgroup->lock);
253 list_for_each_entry(wh, &devcgroup->whitelist, list) {
254 set_access(acc, wh->access);
255 set_majmin(maj, 10, wh->major);
256 set_majmin(min, 10, wh->minor);
257 ret = snprintf(s, 4095-(s-buf), "%c %s:%s %s\n",
258 type_to_char(wh->type), maj, min, acc);
259 if (s+ret >= buf+4095) {
260 kfree(buf);
261 buf = ERR_PTR(-ENOMEM);
262 break;
263 }
264 s += ret;
265 *len += ret;
266 count++;
267 }
268 spin_unlock(&devcgroup->lock);
269
270 return buf;
271}
272
273static ssize_t devcgroup_access_read(struct cgroup *cgroup,
274 struct cftype *cft, struct file *file,
275 char __user *userbuf, size_t nbytes, loff_t *ppos)
276{
277 struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
278 int filetype = cft->private;
279 char *buffer;
280 int uninitialized_var(len);
281 int retval;
282
283 if (filetype != DEVCG_ALLOW)
284 return -EINVAL;
285 buffer = print_whitelist(devcgroup, &len);
286 if (IS_ERR(buffer))
287 return PTR_ERR(buffer);
288
289 retval = simple_read_from_buffer(userbuf, nbytes, ppos, buffer, len);
290 kfree(buffer);
291 return retval;
292}
293
294/*
295 * may_access_whitelist:
296 * does the access granted to dev_cgroup c contain the access
297 * requested in whitelist item refwh.
298 * return 1 if yes, 0 if no.
299 * call with c->lock held
300 */
301static int may_access_whitelist(struct dev_cgroup *c,
302 struct dev_whitelist_item *refwh)
303{
304 struct dev_whitelist_item *whitem;
305
306 list_for_each_entry(whitem, &c->whitelist, list) {
307 if (whitem->type & DEV_ALL)
308 return 1;
309 if ((refwh->type & DEV_BLOCK) && !(whitem->type & DEV_BLOCK))
310 continue;
311 if ((refwh->type & DEV_CHAR) && !(whitem->type & DEV_CHAR))
312 continue;
313 if (whitem->major != ~0 && whitem->major != refwh->major)
314 continue;
315 if (whitem->minor != ~0 && whitem->minor != refwh->minor)
316 continue;
317 if (refwh->access & (~(whitem->access | ACC_MASK)))
318 continue;
319 return 1;
320 }
321 return 0;
322}
323
324/*
325 * parent_has_perm:
326 * when adding a new allow rule to a device whitelist, the rule
327 * must be allowed in the parent device
328 */
329static int parent_has_perm(struct cgroup *childcg,
330 struct dev_whitelist_item *wh)
331{
332 struct cgroup *pcg = childcg->parent;
333 struct dev_cgroup *parent;
334 int ret;
335
336 if (!pcg)
337 return 1;
338 parent = cgroup_to_devcgroup(pcg);
339 spin_lock(&parent->lock);
340 ret = may_access_whitelist(parent, wh);
341 spin_unlock(&parent->lock);
342 return ret;
343}
344
345/*
346 * Modify the whitelist using allow/deny rules.
347 * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD
348 * so we can give a container CAP_MKNOD to let it create devices but not
349 * modify the whitelist.
350 * It seems likely we'll want to add a CAP_CONTAINER capability to allow
351 * us to also grant CAP_SYS_ADMIN to containers without giving away the
352 * device whitelist controls, but for now we'll stick with CAP_SYS_ADMIN
353 *
354 * Taking rules away is always allowed (given CAP_SYS_ADMIN). Granting
355 * new access is only allowed if you're in the top-level cgroup, or your
356 * parent cgroup has the access you're asking for.
357 */
358static ssize_t devcgroup_access_write(struct cgroup *cgroup, struct cftype *cft,
359 struct file *file, const char __user *userbuf,
360 size_t nbytes, loff_t *ppos)
361{
362 struct cgroup *cur_cgroup;
363 struct dev_cgroup *devcgroup, *cur_devcgroup;
364 int filetype = cft->private;
365 char *buffer, *b;
366 int retval = 0, count;
367 struct dev_whitelist_item wh;
368
369 if (!capable(CAP_SYS_ADMIN))
370 return -EPERM;
371
372 devcgroup = cgroup_to_devcgroup(cgroup);
373 cur_cgroup = task_cgroup(current, devices_subsys.subsys_id);
374 cur_devcgroup = cgroup_to_devcgroup(cur_cgroup);
375
376 buffer = kmalloc(nbytes+1, GFP_KERNEL);
377 if (!buffer)
378 return -ENOMEM;
379
380 if (copy_from_user(buffer, userbuf, nbytes)) {
381 retval = -EFAULT;
382 goto out1;
383 }
384 buffer[nbytes] = 0; /* nul-terminate */
385
386 cgroup_lock();
387 if (cgroup_is_removed(cgroup)) {
388 retval = -ENODEV;
389 goto out2;
390 }
391
392 memset(&wh, 0, sizeof(wh));
393 b = buffer;
394
395 switch (*b) {
396 case 'a':
397 wh.type = DEV_ALL;
398 wh.access = ACC_MASK;
399 goto handle;
400 case 'b':
401 wh.type = DEV_BLOCK;
402 break;
403 case 'c':
404 wh.type = DEV_CHAR;
405 break;
406 default:
407 retval = -EINVAL;
408 goto out2;
409 }
410 b++;
411 if (!isspace(*b)) {
412 retval = -EINVAL;
413 goto out2;
414 }
415 b++;
416 if (*b == '*') {
417 wh.major = ~0;
418 b++;
419 } else if (isdigit(*b)) {
420 wh.major = 0;
421 while (isdigit(*b)) {
422 wh.major = wh.major*10+(*b-'0');
423 b++;
424 }
425 } else {
426 retval = -EINVAL;
427 goto out2;
428 }
429 if (*b != ':') {
430 retval = -EINVAL;
431 goto out2;
432 }
433 b++;
434
435 /* read minor */
436 if (*b == '*') {
437 wh.minor = ~0;
438 b++;
439 } else if (isdigit(*b)) {
440 wh.minor = 0;
441 while (isdigit(*b)) {
442 wh.minor = wh.minor*10+(*b-'0');
443 b++;
444 }
445 } else {
446 retval = -EINVAL;
447 goto out2;
448 }
449 if (!isspace(*b)) {
450 retval = -EINVAL;
451 goto out2;
452 }
453 for (b++, count = 0; count < 3; count++, b++) {
454 switch (*b) {
455 case 'r':
456 wh.access |= ACC_READ;
457 break;
458 case 'w':
459 wh.access |= ACC_WRITE;
460 break;
461 case 'm':
462 wh.access |= ACC_MKNOD;
463 break;
464 case '\n':
465 case '\0':
466 count = 3;
467 break;
468 default:
469 retval = -EINVAL;
470 goto out2;
471 }
472 }
473
474handle:
475 retval = 0;
476 switch (filetype) {
477 case DEVCG_ALLOW:
478 if (!parent_has_perm(cgroup, &wh))
479 retval = -EPERM;
480 else
481 retval = dev_whitelist_add(devcgroup, &wh);
482 break;
483 case DEVCG_DENY:
484 dev_whitelist_rm(devcgroup, &wh);
485 break;
486 default:
487 retval = -EINVAL;
488 goto out2;
489 }
490
491 if (retval == 0)
492 retval = nbytes;
493
494out2:
495 cgroup_unlock();
496out1:
497 kfree(buffer);
498 return retval;
499}
500
/*
 * Control files of the devices subsystem.  "allow" can be read and
 * written; "deny" has a write handler only.  Both share the same write
 * handler, which uses .private to tell them apart.
 */
static struct cftype dev_cgroup_files[] = {
	{
		.name = "allow",
		.read = devcgroup_access_read,
		.write = devcgroup_access_write,
		.private = DEVCG_ALLOW,
	},
	{
		.name = "deny",
		.write = devcgroup_access_write,
		.private = DEVCG_DENY,
	},
};
514
515static int devcgroup_populate(struct cgroup_subsys *ss,
516 struct cgroup *cgroup)
517{
518 return cgroup_add_files(cgroup, ss, dev_cgroup_files,
519 ARRAY_SIZE(dev_cgroup_files));
520}
521
/* The "devices" cgroup subsystem: name, id and the hooks defined above. */
struct cgroup_subsys devices_subsys = {
	.name = "devices",
	.can_attach = devcgroup_can_attach,
	.create = devcgroup_create,
	.destroy = devcgroup_destroy,
	.populate = devcgroup_populate,
	.subsys_id = devices_subsys_id,
};
530
531int devcgroup_inode_permission(struct inode *inode, int mask)
532{
533 struct cgroup *cgroup;
534 struct dev_cgroup *dev_cgroup;
535 struct dev_whitelist_item *wh;
536
537 dev_t device = inode->i_rdev;
538 if (!device)
539 return 0;
540 if (!S_ISBLK(inode->i_mode) && !S_ISCHR(inode->i_mode))
541 return 0;
542 cgroup = task_cgroup(current, devices_subsys.subsys_id);
543 dev_cgroup = cgroup_to_devcgroup(cgroup);
544 if (!dev_cgroup)
545 return 0;
546
547 spin_lock(&dev_cgroup->lock);
548 list_for_each_entry(wh, &dev_cgroup->whitelist, list) {
549 if (wh->type & DEV_ALL)
550 goto acc_check;
551 if ((wh->type & DEV_BLOCK) && !S_ISBLK(inode->i_mode))
552 continue;
553 if ((wh->type & DEV_CHAR) && !S_ISCHR(inode->i_mode))
554 continue;
555 if (wh->major != ~0 && wh->major != imajor(inode))
556 continue;
557 if (wh->minor != ~0 && wh->minor != iminor(inode))
558 continue;
559acc_check:
560 if ((mask & MAY_WRITE) && !(wh->access & ACC_WRITE))
561 continue;
562 if ((mask & MAY_READ) && !(wh->access & ACC_READ))
563 continue;
564 spin_unlock(&dev_cgroup->lock);
565 return 0;
566 }
567 spin_unlock(&dev_cgroup->lock);
568
569 return -EPERM;
570}
571
572int devcgroup_inode_mknod(int mode, dev_t dev)
573{
574 struct cgroup *cgroup;
575 struct dev_cgroup *dev_cgroup;
576 struct dev_whitelist_item *wh;
577
578 cgroup = task_cgroup(current, devices_subsys.subsys_id);
579 dev_cgroup = cgroup_to_devcgroup(cgroup);
580 if (!dev_cgroup)
581 return 0;
582
583 spin_lock(&dev_cgroup->lock);
584 list_for_each_entry(wh, &dev_cgroup->whitelist, list) {
585 if (wh->type & DEV_ALL)
586 goto acc_check;
587 if ((wh->type & DEV_BLOCK) && !S_ISBLK(mode))
588 continue;
589 if ((wh->type & DEV_CHAR) && !S_ISCHR(mode))
590 continue;
591 if (wh->major != ~0 && wh->major != MAJOR(dev))
592 continue;
593 if (wh->minor != ~0 && wh->minor != MINOR(dev))
594 continue;
595acc_check:
596 if (!(wh->access & ACC_MKNOD))
597 continue;
598 spin_unlock(&dev_cgroup->lock);
599 return 0;
600 }
601 spin_unlock(&dev_cgroup->lock);
602 return -EPERM;
603}