/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
#define _BSD_SOURCE
#define _GNU_SOURCE
#include <ctype.h>
#include <errno.h>
#include <grp.h>
#include <inttypes.h>
#include <limits.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <pwd.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/capability.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <syslog.h>
#include <unistd.h>

#include "libminijail.h"
#include "libsyscalls.h"
#include "libminijail-private.h"
31
/* Fallback values for the seccomp-filter prctl() options, used only until
 * linux/prctl.h reliably provides them.  The whole group is keyed on
 * PR_SET_SECCOMP_FILTER being undefined. */
#ifndef PR_SET_SECCOMP_FILTER
#define PR_SECCOMP_FILTER_SYSCALL 0
#define PR_SECCOMP_FILTER_EVENT   1
#define PR_GET_SECCOMP_FILTER     35
#define PR_SET_SECCOMP_FILTER     36
#define PR_CLEAR_SECCOMP_FILTER   37
#endif
40
/* A single seccomp filter rule: the syscall number it applies to and the
 * filter string for that syscall.  Rules are chained to their neighbours
 * through the next/prev links. */
struct seccomp_filter {
  int nr;                       /* syscall number */
  char *filter;                 /* filter string for this syscall */
  struct seccomp_filter *next;  /* following rule */
  struct seccomp_filter *prev;  /* preceding rule */
};
46
/* Description of a jail: which restrictions to apply and their parameters.
 *
 * The flags are single-bit booleans.  They are declared unsigned so that a
 * stored 1 reads back as 1; a one-bit signed bit-field can only represent
 * 0 and -1. */
struct minijail {
  struct {
    unsigned int uid : 1;            /* switch to 'uid' */
    unsigned int gid : 1;            /* switch to 'gid' */
    unsigned int caps : 1;           /* restrict capabilities to 'caps' */
    unsigned int vfs : 1;            /* enter a new mount namespace */
    unsigned int pids : 1;           /* enter a new pid namespace */
    unsigned int seccomp : 1;        /* enable seccomp mode 1 */
    unsigned int readonly : 1;       /* remount /proc read-only */
    unsigned int usergroups : 1;     /* inherit the groups of 'user' */
    unsigned int ptrace : 1;         /* pass "ptrace" to the preloaded code */
    unsigned int seccomp_filter : 1; /* enable seccomp filtering */
  } flags;
  uid_t uid;          /* target user id */
  gid_t gid;          /* target group id */
  gid_t usergid;      /* primary gid of 'user', passed to initgroups() */
  const char *user;   /* user name; borrowed from the caller, not copied */
  uint64_t caps;      /* capability mask, one bit per capability number */
  pid_t initpid;      /* pid of the jailed child created by minijail_run() */
  struct seccomp_filter *filters; /* head of the filter list, or NULL */
};
68
/* Log a printf-style message to syslog at LOG_ERR, prefixed with
 * "libminijail: ", then abort() the process.  _msg must be a string literal
 * because it is pasted onto the prefix. */
#define die(_msg, ...)                                      \
  do {                                                      \
    syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__);  \
    abort();                                                \
  } while (0)
Elly Jonescd7a9042011-07-22 13:56:51 -040073
/* Like die(), but appends ": " and the strerror() text for the current errno
 * to the message. */
#define pdie(_msg, ...)                                     \
  die(_msg ": %s", ## __VA_ARGS__, strerror(errno))
76
/* Log a printf-style message to syslog at LOG_WARNING, prefixed with
 * "libminijail: ".  Execution continues afterwards. */
#define warn(_msg, ...)                                     \
  syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
Elly Jonescd7a9042011-07-22 13:56:51 -040079
80struct minijail *minijail_new(void) {
81 struct minijail *j = malloc(sizeof(*j));
82 if (j)
83 memset(j, 0, sizeof(*j));
84 return j;
85}
86
87void minijail_change_uid(struct minijail *j, uid_t uid) {
88 if (uid == 0)
89 die("useless change to uid 0");
90 j->uid = uid;
91 j->flags.uid = 1;
92}
93
94void minijail_change_gid(struct minijail *j, gid_t gid) {
95 if (gid == 0)
96 die("useless change to gid 0");
97 j->gid = gid;
98 j->flags.gid = 1;
99}
100
101int minijail_change_user(struct minijail *j, const char *user) {
102 /* In principle this should use getpwnam(), but:
103 * 1) getpwnam_r() isn't actually reentrant anyway, since it uses a
104 * statically-allocated file descriptor internally
105 * 2) fgetpwnam() (by analogy with fgetpwent) would solve (1) except that it
106 * doesn't exist
107 * 3) sysconf() (see getpwnam_r(3)) is allowed to return a size that is not
108 * large enough, which means having to loop on growing the buffer we pass
109 * in
110 */
111 struct passwd *pw = getpwnam(user);
112 if (!pw)
113 return errno;
114 minijail_change_uid(j, pw->pw_uid);
115 j->user = user;
116 j->usergid = pw->pw_gid;
117 return 0;
118}
119
/* Ask the jail to switch to the group named |group|.
 *
 * Returns 0 on success or a positive errno value on failure; ENOENT is
 * returned when the lookup fails without setting errno (unknown group). */
int minijail_change_group(struct minijail *j, const char *group) {
  /* In principle this should use getgrnam_r(), but:
   * 1) getgrnam_r() isn't actually reentrant anyway, since it uses a
   *    statically-allocated file descriptor internally
   * 2) fgetgrnam() (by analogy with fgetgrent) would solve (1) except that it
   *    doesn't exist
   * 3) sysconf() (see getgrnam_r(3)) is allowed to return a size that is not
   *    large enough, which means having to loop on growing the buffer we pass
   *    in
   */
  struct group *gr;

  /* getgrnam() leaves errno untouched when the entry simply does not exist,
   * so clear it first to tell "not found" apart from a stale value. */
  errno = 0;
  gr = getgrnam(group);
  if (!gr)
    return errno ? errno : ENOENT;
  minijail_change_gid(j, gr->gr_gid);
  return 0;
}
136
137void minijail_use_seccomp(struct minijail *j) {
138 j->flags.seccomp = 1;
139}
140
Will Drewry32ac9f52011-08-18 21:36:27 -0500141void minijail_use_seccomp_filter(struct minijail *j) {
142 j->flags.seccomp_filter = 1;
143}
144
Elly Jonescd7a9042011-07-22 13:56:51 -0400145void minijail_use_caps(struct minijail *j, uint64_t capmask) {
146 j->caps = capmask;
147 j->flags.caps = 1;
148}
149
150void minijail_namespace_vfs(struct minijail *j) {
151 j->flags.vfs = 1;
152}
153
154void minijail_namespace_pids(struct minijail *j) {
155 j->flags.pids = 1;
156}
157
158void minijail_remount_readonly(struct minijail *j) {
159 j->flags.vfs = 1;
160 j->flags.readonly = 1;
161}
162
163void minijail_inherit_usergroups(struct minijail *j) {
164 j->flags.usergroups = 1;
165}
166
167void minijail_disable_ptrace(struct minijail *j) {
168 j->flags.ptrace = 1;
169}
170
Will Drewry32ac9f52011-08-18 21:36:27 -0500171int minijail_add_seccomp_filter(struct minijail *j, int nr,
172 const char *filter) {
173 struct seccomp_filter *sf;
174 if (!filter || nr < 0)
175 return -EINVAL;
176
177 sf = malloc(sizeof(*sf));
178 if (!sf)
179 return -ENOMEM;
180 sf->nr = nr;
181 sf->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
182 if (!sf->filter) {
183 free(sf);
184 return -ENOMEM;
185 }
186
187 if (!j->filters) {
188 j->filters = sf;
189 sf->next = sf;
190 sf->prev = sf;
191 return 0;
192 }
193 sf->next = j->filters;
194 sf->prev = j->filters->prev;
195 sf->prev->next = sf;
196 j->filters->prev = sf;
197 return 0;
198}
199
200int minijail_lookup_syscall(const char *name) {
201 const struct syscall_entry *entry = syscall_table;
202 for (; entry->name && entry->nr >= 0; ++entry)
203 if (!strcmp(entry->name, name))
204 return entry->nr;
205 return -1;
206}
207
/* Trim a string in place.
 *
 * Skips leading blanks (space/tab) and cuts off trailing blanks and
 * newlines by writing a terminator into |s|.  Returns a pointer to the first
 * non-blank character, which lies inside the caller's buffer.
 *
 * Trimming is length-based so that an empty or all-blank string never causes
 * an access before the start of the string. */
static char *strip(char *s) {
  size_t len;

  /* <ctype.h> functions require values representable as unsigned char. */
  while (*s && isblank((unsigned char)*s))
    s++;

  len = strlen(s);
  while (len > 0 &&
         (isblank((unsigned char)s[len - 1]) || s[len - 1] == '\n'))
    len--;
  s[len] = '\0';
  return s;
}
218
219void minijail_parse_seccomp_filters(struct minijail *j, const char *path) {
220 FILE *file = fopen(path, "r");
221 char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
222 int count = 1;
223 if (!file)
224 pdie("failed to open seccomp filters file");
225
226 /* Format is simple:
227 * syscall_name<COLON><FILTER STRING>[\n|EOF]
228 * #...comment...
229 * <empty line?
230 */
231 while (fgets(line, sizeof(line), file)) {
232 char *filter = line;
233 char *name = strsep(&filter, ":");
234 char *name_end = NULL;
235 int nr = -1;
236
237 if (!name)
238 die("invalid filter on line %d", count);
239
240 name = strip(name);
241
242 if (!filter) {
243 if (strlen(name))
244 die("invalid filter on line %d", count);
245 /* Allow empty lines */
246 continue;
247 }
248
249 /* Allow comment lines */
250 if (*name == '#')
251 continue;
252
253 filter = strip(filter);
254
255 /* Take direct syscall numbers */
256 nr = strtol(name, &name_end, 0);
257 /* Or fail-over to using names */
258 if (*name_end != '\0')
259 nr = minijail_lookup_syscall(name);
260 if (nr < 0)
261 die("syscall '%s' unknown", name);
262
263 if (minijail_add_seccomp_filter(j, nr, filter))
264 pdie("failed to add filter for syscall '%s'", name);
265 }
266 fclose(file);
267}
268
/* Replace the /proc mount with a fresh read-only one.
 *
 * Returns 0 on success, or the errno left by the failing umount()/mount(). */
static int remount_readonly(void) {
  const char *proc_dir = "/proc";
  const unsigned int ro_flags = MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY;

  /* Right now, we're holding a reference to our parent's old mount of /proc
   * in our namespace, which means using MS_REMOUNT here would mutate our
   * parent's mount as well, even though we're in a VFS namespace (!).
   * Instead, remove their mount from our namespace and make our own. */
  if (umount(proc_dir) != 0 ||
      mount("", proc_dir, "proc", ro_flags, "") != 0) {
    return errno;
  }
  return 0;
}
282
283static void drop_caps(const struct minijail *j) {
284 cap_t caps = cap_get_proc();
285 cap_value_t raise_flag[1];
286 unsigned int i;
287 if (!caps)
288 die("can't get process caps");
289 if (cap_clear_flag(caps, CAP_INHERITABLE))
290 die("can't clear inheritable caps");
291 if (cap_clear_flag(caps, CAP_EFFECTIVE))
292 die("can't clear effective caps");
293 if (cap_clear_flag(caps, CAP_PERMITTED))
294 die("can't clear permitted caps");
295 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
296 if (i != CAP_SETPCAP && !(j->caps & (1 << i)))
297 continue;
298 raise_flag[0] = i;
299 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
300 die("can't add effective cap");
301 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
302 die("can't add permitted cap");
303 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
304 die("can't add inheritable cap");
305 }
306 if (cap_set_proc(caps))
307 die("can't apply cleaned capset");
308 cap_free(caps);
309 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
310 if (j->caps & (1 << i))
311 continue;
312 if (prctl(PR_CAPBSET_DROP, i))
313 pdie("prctl(PR_CAPBSET_DROP)");
314 }
315}
316
Will Drewry32ac9f52011-08-18 21:36:27 -0500317static int setup_seccomp_filters(const struct minijail *j) {
318 const struct seccomp_filter *sf = j->filters;
319 int ret = 0;
320 int broaden = 0;
321
322 /* No filters installed isn't necessarily an error. */
323 if (!sf)
324 return ret;
325
326 do {
327 errno = 0;
328 ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
329 sf->nr, broaden ? "1" : sf->filter);
330 if (ret) {
331 switch (errno) {
332 case ENOSYS:
333 /* TODO(wad) make this a config option */
334 if (broaden)
335 die("CONFIG_SECCOMP_FILTER is not supported by your kernel");
336 warn("missing CONFIG_FTRACE_SYSCALLS; relaxing the filter for %d",
337 sf->nr);
338 broaden = 1;
339 continue;
340 case E2BIG:
341 warn("seccomp filter too long: %d", sf->nr);
342 pdie("filter too long");
343 case ENOSPC:
344 pdie("too many seccomp filters");
345 case EPERM:
346 warn("syscall filter disallowed for %d", sf->nr);
347 pdie("failed to install seccomp filter");
348 case EINVAL:
349 warn("seccomp filter or call method is invalid. %d:'%s'",
350 sf->nr, sf->filter);
351 default:
352 pdie("failed to install seccomp filter");
353 }
354 }
355 sf = sf->next;
356 broaden = 0;
357 } while (sf != j->filters);
358 return ret;
359}
360
Elly Jonescd7a9042011-07-22 13:56:51 -0400361void minijail_enter(const struct minijail *j) {
Will Drewry32ac9f52011-08-18 21:36:27 -0500362 int ret;
Elly Jonescd7a9042011-07-22 13:56:51 -0400363 if (j->flags.pids)
364 die("tried to enter a pid-namespaced jail; try minijail_run()?");
365
Will Drewry32ac9f52011-08-18 21:36:27 -0500366 ret = setup_seccomp_filters(j);
367 if (j->flags.seccomp_filter && ret)
368 die("failed to configure seccomp filters");
369
Elly Jonescd7a9042011-07-22 13:56:51 -0400370 if (j->flags.usergroups && !j->user)
371 die("usergroup inheritance without username");
372
373 /* We can't recover from failures if we've dropped privileges partially,
374 * so we don't even try. If any of our operations fail, we abort() the
375 * entire process. */
376 if (j->flags.vfs && unshare(CLONE_NEWNS))
377 pdie("unshare");
378
379 if (j->flags.readonly && remount_readonly())
380 pdie("remount");
381
382 if (j->flags.caps) {
383 /* POSIX capabilities are a bit tricky. If we drop our capability to change
384 * uids, our attempt to use setuid() below will fail. Hang on to root caps
385 * across setuid(), then lock securebits. */
386 if (prctl(PR_SET_KEEPCAPS, 1))
387 pdie("prctl(PR_SET_KEEPCAPS)");
388 if (prctl(PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
389 pdie("prctl(PR_SET_SECUREBITS)");
390 }
391
Will Drewry32ac9f52011-08-18 21:36:27 -0500392 if (j->flags.usergroups && initgroups(j->user, j->usergid)) {
Elly Jonescd7a9042011-07-22 13:56:51 -0400393 pdie("initgroups");
Will Drewry32ac9f52011-08-18 21:36:27 -0500394 } else if (!j->flags.usergroups && setgroups(0, NULL)) {
Elly Jonescd7a9042011-07-22 13:56:51 -0400395 pdie("setgroups");
Will Drewry32ac9f52011-08-18 21:36:27 -0500396 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400397
398 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
399 pdie("setresgid");
400
401 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
402 pdie("setresuid");
403
404 if (j->flags.caps)
405 drop_caps(j);
406
407 /* seccomp has to come last since it cuts off all the other
408 * privilege-dropping syscalls :) */
Will Drewry32ac9f52011-08-18 21:36:27 -0500409 if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
410 pdie("prctl(PR_SET_SECCOMP, 13)");
411
Elly Jonescd7a9042011-07-22 13:56:51 -0400412 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
413 pdie("prctl(PR_SET_SECCOMP)");
414}
415
416static int init_exitstatus = 0;
417
418static void init_term(int __attribute__((unused)) sig) {
419 _exit(init_exitstatus);
420}
421
422static int init(pid_t rootpid) {
423 pid_t pid;
424 int status;
425 signal(SIGTERM, init_term); /* so that we exit with the right status */
426 while ((pid = wait(&status)) > 0) {
427 /* This loop will only end when either there are no processes left inside
428 * our pid namespace or we get a signal. */
429 if (pid == rootpid)
430 init_exitstatus = status;
431 }
432 if (!WIFEXITED(init_exitstatus))
433 _exit(MINIJAIL_ERR_INIT);
434 _exit(WEXITSTATUS(init_exitstatus));
435}
436
437/** @brief Move any commands that need to be done post-exec into an environment
438 * variable
439 * @param j Jail to move commands from.
440 *
441 * Serializes post-exec() commands into a string, removes them from the jail,
442 * and adds them to the environment; they will be deserialized later (see
443 * __minijail_preloaded) and executed inside the execve()'d process.
444 */
445static int move_commands_to_env(struct minijail *j) {
446 const int kEnvBufSize = 256;
447 const char *ptrace = j->flags.ptrace ? "ptrace " : "";
448 const char *seccomp = j->flags.seccomp ? "seccomp " : "";
449 char setuid[64] = "";
450 char caps[32] = "";
451 char *newenv;
452 char *oldenv;
453 char *envbuf = malloc(kEnvBufSize);
454 int r;
455
456 if (!envbuf)
457 return -ENOMEM;
458
459 if (j->flags.caps)
460 snprintf(caps, sizeof(caps), "caps=%" PRIx64 " ", j->caps);
461
462 if (j->flags.uid && j->flags.caps) {
463 snprintf(setuid, sizeof(setuid), "uid=%d ", j->uid);
464 j->flags.uid = 0;
465 }
466
467 j->flags.caps = 0;
468 j->flags.ptrace = 0;
469 j->flags.seccomp = 0;
470
Will Drewry32ac9f52011-08-18 21:36:27 -0500471 if (j->flags.seccomp_filter)
472 warn("TODO(wad) seccomp_filter is installed in the parent which "
473 "requires overly permissive rules for execve(2)ing.");
474
Elly Jonescd7a9042011-07-22 13:56:51 -0400475 r = snprintf(envbuf, kEnvBufSize, "%s%s%s%s", setuid, ptrace, seccomp, caps);
476 if (!r) {
477 /* No commands generated, so no preload needed :) */
478 free(envbuf);
479 return 0;
480 }
481 if (r == kEnvBufSize) {
482 free(envbuf);
483 return -E2BIG;
484 }
485
Ben Chan541c7e52011-08-26 14:55:53 -0700486 oldenv = getenv(kLdPreloadEnvVar) ? : "";
Elly Jonescd7a9042011-07-22 13:56:51 -0400487 newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
488 if (!newenv) {
489 free(envbuf);
490 return -ENOMEM;
491 }
492
493 /* Only insert a separating space if we have something to separate... */
494 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", PRELOADPATH);
495
496 /* setenv() makes a copy of the string we give it */
Ben Chan541c7e52011-08-26 14:55:53 -0700497 setenv(kLdPreloadEnvVar, newenv, 1);
Elly Jonescd7a9042011-07-22 13:56:51 -0400498 setenv(kCommandEnvVar, envbuf, 1);
499 free(newenv);
500 free(envbuf);
501 return 0;
502}
503
504int minijail_run(struct minijail *j, const char *filename, char *const argv[]) {
505 unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
Ben Chan541c7e52011-08-26 14:55:53 -0700506 char *oldenv, *oldenv_copy = NULL;
Elly Jonescd7a9042011-07-22 13:56:51 -0400507 pid_t r;
Ben Chan541c7e52011-08-26 14:55:53 -0700508
509 oldenv = getenv(kLdPreloadEnvVar);
510 if (oldenv) {
511 oldenv_copy = strdup(oldenv);
512 if (!oldenv_copy)
513 return -ENOMEM;
514 }
515
Elly Jonescd7a9042011-07-22 13:56:51 -0400516 r = move_commands_to_env(j);
Ben Chan541c7e52011-08-26 14:55:53 -0700517 if (r) {
518 /* No environment variable is modified if move_commands_to_env returns
519 * a non-zero value. */
520 free(oldenv_copy);
Elly Jonescd7a9042011-07-22 13:56:51 -0400521 return r;
Ben Chan541c7e52011-08-26 14:55:53 -0700522 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400523
524 r = syscall(SYS_clone, pidns | SIGCHLD, NULL);
525 if (r > 0) {
Ben Chan541c7e52011-08-26 14:55:53 -0700526 if (oldenv_copy) {
527 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
528 free(oldenv_copy);
529 } else {
530 unsetenv(kLdPreloadEnvVar);
531 }
532 unsetenv(kCommandEnvVar);
Elly Jonescd7a9042011-07-22 13:56:51 -0400533 j->initpid = r;
534 return 0;
535 }
Ben Chan541c7e52011-08-26 14:55:53 -0700536
537 free(oldenv_copy);
538
Elly Jonescd7a9042011-07-22 13:56:51 -0400539 if (r < 0)
540 return r;
541
542 j->flags.pids = 0;
543
544 /* Jail this process and its descendants... */
545 minijail_enter(j);
546
547 if (pidns) {
548 /* pid namespace: this process will become init inside the new namespace, so
549 * fork off a child to actually run the program (we don't want all programs
550 * we might exec to have to know how to be init). */
551 r = fork();
552 if (r < 0)
553 _exit(r);
554 else if (r > 0)
555 init(r); /* never returns */
556 }
557
558 /* If we aren't pid-namespaced:
559 * calling process
560 * -> execve()-ing process
561 * If we are:
562 * calling process
563 * -> init()-ing process
564 * -> execve()-ing process
565 */
566 _exit(execve(filename, argv, environ));
567}
568
569int minijail_kill(struct minijail *j) {
570 int st;
571 if (kill(j->initpid, SIGTERM))
572 return errno;
573 if (waitpid(j->initpid, &st, 0) < 0)
574 return errno;
575 return st;
576}
577
578int minijail_wait(struct minijail *j) {
579 int st;
580 if (waitpid(j->initpid, &st, 0) < 0)
581 return errno;
582 if (!WIFEXITED(st))
583 return MINIJAIL_ERR_JAIL;
584 return WEXITSTATUS(st);
585}
586
587void minijail_destroy(struct minijail *j) {
Will Drewry32ac9f52011-08-18 21:36:27 -0500588 struct seccomp_filter *f = j->filters;
589 /* Unlink the tail and head */
590 if (f)
591 f->prev->next = NULL;
592 while (f) {
593 struct seccomp_filter *next = f->next;
594 free(f->filter);
595 free(f);
596 f = next;
597 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400598 free(j);
599}