blob: e29f1ac24ff05e5c7fcf16b1b36d3c97dee0a903 [file] [log] [blame]
Elly Jonescd7a9042011-07-22 13:56:51 -04001/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Will Drewry32ac9f52011-08-18 21:36:27 -05008#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -04009#include <errno.h>
10#include <grp.h>
11#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050012#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <linux/capability.h>
14#include <linux/securebits.h>
15#include <pwd.h>
16#include <sched.h>
17#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050018#include <stdarg.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040019#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <syscall.h>
23#include <sys/capability.h>
24#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050025#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040026#include <sys/prctl.h>
27#include <sys/wait.h>
28#include <syslog.h>
29#include <unistd.h>
30
31#include "libminijail.h"
Will Drewry32ac9f52011-08-18 21:36:27 -050032#include "libsyscalls.h"
Elly Jonescd7a9042011-07-22 13:56:51 -040033#include "libminijail-private.h"
34
/* Fallback values for the seccomp-filter prctl(2) interface. They are only
 * defined here when the headers included above did not already provide
 * PR_SET_SECCOMP_FILTER. */
#ifndef PR_SET_SECCOMP_FILTER
/* Filter type selectors. */
#define PR_SECCOMP_FILTER_SYSCALL 0
#define PR_SECCOMP_FILTER_EVENT   1
/* prctl option numbers. */
#define PR_GET_SECCOMP_FILTER     35
#define PR_SET_SECCOMP_FILTER     36
#define PR_CLEAR_SECCOMP_FILTER   37
#endif
43
/* die(fmt, ...): logs a printf-style message to syslog at LOG_ERR, prefixed
 * with "libminijail: ", and then abort()s the process. |_msg| must be a
 * string literal because it is concatenated with the prefix. */
#define die(_msg, ...) \
  do { \
    syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__); \
    abort(); \
  } while (0)

/* pdie(fmt, ...): like die(), but appends ": <strerror(errno)>" to the
 * message. */
#define pdie(_msg, ...) \
  die(_msg ": %s", ## __VA_ARGS__, strerror(errno))

/* warn(fmt, ...): logs a printf-style message to syslog at LOG_WARNING with
 * the same prefix; does not terminate the process. */
#define warn(_msg, ...) \
  syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
Elly Jonescd7a9042011-07-22 13:56:51 -040054
/* One seccomp filter rule: a syscall number and its filter expression.
 * Rules are chained through |next| and |prev|. */
struct seccomp_filter {
  int nr;                      /* syscall number the rule applies to */
  char *filter;                /* filter string for that syscall */
  struct seccomp_filter *next; /* following rule in the list */
  struct seccomp_filter *prev; /* preceding rule in the list */
};
60
/* Jail configuration: which restrictions to apply (|flags|) and the values
 * they need. */
struct minijail {
  /* One bit per requested restriction. The fields are unsigned: a signed
   * 1-bit field can only hold 0 and -1, so storing 1 would read back as -1
   * and any comparison against 1 would fail. */
  struct {
    unsigned int uid : 1;            /* change uid to |uid| */
    unsigned int gid : 1;            /* change gid to |gid| */
    unsigned int caps : 1;           /* restrict capabilities to |caps| */
    unsigned int vfs : 1;            /* enter a new VFS (mount) namespace */
    unsigned int pids : 1;           /* enter a new pid namespace */
    unsigned int seccomp : 1;        /* enable seccomp */
    unsigned int readonly : 1;       /* remount /proc read-only */
    unsigned int usergroups : 1;     /* inherit |user|'s supplementary groups */
    unsigned int ptrace : 1;         /* disable ptrace */
    unsigned int seccomp_filter : 1; /* install the rules in |filters| */
  } flags;
  uid_t uid;
  gid_t gid;
  gid_t usergid;                  /* primary gid of |user| */
  char *user;                     /* heap-allocated user name, or NULL */
  uint64_t caps;                  /* capability bitmask, bit N = capability N */
  pid_t initpid;                  /* pid of the child created by minijail_run() */
  int filter_count;               /* number of entries in |filters| */
  struct seccomp_filter *filters; /* circular list of rules, or NULL */
};
83
Elly Jonescd7a9042011-07-22 13:56:51 -040084struct minijail *minijail_new(void) {
85 struct minijail *j = malloc(sizeof(*j));
86 if (j)
87 memset(j, 0, sizeof(*j));
88 return j;
89}
90
91void minijail_change_uid(struct minijail *j, uid_t uid) {
92 if (uid == 0)
93 die("useless change to uid 0");
94 j->uid = uid;
95 j->flags.uid = 1;
96}
97
98void minijail_change_gid(struct minijail *j, gid_t gid) {
99 if (gid == 0)
100 die("useless change to gid 0");
101 j->gid = gid;
102 j->flags.gid = 1;
103}
104
105int minijail_change_user(struct minijail *j, const char *user) {
Elly Joneseb300c52011-09-22 14:35:43 -0400106 char *buf = NULL;
107 struct passwd pw;
108 struct passwd *ppw = NULL;
109 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
110 if (sz == -1)
111 sz = 65536; /* your guess is as good as mine... */
112
113 /* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return the
114 * maximum needed size of the buffer, so we don't have to search. */
115 buf = malloc(sz);
116 if (!buf)
117 return -ENOMEM;
118 getpwnam_r(user, &pw, buf, sz, &ppw);
119 free(buf);
120 if (!ppw)
Elly Jonescd7a9042011-07-22 13:56:51 -0400121 return errno;
Elly Joneseb300c52011-09-22 14:35:43 -0400122 minijail_change_uid(j, ppw->pw_uid);
Will Drewry2ddaad02011-09-16 11:36:08 -0500123 j->user = strdup(user);
124 if (!j->user)
125 return -ENOMEM;
Elly Joneseb300c52011-09-22 14:35:43 -0400126 j->usergid = ppw->pw_gid;
Elly Jonescd7a9042011-07-22 13:56:51 -0400127 return 0;
128}
129
/* Looks up |group| in the group database and requests that the jail switch to
 * that group's gid.
 *
 * Returns 0 on success, or a negative errno value: -ENOMEM on allocation
 * failure, -ENOENT when the group does not exist, or the negated error
 * reported by getgrnam_r(). A group that resolves to gid 0 aborts the process
 * (see minijail_change_gid()). */
int minijail_change_group(struct minijail *j, const char *group) {
  struct group gr;
  struct group *pgr = NULL;
  char *buf;
  gid_t gid = 0;
  int err;
  long sz = sysconf(_SC_GETGR_R_SIZE_MAX);
  if (sz <= 0)
    sz = 65536; /* no limit reported; pick a generous buffer size */

  buf = malloc(sz);
  if (!buf)
    return -ENOMEM;
  /* getgrnam_r() reports failure through its return value, not errno, and
   * returns 0 with a NULL result when the group simply does not exist. */
  err = getgrnam_r(group, &gr, buf, sz, &pgr);
  if (pgr)
    gid = pgr->gr_gid;
  free(buf);
  if (!pgr)
    return err ? -err : -ENOENT;

  minijail_change_gid(j, gid);
  return 0;
}
150
151void minijail_use_seccomp(struct minijail *j) {
152 j->flags.seccomp = 1;
153}
154
Will Drewry32ac9f52011-08-18 21:36:27 -0500155void minijail_use_seccomp_filter(struct minijail *j) {
156 j->flags.seccomp_filter = 1;
157}
158
Elly Jonescd7a9042011-07-22 13:56:51 -0400159void minijail_use_caps(struct minijail *j, uint64_t capmask) {
160 j->caps = capmask;
161 j->flags.caps = 1;
162}
163
164void minijail_namespace_vfs(struct minijail *j) {
165 j->flags.vfs = 1;
166}
167
168void minijail_namespace_pids(struct minijail *j) {
169 j->flags.pids = 1;
170}
171
172void minijail_remount_readonly(struct minijail *j) {
173 j->flags.vfs = 1;
174 j->flags.readonly = 1;
175}
176
177void minijail_inherit_usergroups(struct minijail *j) {
178 j->flags.usergroups = 1;
179}
180
181void minijail_disable_ptrace(struct minijail *j) {
182 j->flags.ptrace = 1;
183}
184
Will Drewry32ac9f52011-08-18 21:36:27 -0500185int minijail_add_seccomp_filter(struct minijail *j, int nr,
186 const char *filter) {
187 struct seccomp_filter *sf;
188 if (!filter || nr < 0)
189 return -EINVAL;
190
191 sf = malloc(sizeof(*sf));
192 if (!sf)
193 return -ENOMEM;
194 sf->nr = nr;
195 sf->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
196 if (!sf->filter) {
197 free(sf);
198 return -ENOMEM;
199 }
200
Will Drewryf89aef52011-09-16 16:48:57 -0500201 j->filter_count++;
202
Will Drewry32ac9f52011-08-18 21:36:27 -0500203 if (!j->filters) {
204 j->filters = sf;
205 sf->next = sf;
206 sf->prev = sf;
207 return 0;
208 }
209 sf->next = j->filters;
210 sf->prev = j->filters->prev;
211 sf->prev->next = sf;
212 j->filters->prev = sf;
213 return 0;
214}
215
216int minijail_lookup_syscall(const char *name) {
217 const struct syscall_entry *entry = syscall_table;
218 for (; entry->name && entry->nr >= 0; ++entry)
219 if (!strcmp(entry->name, name))
220 return entry->nr;
221 return -1;
222}
223
/* Trims |s| in place: skips leading blanks (space/tab) and cuts trailing
 * blanks and newlines by writing a terminating NUL. Returns a pointer to the
 * first kept character, which lies inside |s|.
 *
 * The trailing scan never moves before the returned pointer, so an empty or
 * all-whitespace string is handled without touching memory in front of it. */
static char *strip(char *s) {
  char *end;
  /* <ctype.h> functions require an unsigned char value. */
  while (*s && isblank((unsigned char) *s))
    s++;
  end = s + strlen(s);
  while (end > s && (isblank((unsigned char) end[-1]) || end[-1] == '\n'))
    end--;
  *end = '\0';
  return s;
}
234
235void minijail_parse_seccomp_filters(struct minijail *j, const char *path) {
236 FILE *file = fopen(path, "r");
237 char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
238 int count = 1;
239 if (!file)
240 pdie("failed to open seccomp filters file");
241
242 /* Format is simple:
243 * syscall_name<COLON><FILTER STRING>[\n|EOF]
244 * #...comment...
245 * <empty line?
246 */
247 while (fgets(line, sizeof(line), file)) {
248 char *filter = line;
249 char *name = strsep(&filter, ":");
250 char *name_end = NULL;
251 int nr = -1;
252
253 if (!name)
254 die("invalid filter on line %d", count);
255
256 name = strip(name);
257
258 if (!filter) {
259 if (strlen(name))
260 die("invalid filter on line %d", count);
261 /* Allow empty lines */
262 continue;
263 }
264
265 /* Allow comment lines */
266 if (*name == '#')
267 continue;
268
269 filter = strip(filter);
270
271 /* Take direct syscall numbers */
272 nr = strtol(name, &name_end, 0);
273 /* Or fail-over to using names */
274 if (*name_end != '\0')
275 nr = minijail_lookup_syscall(name);
276 if (nr < 0)
277 die("syscall '%s' unknown", name);
278
279 if (minijail_add_seccomp_filter(j, nr, filter))
280 pdie("failed to add filter for syscall '%s'", name);
281 }
282 fclose(file);
283}
284
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
  size_t available; /* bytes still free at |buf| */
  size_t total;     /* bytes requested so far, whether or not they fit */
  char *buf;        /* next write position */
};

/* Points |state| at |buf|, which has room for |available| bytes, and resets
 * the running total. */
static void marshal_state_init(struct marshal_state *state,
                               char *buf,
                               size_t available) {
  state->total = 0;
  state->buf = buf;
  state->available = available;
}

/* Appends |length| bytes from |src|. Only as many bytes as still fit are
 * copied, but |total| always grows by the full |length|, so afterwards it
 * holds the size a complete serialization needs. */
static void marshal_append(struct marshal_state *state,
                           char *src,
                           size_t length) {
  size_t fits = length;
  if (state->available < length)
    fits = state->available;

  if (fits != 0) {
    memcpy(state->buf, src, fits);
    state->available -= fits;
    state->buf += fits;
  }
  state->total += length;
}
313
314static void minijail_marshal_helper(struct marshal_state *state,
315 const struct minijail *j) {
316 marshal_append(state, (char *) j, sizeof(*j));
Will Drewry2ddaad02011-09-16 11:36:08 -0500317 if (j->user)
Will Drewryf89aef52011-09-16 16:48:57 -0500318 marshal_append(state, j->user, strlen(j->user) + 1);
319 if (j->flags.seccomp_filter && j->filters) {
320 struct seccomp_filter *f = j->filters;
321 do {
322 marshal_append(state, (char *) &f->nr, sizeof(f->nr));
323 marshal_append(state, f->filter, strlen(f->filter) + 1);
324 f = f->next;
325 } while (f != j->filters);
326 }
327}
328
329size_t minijail_size(const struct minijail *j) {
330 struct marshal_state state;
331 marshal_state_init(&state, NULL, 0);
332 minijail_marshal_helper(&state, j);
333 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500334}
335
Will Drewry2ddaad02011-09-16 11:36:08 -0500336int minijail_marshal(const struct minijail *j, char *buf, size_t available) {
Will Drewryf89aef52011-09-16 16:48:57 -0500337 struct marshal_state state;
338 marshal_state_init(&state, buf, available);
339 minijail_marshal_helper(&state, j);
340 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500341}
342
343int minijail_unmarshal(struct minijail *j, char *serialized, size_t length) {
344 if (length < sizeof(*j))
345 return -EINVAL;
346 memcpy((void *) j, serialized, sizeof(*j));
347 serialized += sizeof(*j);
348 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500349
Will Drewry2ddaad02011-09-16 11:36:08 -0500350 if (j->user) { /* stale pointer */
351 if (!length)
352 return -EINVAL;
353 j->user = strndup(serialized, length);
354 length -= strlen(j->user) + 1;
Will Drewryf89aef52011-09-16 16:48:57 -0500355 serialized += strlen(j->user) + 1;
356 }
357
358 if (j->flags.seccomp_filter && j->filter_count) {
359 int count = j->filter_count;
360 /* Let add_seccomp_filter recompute the value. */
361 j->filter_count = 0;
362 j->filters = NULL; /* Don't follow the stale pointer. */
363 for ( ; count > 0; --count) {
364 int *nr = (int *) serialized;
365 char *filter;
366 if (length < sizeof(*nr))
367 return -EINVAL;
368 length -= sizeof(*nr);
369 serialized += sizeof(*nr);
370 if (!length)
371 return -EINVAL;
372 filter = serialized;
373 if (minijail_add_seccomp_filter(j, *nr, filter))
374 return -EINVAL;
375 length -= strlen(filter) + 1;
376 serialized += strlen(filter) + 1;
377 }
Will Drewry2ddaad02011-09-16 11:36:08 -0500378 }
379 return 0;
380}
381
Will Drewryfe4a3722011-09-16 14:50:50 -0500382void minijail_preenter(struct minijail *j) {
383 /* Strip out options which are minijail_run() only. */
384 j->flags.vfs = 0;
385 j->flags.readonly = 0;
386 j->flags.pids = 0;
387}
388
389void minijail_preexec(struct minijail *j) {
390 int vfs = j->flags.vfs;
391 int readonly = j->flags.readonly;
Will Drewry2ddaad02011-09-16 11:36:08 -0500392 if (j->user)
393 free(j->user);
394 j->user = NULL;
Will Drewryfe4a3722011-09-16 14:50:50 -0500395 memset(&j->flags, 0, sizeof(j->flags));
396 /* Now restore anything we meant to keep. */
397 j->flags.vfs = vfs;
398 j->flags.readonly = readonly;
399 /* Note, pidns will already have been used before this call. */
Will Drewry2ddaad02011-09-16 11:36:08 -0500400}
401
/* Replaces the /proc mount with a fresh read-only one.
 * Returns 0 on success, or the errno of the failing umount()/mount(). */
static int remount_readonly(void) {
  static const char proc_path[] = "/proc";
  const unsigned long mount_flags =
      MS_RDONLY | MS_NODEV | MS_NOEXEC | MS_NOSUID;

  /* Right now, we're holding a reference to our parent's old mount of /proc
   * in our namespace, which means using MS_REMOUNT here would mutate our
   * parent's mount as well, even though we're in a VFS namespace (!).
   * Instead, remove their mount from our namespace and make our own. */
  if (umount(proc_path) != 0)
    return errno;
  if (mount("", proc_path, "proc", mount_flags, "") != 0)
    return errno;
  return 0;
}
415
416static void drop_caps(const struct minijail *j) {
417 cap_t caps = cap_get_proc();
418 cap_value_t raise_flag[1];
419 unsigned int i;
420 if (!caps)
421 die("can't get process caps");
422 if (cap_clear_flag(caps, CAP_INHERITABLE))
423 die("can't clear inheritable caps");
424 if (cap_clear_flag(caps, CAP_EFFECTIVE))
425 die("can't clear effective caps");
426 if (cap_clear_flag(caps, CAP_PERMITTED))
427 die("can't clear permitted caps");
428 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
429 if (i != CAP_SETPCAP && !(j->caps & (1 << i)))
430 continue;
431 raise_flag[0] = i;
432 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
433 die("can't add effective cap");
434 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
435 die("can't add permitted cap");
436 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
437 die("can't add inheritable cap");
438 }
439 if (cap_set_proc(caps))
440 die("can't apply cleaned capset");
441 cap_free(caps);
442 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
443 if (j->caps & (1 << i))
444 continue;
445 if (prctl(PR_CAPBSET_DROP, i))
446 pdie("prctl(PR_CAPBSET_DROP)");
447 }
448}
449
Will Drewry32ac9f52011-08-18 21:36:27 -0500450static int setup_seccomp_filters(const struct minijail *j) {
451 const struct seccomp_filter *sf = j->filters;
452 int ret = 0;
453 int broaden = 0;
454
455 /* No filters installed isn't necessarily an error. */
456 if (!sf)
457 return ret;
458
459 do {
460 errno = 0;
461 ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
462 sf->nr, broaden ? "1" : sf->filter);
463 if (ret) {
464 switch (errno) {
465 case ENOSYS:
466 /* TODO(wad) make this a config option */
467 if (broaden)
468 die("CONFIG_SECCOMP_FILTER is not supported by your kernel");
469 warn("missing CONFIG_FTRACE_SYSCALLS; relaxing the filter for %d",
470 sf->nr);
471 broaden = 1;
472 continue;
473 case E2BIG:
474 warn("seccomp filter too long: %d", sf->nr);
475 pdie("filter too long");
476 case ENOSPC:
477 pdie("too many seccomp filters");
478 case EPERM:
479 warn("syscall filter disallowed for %d", sf->nr);
480 pdie("failed to install seccomp filter");
481 case EINVAL:
482 warn("seccomp filter or call method is invalid. %d:'%s'",
483 sf->nr, sf->filter);
484 default:
485 pdie("failed to install seccomp filter");
486 }
487 }
488 sf = sf->next;
489 broaden = 0;
490 } while (sf != j->filters);
491 return ret;
492}
493
Elly Jonescd7a9042011-07-22 13:56:51 -0400494void minijail_enter(const struct minijail *j) {
495 if (j->flags.pids)
496 die("tried to enter a pid-namespaced jail; try minijail_run()?");
497
Will Drewryf89aef52011-09-16 16:48:57 -0500498 if (j->flags.seccomp_filter && setup_seccomp_filters(j))
499 pdie("failed to configure seccomp filters");
Will Drewry32ac9f52011-08-18 21:36:27 -0500500
Elly Jonescd7a9042011-07-22 13:56:51 -0400501 if (j->flags.usergroups && !j->user)
502 die("usergroup inheritance without username");
503
504 /* We can't recover from failures if we've dropped privileges partially,
505 * so we don't even try. If any of our operations fail, we abort() the
506 * entire process. */
507 if (j->flags.vfs && unshare(CLONE_NEWNS))
508 pdie("unshare");
509
510 if (j->flags.readonly && remount_readonly())
511 pdie("remount");
512
513 if (j->flags.caps) {
514 /* POSIX capabilities are a bit tricky. If we drop our capability to change
515 * uids, our attempt to use setuid() below will fail. Hang on to root caps
516 * across setuid(), then lock securebits. */
517 if (prctl(PR_SET_KEEPCAPS, 1))
518 pdie("prctl(PR_SET_KEEPCAPS)");
519 if (prctl(PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
520 pdie("prctl(PR_SET_SECUREBITS)");
521 }
522
Will Drewryc6c86432011-09-18 14:37:22 -0500523 if (j->flags.usergroups) {
524 if (initgroups(j->user, j->usergid))
525 pdie("initgroups");
526 } else {
527 /* Only attempt to clear supplemental groups if we are changing users. */
528 if ((j->uid || j->gid) && setgroups(0, NULL))
529 pdie("setgroups");
Will Drewry32ac9f52011-08-18 21:36:27 -0500530 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400531
532 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
533 pdie("setresgid");
534
535 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
536 pdie("setresuid");
537
538 if (j->flags.caps)
539 drop_caps(j);
540
541 /* seccomp has to come last since it cuts off all the other
542 * privilege-dropping syscalls :) */
Will Drewry32ac9f52011-08-18 21:36:27 -0500543 if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
544 pdie("prctl(PR_SET_SECCOMP, 13)");
545
Elly Jonescd7a9042011-07-22 13:56:51 -0400546 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
547 pdie("prctl(PR_SET_SECCOMP)");
548}
549
/* Wait status recorded by init() for the root process; 0 until then. */
static int init_exitstatus = 0;

/* SIGTERM handler installed by init(): exits at once, passing the recorded
 * status to _exit(). */
static void init_term(int sig) {
  (void) sig; /* unused */
  _exit(init_exitstatus);
}
555
556static int init(pid_t rootpid) {
557 pid_t pid;
558 int status;
559 signal(SIGTERM, init_term); /* so that we exit with the right status */
Will Drewryf89aef52011-09-16 16:48:57 -0500560 /* TODO(wad) self jail with seccomp_filters here. */
Elly Jonescd7a9042011-07-22 13:56:51 -0400561 while ((pid = wait(&status)) > 0) {
562 /* This loop will only end when either there are no processes left inside
563 * our pid namespace or we get a signal. */
564 if (pid == rootpid)
565 init_exitstatus = status;
566 }
567 if (!WIFEXITED(init_exitstatus))
568 _exit(MINIJAIL_ERR_INIT);
569 _exit(WEXITSTATUS(init_exitstatus));
570}
571
/* Reads exactly |len| bytes from |fd| into |buf|, retrying after EINTR and
 * after short reads. Returns 0 on success, -1 on error or early EOF. */
static int read_exactly(int fd, void *buf, size_t len) {
  char *pos = buf;
  while (len) {
    ssize_t got = read(fd, pos, len);
    if (got < 0) {
      if (errno == EINTR)
        continue;
      return -1;
    }
    if (got == 0)
      return -1; /* EOF before the full amount arrived */
    pos += got;
    len -= (size_t) got;
  }
  return 0;
}

/* Reads a jail sent by minijail_to_fd() from |fd| -- a size_t length followed
 * by that many bytes -- and unmarshals it into |j|.
 *
 * Returns 0 on success, -EINVAL if the data cannot be read completely or the
 * length is zero, -E2BIG if the announced length exceeds USHRT_MAX, -ENOMEM on
 * allocation failure, or the error from minijail_unmarshal(). */
int minijail_from_fd(int fd, struct minijail *j) {
  size_t sz = 0;
  char *buf;
  int r;

  if (read_exactly(fd, &sz, sizeof(sz)))
    return -EINVAL;
  if (!sz)
    return -EINVAL;
  if (sz > USHRT_MAX) /* Arbitrary sanity check */
    return -E2BIG;
  buf = malloc(sz);
  if (!buf)
    return -ENOMEM;
  /* A pipe may deliver a payload of this size in several pieces. */
  if (read_exactly(fd, buf, sz)) {
    free(buf);
    return -EINVAL;
  }
  r = minijail_unmarshal(j, buf, sz);
  free(buf);
  return r;
}
593
/* Writes exactly |len| bytes from |buf| to |fd|, retrying after EINTR and
 * after short writes. Returns 0 on success, -1 on error. */
static int write_all(int fd, const void *buf, size_t len) {
  const char *pos = buf;
  while (len) {
    ssize_t put = write(fd, pos, len);
    if (put < 0) {
      if (errno == EINTR)
        continue;
      return -1;
    }
    pos += put;
    len -= (size_t) put;
  }
  return 0;
}

/* Marshals |j| and sends it over |fd| as [size][minijail], the format
 * minijail_from_fd() reads.
 *
 * Returns 0 on success, -EINVAL if the jail has no serialized size, -ENOMEM
 * on allocation failure, the non-zero result of minijail_marshal() if
 * marshalling fails, and -EFAULT if writing to |fd| fails. */
int minijail_to_fd(struct minijail *j, int fd) {
  char *buf;
  size_t sz = minijail_size(j);
  int r;

  if (!sz)
    return -EINVAL;
  buf = malloc(sz);
  if (!buf)
    return -ENOMEM;
  r = minijail_marshal(j, buf, sz);
  if (r)
    goto out;
  /* Sends [size][minijail]. */
  if (write_all(fd, &sz, sizeof(sz)) || write_all(fd, buf, sz))
    r = -EFAULT;
out:
  free(buf);
  return r;
}
Elly Jonescd7a9042011-07-22 13:56:51 -0400621
Will Drewry2f54b6a2011-09-16 13:45:31 -0500622static int setup_preload(void) {
623 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
624 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
625 if (!newenv)
Elly Jonescd7a9042011-07-22 13:56:51 -0400626 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -0400627
628 /* Only insert a separating space if we have something to separate... */
629 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", PRELOADPATH);
630
631 /* setenv() makes a copy of the string we give it */
Ben Chan541c7e52011-08-26 14:55:53 -0700632 setenv(kLdPreloadEnvVar, newenv, 1);
Elly Jonescd7a9042011-07-22 13:56:51 -0400633 free(newenv);
Elly Jonescd7a9042011-07-22 13:56:51 -0400634 return 0;
635}
636
Will Drewryf89aef52011-09-16 16:48:57 -0500637static int setup_pipe(int fds[2]) {
638 int r = pipe(fds);
639 char fd_buf[11];
640 if (r)
641 return r;
642 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
643 if (r <= 0)
644 return -EINVAL;
645 setenv(kFdEnvVar, fd_buf, 1);
646 return 0;
647}
648
Elly Jonescd7a9042011-07-22 13:56:51 -0400649int minijail_run(struct minijail *j, const char *filename, char *const argv[]) {
650 unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
Ben Chan541c7e52011-08-26 14:55:53 -0700651 char *oldenv, *oldenv_copy = NULL;
Will Drewryf89aef52011-09-16 16:48:57 -0500652 pid_t child_pid;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500653 int pipe_fds[2];
Will Drewryf89aef52011-09-16 16:48:57 -0500654 int ret;
Ben Chan541c7e52011-08-26 14:55:53 -0700655
656 oldenv = getenv(kLdPreloadEnvVar);
657 if (oldenv) {
658 oldenv_copy = strdup(oldenv);
659 if (!oldenv_copy)
660 return -ENOMEM;
661 }
Will Drewryf89aef52011-09-16 16:48:57 -0500662
663 if (setup_preload())
664 return -EFAULT;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500665
666 /* Before we fork(2) and execve(2) the child process, we need to open
667 * a pipe(2) to send the minijail configuration over.
668 */
Will Drewryf89aef52011-09-16 16:48:57 -0500669 if (setup_pipe(pipe_fds))
670 return -EFAULT;
Elly Jonescd7a9042011-07-22 13:56:51 -0400671
Will Drewryf89aef52011-09-16 16:48:57 -0500672 child_pid = syscall(SYS_clone, pidns | SIGCHLD, NULL);
673 if (child_pid < 0) {
674 free(oldenv_copy);
675 return child_pid;
676 }
677
678 if (child_pid) {
679 /* Restore parent's LD_PRELOAD. */
Ben Chan541c7e52011-08-26 14:55:53 -0700680 if (oldenv_copy) {
681 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
682 free(oldenv_copy);
683 } else {
684 unsetenv(kLdPreloadEnvVar);
685 }
Will Drewry2f54b6a2011-09-16 13:45:31 -0500686 unsetenv(kFdEnvVar);
Will Drewryf89aef52011-09-16 16:48:57 -0500687 j->initpid = child_pid;
688 close(pipe_fds[0]); /* read endpoint */
689 ret = minijail_to_fd(j, pipe_fds[1]);
690 close(pipe_fds[1]); /* write endpoint */
691 if (ret) {
Will Drewry2f54b6a2011-09-16 13:45:31 -0500692 kill(j->initpid, SIGKILL);
693 die("failed to send marshalled minijail");
694 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400695 return 0;
696 }
Ben Chan541c7e52011-08-26 14:55:53 -0700697 free(oldenv_copy);
698
Will Drewryfe4a3722011-09-16 14:50:50 -0500699 /* Drop everything that cannot be inherited across execve. */
700 minijail_preexec(j);
Elly Jonescd7a9042011-07-22 13:56:51 -0400701 /* Jail this process and its descendants... */
702 minijail_enter(j);
703
704 if (pidns) {
705 /* pid namespace: this process will become init inside the new namespace, so
706 * fork off a child to actually run the program (we don't want all programs
707 * we might exec to have to know how to be init). */
Will Drewryf89aef52011-09-16 16:48:57 -0500708 child_pid = fork();
709 if (child_pid < 0)
710 _exit(child_pid);
711 else if (child_pid > 0)
712 init(child_pid); /* never returns */
Elly Jonescd7a9042011-07-22 13:56:51 -0400713 }
714
715 /* If we aren't pid-namespaced:
716 * calling process
717 * -> execve()-ing process
718 * If we are:
719 * calling process
720 * -> init()-ing process
721 * -> execve()-ing process
722 */
723 _exit(execve(filename, argv, environ));
724}
725
726int minijail_kill(struct minijail *j) {
727 int st;
728 if (kill(j->initpid, SIGTERM))
729 return errno;
730 if (waitpid(j->initpid, &st, 0) < 0)
731 return errno;
732 return st;
733}
734
735int minijail_wait(struct minijail *j) {
736 int st;
737 if (waitpid(j->initpid, &st, 0) < 0)
738 return errno;
739 if (!WIFEXITED(st))
740 return MINIJAIL_ERR_JAIL;
741 return WEXITSTATUS(st);
742}
743
744void minijail_destroy(struct minijail *j) {
Will Drewry32ac9f52011-08-18 21:36:27 -0500745 struct seccomp_filter *f = j->filters;
746 /* Unlink the tail and head */
747 if (f)
748 f->prev->next = NULL;
749 while (f) {
750 struct seccomp_filter *next = f->next;
751 free(f->filter);
752 free(f);
753 f = next;
754 }
Will Drewry2ddaad02011-09-16 11:36:08 -0500755 if (j->user)
756 free(j->user);
Elly Jonescd7a9042011-07-22 13:56:51 -0400757 free(j);
758}