blob: 858526561b19add73f0143eb017bdd63cb2d12d9 [file] [log] [blame]
Elly Jonesdd3e8512012-01-23 15:13:38 -05001/*
2 * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Elly Jonescd7a9042011-07-22 13:56:51 -04003 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05004 * found in the LICENSE file.
5 */
Elly Jonescd7a9042011-07-22 13:56:51 -04006
7#define _BSD_SOURCE
8#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07009
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080010#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050011#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040012#include <errno.h>
13#include <grp.h>
14#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050015#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050020#include <stdarg.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080021#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040022#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <syscall.h>
26#include <sys/capability.h>
27#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050028#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040029#include <sys/prctl.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080030#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040031#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040032#include <unistd.h>
33
34#include "libminijail.h"
35#include "libminijail-private.h"
36
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -070037#include "signal.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080038#include "syscall_filter.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070039#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080040
/*
 * Securebits constants: taken from the kernel header when
 * HAVE_SECUREBITS_H is defined, otherwise defined locally.
 */
#if defined(HAVE_SECUREBITS_H)
#include <linux/securebits.h>
#else
#define SECURE_ALL_BITS 0x15
#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif

/* Fallback until this is reliably available in linux/prctl.h. */
#if !defined(PR_SET_SECCOMP)
# define PR_SET_SECCOMP 22
#endif

/* Fallbacks needed for seccomp_filter using BPF. */
#if !defined(PR_SET_NO_NEW_PRIVS)
# define PR_SET_NO_NEW_PRIVS 38
#endif
#if !defined(SECCOMP_MODE_FILTER)
# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
#endif
60
/*
 * One requested bind mount. Entries are chained through |next| into a
 * singly linked list owned by a struct minijail.
 */
struct binding {
	char *src;		/* mount source path (heap copy) */
	char *dest;		/* mount target; must start with '/' */
	int writeable;		/* zero: remount the bind read-only */
	struct binding *next;	/* following entry, or NULL at the tail */
};
67
/*
 * Description of a jail: which restrictions to apply (|flags|) plus the
 * parameters those restrictions need.
 *
 * The structure is marshalled by copying its raw bytes, so the pointer
 * members are only meaningful inside the process that set them; the
 * unmarshalling code treats non-NULL values as "a payload follows".
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are unsigned: whether a plain int bit-field is signed is
	 * implementation-defined, and a signed 1-bit field cannot hold 1.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int pids:1;
		unsigned int net:1;
		unsigned int seccomp:1;
		unsigned int readonly:1;
		unsigned int usergroups:1;
		unsigned int ptrace:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int mount_tmp:1;
	} flags;
	uid_t uid;		/* target uid, valid when flags.uid is set */
	gid_t gid;		/* target gid, valid when flags.gid is set */
	gid_t usergid;		/* primary gid of |user|, used by initgroups() */
	char *user;		/* user name (heap copy), or NULL */
	uint64_t caps;		/* bitmask of capabilities to keep */
	pid_t initpid;		/* not referenced in this part of the file */
	int filter_len;		/* number of instructions in |filter_prog| */
	int binding_count;	/* number of entries in the bindings list */
	char *chrootdir;	/* chroot directory (heap copy), or NULL */
	struct sock_fprog *filter_prog;	/* compiled seccomp BPF program */
	struct binding *bindings_head;	/* first bind mount, or NULL */
	struct binding *bindings_tail;	/* last bind mount, or NULL */
};
103
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700104/*
105 * Strip out flags meant for the parent.
106 * We keep things that are not inherited across execve(2) (e.g. capabilities),
107 * or are easier to set after execve(2) (e.g. seccomp filters).
108 */
109void minijail_preenter(struct minijail *j)
110{
111 j->flags.vfs = 0;
112 j->flags.readonly = 0;
113 j->flags.pids = 0;
114}
115
116/*
117 * Strip out flags meant for the child.
118 * We keep things that are inherited across execve(2).
119 */
120void minijail_preexec(struct minijail *j)
121{
122 int vfs = j->flags.vfs;
123 int readonly = j->flags.readonly;
124 if (j->user)
125 free(j->user);
126 j->user = NULL;
127 memset(&j->flags, 0, sizeof(j->flags));
128 /* Now restore anything we meant to keep. */
129 j->flags.vfs = vfs;
130 j->flags.readonly = readonly;
131 /* Note, |pids| will already have been used before this call. */
132}
133
134/* Minijail API. */
135
Will Drewry6ac91122011-10-21 16:38:58 -0500136struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400137{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400138 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400139}
140
Will Drewry6ac91122011-10-21 16:38:58 -0500141void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400142{
143 if (uid == 0)
144 die("useless change to uid 0");
145 j->uid = uid;
146 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400147}
148
Will Drewry6ac91122011-10-21 16:38:58 -0500149void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400150{
151 if (gid == 0)
152 die("useless change to gid 0");
153 j->gid = gid;
154 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400155}
156
Will Drewry6ac91122011-10-21 16:38:58 -0500157int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400158{
159 char *buf = NULL;
160 struct passwd pw;
161 struct passwd *ppw = NULL;
162 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
163 if (sz == -1)
164 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400165
Elly Jonesdd3e8512012-01-23 15:13:38 -0500166 /*
167 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400168 * the maximum needed size of the buffer, so we don't have to search.
169 */
170 buf = malloc(sz);
171 if (!buf)
172 return -ENOMEM;
173 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500174 /*
175 * We're safe to free the buffer here. The strings inside pw point
176 * inside buf, but we don't use any of them; this leaves the pointers
177 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
178 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400179 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700180 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400181 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700182 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400183 minijail_change_uid(j, ppw->pw_uid);
184 j->user = strdup(user);
185 if (!j->user)
186 return -ENOMEM;
187 j->usergid = ppw->pw_gid;
188 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400189}
190
Will Drewry6ac91122011-10-21 16:38:58 -0500191int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400192{
193 char *buf = NULL;
194 struct group gr;
195 struct group *pgr = NULL;
196 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
197 if (sz == -1)
198 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400199
Elly Jonesdd3e8512012-01-23 15:13:38 -0500200 /*
201 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400202 * the maximum needed size of the buffer, so we don't have to search.
203 */
204 buf = malloc(sz);
205 if (!buf)
206 return -ENOMEM;
207 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500208 /*
209 * We're safe to free the buffer here. The strings inside gr point
210 * inside buf, but we don't use any of them; this leaves the pointers
211 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
212 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400213 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700214 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400215 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700216 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400217 minijail_change_gid(j, pgr->gr_gid);
218 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400219}
220
Will Drewry6ac91122011-10-21 16:38:58 -0500221void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400222{
223 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400224}
225
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700226void API minijail_no_new_privs(struct minijail *j)
227{
228 j->flags.no_new_privs = 1;
229}
230
Will Drewry6ac91122011-10-21 16:38:58 -0500231void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400232{
233 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500234}
235
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700236void API minijail_log_seccomp_filter_failures(struct minijail *j)
237{
238 j->flags.log_seccomp_filter = 1;
239}
240
Will Drewry6ac91122011-10-21 16:38:58 -0500241void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400242{
243 j->caps = capmask;
244 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400245}
246
Will Drewry6ac91122011-10-21 16:38:58 -0500247void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400248{
249 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400250}
251
Will Drewry6ac91122011-10-21 16:38:58 -0500252void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400253{
Elly Jonese58176c2012-01-23 11:46:17 -0500254 j->flags.vfs = 1;
255 j->flags.readonly = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400256 j->flags.pids = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400257}
258
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400259void API minijail_namespace_net(struct minijail *j)
260{
261 j->flags.net = 1;
262}
263
Will Drewry6ac91122011-10-21 16:38:58 -0500264void API minijail_remount_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400265{
266 j->flags.vfs = 1;
267 j->flags.readonly = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400268}
269
Will Drewry6ac91122011-10-21 16:38:58 -0500270void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400271{
272 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400273}
274
Will Drewry6ac91122011-10-21 16:38:58 -0500275void API minijail_disable_ptrace(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400276{
277 j->flags.ptrace = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400278}
279
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700280int API minijail_enter_chroot(struct minijail *j, const char *dir)
281{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400282 if (j->chrootdir)
283 return -EINVAL;
284 j->chrootdir = strdup(dir);
285 if (!j->chrootdir)
286 return -ENOMEM;
287 j->flags.chroot = 1;
288 return 0;
289}
290
Lee Campbell11af0622014-05-22 12:36:04 -0700291void API minijail_mount_tmp(struct minijail *j)
292{
293 j->flags.mount_tmp = 1;
294}
295
Will Drewry6ac91122011-10-21 16:38:58 -0500296int API minijail_bind(struct minijail *j, const char *src, const char *dest,
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700297 int writeable)
298{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400299 struct binding *b;
300
301 if (*dest != '/')
302 return -EINVAL;
303 b = calloc(1, sizeof(*b));
304 if (!b)
305 return -ENOMEM;
306 b->dest = strdup(dest);
307 if (!b->dest)
308 goto error;
309 b->src = strdup(src);
310 if (!b->src)
311 goto error;
312 b->writeable = writeable;
313
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700314 info("bind %s -> %s", src, dest);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400315
Elly Jonesdd3e8512012-01-23 15:13:38 -0500316 /*
317 * Force vfs namespacing so the bind mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400318 * containing vfs namespace.
319 */
320 minijail_namespace_vfs(j);
321
322 if (j->bindings_tail)
323 j->bindings_tail->next = b;
324 else
325 j->bindings_head = b;
326 j->bindings_tail = b;
327 j->binding_count++;
328
329 return 0;
330
331error:
332 free(b->src);
333 free(b->dest);
334 free(b);
335 return -ENOMEM;
336}
337
Will Drewry6ac91122011-10-21 16:38:58 -0500338void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
Elly Jonese1749eb2011-10-07 13:54:59 -0400339{
340 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800341 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700342 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400343 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800344
345 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700346 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
347 die("failed to compile seccomp filter BPF program in '%s'",
348 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800349 }
350
351 j->filter_len = fprog->len;
352 j->filter_prog = fprog;
353
Elly Jonese1749eb2011-10-07 13:54:59 -0400354 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500355}
356
/* Cursor over an output buffer used while serialising a jail. */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};
362
Will Drewry6ac91122011-10-21 16:38:58 -0500363void marshal_state_init(struct marshal_state *state,
364 char *buf, size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400365{
366 state->available = available;
367 state->buf = buf;
368 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500369}
370
Will Drewry6ac91122011-10-21 16:38:58 -0500371void marshal_append(struct marshal_state *state,
372 char *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400373{
374 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500375
Elly Jonese1749eb2011-10-07 13:54:59 -0400376 /* Up to |available| will be written. */
377 if (copy_len) {
378 memcpy(state->buf, src, copy_len);
379 state->buf += copy_len;
380 state->available -= copy_len;
381 }
382 /* |total| will contain the expected length. */
383 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500384}
385
Will Drewry6ac91122011-10-21 16:38:58 -0500386void minijail_marshal_helper(struct marshal_state *state,
387 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400388{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400389 struct binding *b = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400390 marshal_append(state, (char *)j, sizeof(*j));
391 if (j->user)
392 marshal_append(state, j->user, strlen(j->user) + 1);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400393 if (j->chrootdir)
394 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800395 if (j->flags.seccomp_filter && j->filter_prog) {
396 struct sock_fprog *fp = j->filter_prog;
397 marshal_append(state, (char *)fp->filter,
398 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400399 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400400 for (b = j->bindings_head; b; b = b->next) {
401 marshal_append(state, b->src, strlen(b->src) + 1);
402 marshal_append(state, b->dest, strlen(b->dest) + 1);
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700403 marshal_append(state, (char *)&b->writeable,
404 sizeof(b->writeable));
Elly Jones51a5b6c2011-10-12 19:09:26 -0400405 }
Will Drewryf89aef52011-09-16 16:48:57 -0500406}
407
Will Drewry6ac91122011-10-21 16:38:58 -0500408size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400409{
410 struct marshal_state state;
411 marshal_state_init(&state, NULL, 0);
412 minijail_marshal_helper(&state, j);
413 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500414}
415
Elly Jonese1749eb2011-10-07 13:54:59 -0400416int minijail_marshal(const struct minijail *j, char *buf, size_t available)
417{
418 struct marshal_state state;
419 marshal_state_init(&state, buf, available);
420 minijail_marshal_helper(&state, j);
421 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500422}
423
/* consumebytes: consumes @length bytes from a buffer @buf of length @buflength
 * @length    Number of bytes to consume
 * @buf       Buffer to consume from; advanced past the consumed bytes
 * @buflength Size of @buf; reduced by @length
 *
 * Returns a pointer to the base of the bytes, or NULL (leaving @buf and
 * @buflength untouched) if fewer than @length bytes remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *base;

	if (*buflength < length)
		return NULL;

	base = *buf;
	*buf = base + length;
	*buflength -= length;
	return base;
}
440
/* consumestr: consumes a C string from a buffer @buf of length @buflength
 * @buf       Buffer to consume; advanced past the string and its NUL
 * @buflength Length of buffer; reduced by the bytes consumed
 *
 * Returns a pointer to the base of the string, or NULL if no
 * NUL terminator is found within the buffer.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const size_t limit = *buflength;
	const size_t len = strnlen(*buf, limit);

	/* strnlen() never exceeds |limit|; below it means a NUL was found. */
	if (len < limit)
		return consumebytes(len + 1, buf, buflength);

	/* There's no null-terminator */
	return NULL;
}
455
Elly Jonese1749eb2011-10-07 13:54:59 -0400456int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
457{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400458 int i;
459 int count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500460 int ret = -EINVAL;
461
Elly Jonese1749eb2011-10-07 13:54:59 -0400462 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500463 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400464 memcpy((void *)j, serialized, sizeof(*j));
465 serialized += sizeof(*j);
466 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500467
Will Drewrybee7ba72011-10-21 20:47:01 -0500468 /* Potentially stale pointers not used as signals. */
469 j->bindings_head = NULL;
470 j->bindings_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800471 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500472
Elly Jonese1749eb2011-10-07 13:54:59 -0400473 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400474 char *user = consumestr(&serialized, &length);
475 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500476 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400477 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500478 if (!j->user)
479 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400480 }
Will Drewryf89aef52011-09-16 16:48:57 -0500481
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400482 if (j->chrootdir) { /* stale pointer */
483 char *chrootdir = consumestr(&serialized, &length);
484 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500485 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400486 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500487 if (!j->chrootdir)
488 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400489 }
490
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800491 if (j->flags.seccomp_filter && j->filter_len > 0) {
492 size_t ninstrs = j->filter_len;
493 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
494 ninstrs > USHRT_MAX)
495 goto bad_filters;
496
497 size_t program_len = ninstrs * sizeof(struct sock_filter);
498 void *program = consumebytes(program_len, &serialized, &length);
499 if (!program)
500 goto bad_filters;
501
502 j->filter_prog = malloc(sizeof(struct sock_fprog));
503 j->filter_prog->len = ninstrs;
504 j->filter_prog->filter = malloc(program_len);
505 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400506 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400507
508 count = j->binding_count;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400509 j->binding_count = 0;
510 for (i = 0; i < count; ++i) {
511 int *writeable;
512 const char *dest;
513 const char *src = consumestr(&serialized, &length);
514 if (!src)
Will Drewrybee7ba72011-10-21 20:47:01 -0500515 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400516 dest = consumestr(&serialized, &length);
517 if (!dest)
Will Drewrybee7ba72011-10-21 20:47:01 -0500518 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400519 writeable = consumebytes(sizeof(*writeable), &serialized, &length);
520 if (!writeable)
Will Drewrybee7ba72011-10-21 20:47:01 -0500521 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400522 if (minijail_bind(j, src, dest, *writeable))
Will Drewrybee7ba72011-10-21 20:47:01 -0500523 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400524 }
525
Elly Jonese1749eb2011-10-07 13:54:59 -0400526 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500527
528bad_bindings:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800529 if (j->flags.seccomp_filter && j->filter_len > 0) {
530 free(j->filter_prog->filter);
531 free(j->filter_prog);
532 }
Will Drewrybee7ba72011-10-21 20:47:01 -0500533bad_filters:
534 if (j->chrootdir)
535 free(j->chrootdir);
536bad_chrootdir:
537 if (j->user)
538 free(j->user);
539clear_pointers:
540 j->user = NULL;
541 j->chrootdir = NULL;
542out:
543 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500544}
545
Elly Jones51a5b6c2011-10-12 19:09:26 -0400546/* bind_one: Applies bindings from @b for @j, recursing as needed.
547 * @j Minijail these bindings are for
548 * @b Head of list of bindings
549 *
550 * Returns 0 for success.
551 */
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700552int bind_one(const struct minijail *j, struct binding *b)
553{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400554 int ret = 0;
555 char *dest = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400556 if (ret)
557 return ret;
558 /* dest has a leading "/" */
559 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
560 return -ENOMEM;
Elly Jonesa1059632011-12-15 15:17:07 -0500561 ret = mount(b->src, dest, NULL, MS_BIND, NULL);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400562 if (ret)
563 pdie("bind: %s -> %s", b->src, dest);
Elly Jonesa1059632011-12-15 15:17:07 -0500564 if (!b->writeable) {
565 ret = mount(b->src, dest, NULL,
566 MS_BIND | MS_REMOUNT | MS_RDONLY, NULL);
567 if (ret)
568 pdie("bind ro: %s -> %s", b->src, dest);
569 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400570 free(dest);
571 if (b->next)
572 return bind_one(j, b->next);
573 return ret;
574}
575
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700576int enter_chroot(const struct minijail *j)
577{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400578 int ret;
579 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
580 return ret;
581
582 if (chroot(j->chrootdir))
583 return -errno;
584
585 if (chdir("/"))
586 return -errno;
587
588 return 0;
589}
590
/*
 * Mounts a tmpfs (128M, mode 777) on /tmp.
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	const char *options = "size=128M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, options);
}
595
/*
 * Replaces the /proc mount with a fresh read-only one that is also
 * nodev, noexec and nosuid.
 *
 * Returns 0 on success or -errno from umount(2)/mount(2).
 */
int remount_readonly(void)
{
	const char *proc_path = "/proc";
	const unsigned int mount_flags =
	    MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY;

	/*
	 * We still hold a reference to our parent's mount of /proc, so an
	 * MS_REMOUNT here would mutate the parent's mount as well, even
	 * though we're in a VFS namespace (!). Drop that mount from our
	 * namespace and create our own instead.
	 */
	if (umount(proc_path) != 0)
		return -errno;

	if (mount("", proc_path, "proc", mount_flags, "") != 0)
		return -errno;

	return 0;
}
613
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700614void drop_ugid(const struct minijail *j)
615{
616 if (j->flags.usergroups) {
617 if (initgroups(j->user, j->usergid))
618 pdie("initgroups");
619 } else {
620 /* Only attempt to clear supplemental groups if we are changing
621 * users. */
622 if ((j->uid || j->gid) && setgroups(0, NULL))
623 pdie("setgroups");
624 }
625
626 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
627 pdie("setresgid");
628
629 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
630 pdie("setresuid");
631}
632
/*
 * Reports whether |cap| is a capability number the running kernel knows.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel). So suck up the answer via /proc.
 *
 * The value is read on first use and cached in a static variable. If the
 * file cannot be opened or parsed the process terminates through pdie().
 *
 * Returns non-zero if |cap| is at most the kernel's last capability.
 */
static int run_cap_valid(unsigned int cap)
{
	static unsigned int last_cap;

	if (!last_cap) {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* Without this check fscanf() would be handed a NULL stream. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}

	return cap <= last_cap;
}
654
Will Drewry6ac91122011-10-21 16:38:58 -0500655void drop_caps(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400656{
657 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -0800658 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -0800659 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400660 unsigned int i;
661 if (!caps)
662 die("can't get process caps");
663 if (cap_clear_flag(caps, CAP_INHERITABLE))
664 die("can't clear inheritable caps");
665 if (cap_clear_flag(caps, CAP_EFFECTIVE))
666 die("can't clear effective caps");
667 if (cap_clear_flag(caps, CAP_PERMITTED))
668 die("can't clear permitted caps");
Mike Frysinger3adfef72013-05-09 17:19:08 -0400669 for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
Kees Cook323878a2013-02-05 15:35:24 -0800670 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800671 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -0400672 continue;
Kees Cook323878a2013-02-05 15:35:24 -0800673 flag[0] = i;
674 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400675 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -0800676 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400677 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -0800678 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400679 die("can't add inheritable cap");
680 }
681 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -0800682 die("can't apply initial cleaned capset");
683
684 /*
685 * Instead of dropping bounding set first, do it here in case
686 * the caller had a more permissive bounding set which could
687 * have been used above to raise a capability that wasn't already
688 * present. This requires CAP_SETPCAP, so we raised/kept it above.
689 */
Mike Frysinger3adfef72013-05-09 17:19:08 -0400690 for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
Kees Cooke5609ac2013-02-06 14:12:41 -0800691 if (j->caps & (one << i))
Elly Jonese1749eb2011-10-07 13:54:59 -0400692 continue;
693 if (prctl(PR_CAPBSET_DROP, i))
694 pdie("prctl(PR_CAPBSET_DROP)");
695 }
Kees Cook323878a2013-02-05 15:35:24 -0800696
697 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800698 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -0800699 flag[0] = CAP_SETPCAP;
700 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
701 die("can't clear effective cap");
702 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
703 die("can't clear permitted cap");
704 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
705 die("can't clear inheritable cap");
706 }
707
708 if (cap_set_proc(caps))
709 die("can't apply final cleaned capset");
710
711 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -0400712}
713
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700714void set_seccomp_filter(const struct minijail *j)
715{
716 /*
717 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
718 * in the kernel source tree for an explanation of the parameters.
719 */
720 if (j->flags.no_new_privs) {
721 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
722 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
723 }
724
725 /*
726 * If we're logging seccomp filter failures,
727 * install the SIGSYS handler first.
728 */
729 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
730 if (install_sigsys_handler())
731 pdie("install SIGSYS handler");
732 warn("logging seccomp filter failures");
733 }
734
735 /*
736 * Install the syscall filter.
737 */
738 if (j->flags.seccomp_filter) {
739 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog))
740 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
741 }
742}
743
Will Drewry6ac91122011-10-21 16:38:58 -0500744void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400745{
746 if (j->flags.pids)
747 die("tried to enter a pid-namespaced jail;"
748 "try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -0400749
Elly Jonese1749eb2011-10-07 13:54:59 -0400750 if (j->flags.usergroups && !j->user)
751 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -0400752
Elly Jonesdd3e8512012-01-23 15:13:38 -0500753 /*
754 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -0400755 * so we don't even try. If any of our operations fail, we abort() the
756 * entire process.
757 */
758 if (j->flags.vfs && unshare(CLONE_NEWNS))
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400759 pdie("unshare(vfs)");
760
761 if (j->flags.net && unshare(CLONE_NEWNET))
762 pdie("unshare(net)");
Elly Jonescd7a9042011-07-22 13:56:51 -0400763
Elly Jones51a5b6c2011-10-12 19:09:26 -0400764 if (j->flags.chroot && enter_chroot(j))
765 pdie("chroot");
766
Lee Campbell11af0622014-05-22 12:36:04 -0700767 if (j->flags.chroot && j->flags.mount_tmp && mount_tmp())
768 pdie("mount_tmp");
769
Elly Jonese1749eb2011-10-07 13:54:59 -0400770 if (j->flags.readonly && remount_readonly())
771 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -0400772
Elly Jonese1749eb2011-10-07 13:54:59 -0400773 if (j->flags.caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -0500774 /*
775 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -0400776 * capability to change uids, our attempt to use setuid()
777 * below will fail. Hang on to root caps across setuid(), then
778 * lock securebits.
779 */
780 if (prctl(PR_SET_KEEPCAPS, 1))
781 pdie("prctl(PR_SET_KEEPCAPS)");
782 if (prctl
783 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
784 pdie("prctl(PR_SET_SECUREBITS)");
785 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400786
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700787 /*
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700788 * If we're setting no_new_privs, we can drop privileges
789 * before setting seccomp filter. This way filter policies
790 * don't need to allow privilege-dropping syscalls.
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700791 */
792 if (j->flags.no_new_privs) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700793 drop_ugid(j);
794 if (j->flags.caps)
795 drop_caps(j);
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700796
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700797 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -0400798 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700799 /*
800 * If we're not setting no_new_privs,
801 * we need to set seccomp filter *before* dropping privileges.
802 * WARNING: this means that filter policies *must* allow
803 * setgroups()/setresgid()/setresuid() for dropping root and
804 * capget()/capset()/prctl() for dropping caps.
805 */
806 set_seccomp_filter(j);
807
808 drop_ugid(j);
809 if (j->flags.caps)
810 drop_caps(j);
Elly Jonese1749eb2011-10-07 13:54:59 -0400811 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400812
Elly Jonesdd3e8512012-01-23 15:13:38 -0500813 /*
814 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -0400815 * privilege-dropping syscalls :)
816 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400817 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
818 pdie("prctl(PR_SET_SECCOMP)");
Elly Jonescd7a9042011-07-22 13:56:51 -0400819}
820
/* TODO(wad) will visibility affect this variable? */
/*
 * Wait status of the jailed root process, recorded by init() and read from
 * the SIGTERM handler init_term(). Because it is shared with a signal
 * handler it is declared volatile sig_atomic_t.
 */
static volatile sig_atomic_t init_exitstatus = 0;
823
Will Drewry6ac91122011-10-21 16:38:58 -0500824void init_term(int __attribute__ ((unused)) sig)
Elly Jonese1749eb2011-10-07 13:54:59 -0400825{
826 _exit(init_exitstatus);
Elly Jonescd7a9042011-07-22 13:56:51 -0400827}
828
Will Drewry6ac91122011-10-21 16:38:58 -0500829int init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400830{
831 pid_t pid;
832 int status;
833 /* so that we exit with the right status */
834 signal(SIGTERM, init_term);
835 /* TODO(wad) self jail with seccomp_filters here. */
836 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -0500837 /*
838 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -0400839 * left inside our pid namespace or we get a signal.
840 */
841 if (pid == rootpid)
842 init_exitstatus = status;
843 }
844 if (!WIFEXITED(init_exitstatus))
845 _exit(MINIJAIL_ERR_INIT);
846 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -0400847}
848
Will Drewry6ac91122011-10-21 16:38:58 -0500849int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400850{
851 size_t sz = 0;
852 size_t bytes = read(fd, &sz, sizeof(sz));
853 char *buf;
854 int r;
855 if (sizeof(sz) != bytes)
856 return -EINVAL;
857 if (sz > USHRT_MAX) /* Arbitrary sanity check */
858 return -E2BIG;
859 buf = malloc(sz);
860 if (!buf)
861 return -ENOMEM;
862 bytes = read(fd, buf, sz);
863 if (bytes != sz) {
864 free(buf);
865 return -EINVAL;
866 }
867 r = minijail_unmarshal(j, buf, sz);
868 free(buf);
869 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500870}
871
Will Drewry6ac91122011-10-21 16:38:58 -0500872int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -0400873{
874 char *buf;
875 size_t sz = minijail_size(j);
876 ssize_t written;
877 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -0400878
Elly Jonese1749eb2011-10-07 13:54:59 -0400879 if (!sz)
880 return -EINVAL;
881 buf = malloc(sz);
882 r = minijail_marshal(j, buf, sz);
883 if (r) {
884 free(buf);
885 return r;
886 }
887 /* Sends [size][minijail]. */
888 written = write(fd, &sz, sizeof(sz));
889 if (written != sizeof(sz)) {
890 free(buf);
891 return -EFAULT;
892 }
893 written = write(fd, buf, sz);
894 if (written < 0 || (size_t) written != sz) {
895 free(buf);
896 return -EFAULT;
897 }
898 free(buf);
899 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500900}
Elly Jonescd7a9042011-07-22 13:56:51 -0400901
Will Drewry6ac91122011-10-21 16:38:58 -0500902int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400903{
904 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
905 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
906 if (!newenv)
907 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -0400908
Elly Jonese1749eb2011-10-07 13:54:59 -0400909 /* Only insert a separating space if we have something to separate... */
910 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
911 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -0400912
Elly Jonese1749eb2011-10-07 13:54:59 -0400913 /* setenv() makes a copy of the string we give it */
914 setenv(kLdPreloadEnvVar, newenv, 1);
915 free(newenv);
916 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400917}
918
Will Drewry6ac91122011-10-21 16:38:58 -0500919int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -0400920{
921 int r = pipe(fds);
922 char fd_buf[11];
923 if (r)
924 return r;
925 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
926 if (r <= 0)
927 return -EINVAL;
928 setenv(kFdEnvVar, fd_buf, 1);
929 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500930}
931
/*
 * setup_pipe_end: keeps one end of a pipe and closes the other.
 *
 * @fds    the two pipe descriptors.
 * @index  which end to keep: 0 or 1.
 *
 * Returns fds[index], or -1 (touching nothing) if |index| is not 0 or 1.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	/* Close the end we are not going to use. */
	other = 1 - index;
	close(fds[other]);

	return fds[index];
}
940
/*
 * setup_and_dupe_pipe_end: closes the unused end of a pipe and duplicates
 * the kept end onto |fd|.
 *
 * @fds    the two pipe descriptors.
 * @index  which end to keep: 0 or 1.
 * @fd     descriptor number the kept end is duplicated onto.
 *
 * Returns the dup2(2) result, or -1 (touching nothing) if |index| is not
 * 0 or 1.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	other = 1 - index;
	close(fds[other]);

	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
950
Will Drewry6ac91122011-10-21 16:38:58 -0500951int API minijail_run(struct minijail *j, const char *filename,
952 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -0400953{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -0800954 return minijail_run_pid_pipes(j, filename, argv,
955 NULL, NULL, NULL, NULL);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -0700956}
957
958int API minijail_run_pid(struct minijail *j, const char *filename,
959 char *const argv[], pid_t *pchild_pid)
960{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -0800961 return minijail_run_pid_pipes(j, filename, argv, pchild_pid,
962 NULL, NULL, NULL);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -0700963}
964
965int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -0700966 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -0700967{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -0800968 return minijail_run_pid_pipes(j, filename, argv, NULL, pstdin_fd,
969 NULL, NULL);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -0700970}
971
972int API minijail_run_pid_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -0700973 char *const argv[], pid_t *pchild_pid,
974 int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -0700975{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -0800976 return minijail_run_pid_pipes(j, filename, argv, pchild_pid, pstdin_fd,
977 NULL, NULL);
978}
979
980int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700981 char *const argv[], pid_t *pchild_pid,
982 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -0800983{
Elly Jonese1749eb2011-10-07 13:54:59 -0400984 char *oldenv, *oldenv_copy = NULL;
985 pid_t child_pid;
986 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -0700987 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -0800988 int stdout_fds[2];
989 int stderr_fds[2];
Elly Jonese1749eb2011-10-07 13:54:59 -0400990 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -0400991 /* We need to remember this across the minijail_preexec() call. */
992 int pid_namespace = j->flags.pids;
Ben Chan541c7e52011-08-26 14:55:53 -0700993
Elly Jonese1749eb2011-10-07 13:54:59 -0400994 oldenv = getenv(kLdPreloadEnvVar);
995 if (oldenv) {
996 oldenv_copy = strdup(oldenv);
997 if (!oldenv_copy)
998 return -ENOMEM;
999 }
Will Drewryf89aef52011-09-16 16:48:57 -05001000
Elly Jonese1749eb2011-10-07 13:54:59 -04001001 if (setup_preload())
1002 return -EFAULT;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001003
Elly Jonesdd3e8512012-01-23 15:13:38 -05001004 /*
1005 * Before we fork(2) and execve(2) the child process, we need to open
Elly Jonese1749eb2011-10-07 13:54:59 -04001006 * a pipe(2) to send the minijail configuration over.
1007 */
1008 if (setup_pipe(pipe_fds))
1009 return -EFAULT;
Elly Jonescd7a9042011-07-22 13:56:51 -04001010
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001011 /*
1012 * If we want to write to the child process' standard input,
1013 * create the pipe(2) now.
1014 */
1015 if (pstdin_fd) {
1016 if (pipe(stdin_fds))
1017 return -EFAULT;
1018 }
1019
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001020 /*
1021 * If we want to read from the child process' standard output,
1022 * create the pipe(2) now.
1023 */
1024 if (pstdout_fd) {
1025 if (pipe(stdout_fds))
1026 return -EFAULT;
1027 }
1028
1029 /*
1030 * If we want to read from the child process' standard error,
1031 * create the pipe(2) now.
1032 */
1033 if (pstderr_fd) {
1034 if (pipe(stderr_fds))
1035 return -EFAULT;
1036 }
1037
Elly Jones761b7412012-06-13 15:49:52 -04001038 /* Use sys_clone() if and only if we're creating a pid namespace.
1039 *
1040 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1041 *
1042 * In multithreaded programs, there are a bunch of locks inside libc,
1043 * some of which may be held by other threads at the time that we call
1044 * minijail_run_pid(). If we call fork(), glibc does its level best to
1045 * ensure that we hold all of these locks before it calls clone()
1046 * internally and drop them after clone() returns, but when we call
1047 * sys_clone(2) directly, all that gets bypassed and we end up with a
1048 * child address space where some of libc's important locks are held by
1049 * other threads (which did not get cloned, and hence will never release
1050 * those locks). This is okay so long as we call exec() immediately
1051 * after, but a bunch of seemingly-innocent libc functions like setenv()
1052 * take locks.
1053 *
1054 * Hence, only call sys_clone() if we need to, in order to get at pid
1055 * namespacing. If we follow this path, the child's address space might
1056 * have broken locks; you may only call functions that do not acquire
1057 * any locks.
1058 *
1059 * Unfortunately, fork() acquires every lock it can get its hands on, as
1060 * previously detailed, so this function is highly likely to deadlock
1061 * later on (see "deadlock here") if we're multithreaded.
1062 *
1063 * We might hack around this by having the clone()d child (init of the
1064 * pid namespace) return directly, rather than leaving the clone()d
1065 * process hanging around to be init for the new namespace (and having
1066 * its fork()ed child return in turn), but that process would be crippled
1067 * with its libc locks potentially broken. We might try fork()ing in the
1068 * parent before we clone() to ensure that we own all the locks, but
1069 * then we have to have the forked child hanging around consuming
1070 * resources (and possibly having file descriptors / shared memory
1071 * regions / etc attached). We'd need to keep the child around to avoid
1072 * having its children get reparented to init.
1073 *
1074 * TODO(ellyjones): figure out if the "forked child hanging around"
1075 * problem is fixable or not. It would be nice if we worked in this
1076 * case.
1077 */
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001078 if (pid_namespace)
Elly Jones761b7412012-06-13 15:49:52 -04001079 child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1080 else
1081 child_pid = fork();
1082
Elly Jonese1749eb2011-10-07 13:54:59 -04001083 if (child_pid < 0) {
1084 free(oldenv_copy);
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001085 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04001086 }
Will Drewryf89aef52011-09-16 16:48:57 -05001087
Elly Jonese1749eb2011-10-07 13:54:59 -04001088 if (child_pid) {
1089 /* Restore parent's LD_PRELOAD. */
1090 if (oldenv_copy) {
1091 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1092 free(oldenv_copy);
1093 } else {
1094 unsetenv(kLdPreloadEnvVar);
1095 }
1096 unsetenv(kFdEnvVar);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001097
Elly Jonese1749eb2011-10-07 13:54:59 -04001098 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001099
1100 /* Send marshalled minijail. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001101 close(pipe_fds[0]); /* read endpoint */
1102 ret = minijail_to_fd(j, pipe_fds[1]);
1103 close(pipe_fds[1]); /* write endpoint */
1104 if (ret) {
1105 kill(j->initpid, SIGKILL);
1106 die("failed to send marshalled minijail");
1107 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001108
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001109 if (pchild_pid)
1110 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001111
1112 /*
1113 * If we want to write to the child process' standard input,
1114 * set up the write end of the pipe.
1115 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001116 if (pstdin_fd)
1117 *pstdin_fd = setup_pipe_end(stdin_fds,
1118 1 /* write end */);
1119
1120 /*
1121 * If we want to read from the child process' standard output,
1122 * set up the read end of the pipe.
1123 */
1124 if (pstdout_fd)
1125 *pstdout_fd = setup_pipe_end(stdout_fds,
1126 0 /* read end */);
1127
1128 /*
1129 * If we want to read from the child process' standard error,
1130 * set up the read end of the pipe.
1131 */
1132 if (pstderr_fd)
1133 *pstderr_fd = setup_pipe_end(stderr_fds,
1134 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001135
Elly Jonese1749eb2011-10-07 13:54:59 -04001136 return 0;
1137 }
1138 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07001139
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001140 /*
1141 * If we want to write to the jailed process' standard input,
1142 * set up the read end of the pipe.
1143 */
1144 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001145 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1146 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001147 die("failed to set up stdin pipe");
1148 }
1149
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001150 /*
1151 * If we want to read from the jailed process' standard output,
1152 * set up the write end of the pipe.
1153 */
1154 if (pstdout_fd) {
1155 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1156 STDOUT_FILENO) < 0)
1157 die("failed to set up stdout pipe");
1158 }
1159
1160 /*
1161 * If we want to read from the jailed process' standard error,
1162 * set up the write end of the pipe.
1163 */
1164 if (pstderr_fd) {
1165 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1166 STDERR_FILENO) < 0)
1167 die("failed to set up stderr pipe");
1168 }
1169
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001170 /* Strip out flags that cannot be inherited across execve. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001171 minijail_preexec(j);
1172 /* Jail this process and its descendants... */
1173 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001174
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001175 if (pid_namespace) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001176 /*
1177 * pid namespace: this process will become init inside the new
Elly Jonese1749eb2011-10-07 13:54:59 -04001178 * namespace, so fork off a child to actually run the program
1179 * (we don't want all programs we might exec to have to know
1180 * how to be init).
Elly Jones761b7412012-06-13 15:49:52 -04001181 *
1182 * If we're multithreaded, we'll probably deadlock here. See
1183 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04001184 */
1185 child_pid = fork();
1186 if (child_pid < 0)
1187 _exit(child_pid);
1188 else if (child_pid > 0)
1189 init(child_pid); /* never returns */
1190 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001191
Elly Jonesdd3e8512012-01-23 15:13:38 -05001192 /*
1193 * If we aren't pid-namespaced:
Elly Jonese1749eb2011-10-07 13:54:59 -04001194 * calling process
1195 * -> execve()-ing process
1196 * If we are:
1197 * calling process
1198 * -> init()-ing process
1199 * -> execve()-ing process
1200 */
1201 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -04001202}
1203
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001204int API minijail_run_static(struct minijail *j, const char *filename,
1205 char *const argv[])
1206{
1207 pid_t child_pid;
1208 int pid_namespace = j->flags.pids;
1209
1210 if (j->flags.caps)
1211 die("caps not supported with static targets");
1212
1213 if (pid_namespace)
1214 child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1215 else
1216 child_pid = fork();
1217
1218 if (child_pid < 0) {
1219 die("failed to fork child");
1220 }
1221 if (child_pid > 0 ) {
1222 j->initpid = child_pid;
1223 return 0;
1224 }
1225
1226 /*
1227 * We can now drop this child into the sandbox
1228 * then execve the target.
1229 */
1230
1231 j->flags.pids = 0;
1232 minijail_enter(j);
1233
1234 if (pid_namespace) {
1235 /*
1236 * pid namespace: this process will become init inside the new
1237 * namespace, so fork off a child to actually run the program
1238 * (we don't want all programs we might exec to have to know
1239 * how to be init).
1240 *
1241 * If we're multithreaded, we'll probably deadlock here. See
1242 * WARNING above.
1243 */
1244 child_pid = fork();
1245 if (child_pid < 0)
1246 _exit(child_pid);
1247 else if (child_pid > 0)
1248 init(child_pid); /* never returns */
1249 }
1250
1251 _exit(execve(filename, argv, environ));
1252}
1253
Will Drewry6ac91122011-10-21 16:38:58 -05001254int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001255{
1256 int st;
1257 if (kill(j->initpid, SIGTERM))
1258 return -errno;
1259 if (waitpid(j->initpid, &st, 0) < 0)
1260 return -errno;
1261 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04001262}
1263
Will Drewry6ac91122011-10-21 16:38:58 -05001264int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001265{
1266 int st;
1267 if (waitpid(j->initpid, &st, 0) < 0)
1268 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001269
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001270 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001271 int error_status = st;
1272 if (WIFSIGNALED(st)) {
1273 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07001274 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001275 j->initpid, signum);
1276 /*
1277 * We return MINIJAIL_ERR_JAIL if the process received
1278 * SIGSYS, which happens when a syscall is blocked by
1279 * seccomp filters.
1280 * If not, we do what bash(1) does:
1281 * $? = 128 + signum
1282 */
1283 if (signum == SIGSYS) {
1284 error_status = MINIJAIL_ERR_JAIL;
1285 } else {
1286 error_status = 128 + signum;
1287 }
1288 }
1289 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001290 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001291
1292 int exit_status = WEXITSTATUS(st);
1293 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07001294 info("child process %d exited with status %d",
1295 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001296
1297 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04001298}
1299
Will Drewry6ac91122011-10-21 16:38:58 -05001300void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001301{
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001302 if (j->flags.seccomp_filter && j->filter_prog) {
1303 free(j->filter_prog->filter);
1304 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04001305 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001306 while (j->bindings_head) {
1307 struct binding *b = j->bindings_head;
1308 j->bindings_head = j->bindings_head->next;
1309 free(b->dest);
1310 free(b->src);
1311 free(b);
1312 }
1313 j->bindings_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001314 if (j->user)
1315 free(j->user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001316 if (j->chrootdir)
1317 free(j->chrootdir);
Elly Jonese1749eb2011-10-07 13:54:59 -04001318 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001319}