blob: 7a188317f8cac3d71bcf89727b0d6315c8404d05 [file] [log] [blame]
/*
 * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
Elly Jonescd7a9042011-07-22 13:56:51 -04006
7#define _BSD_SOURCE
8#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07009
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080010#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050011#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040012#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070013#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040014#include <grp.h>
15#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050016#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040018#include <pwd.h>
19#include <sched.h>
20#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050021#include <stdarg.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080022#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040023#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <syscall.h>
27#include <sys/capability.h>
28#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050029#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040030#include <sys/prctl.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070031#include <sys/stat.h>
32#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080033#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040034#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040035#include <unistd.h>
36
37#include "libminijail.h"
38#include "libminijail-private.h"
39
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -070040#include "signal.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080041#include "syscall_filter.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070042#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080043
Lei Zhangeee31552012-10-17 21:27:10 -070044#ifdef HAVE_SECUREBITS_H
45#include <linux/securebits.h>
46#else
47#define SECURE_ALL_BITS 0x15
48#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
49#endif
50
Will Drewry32ac9f52011-08-18 21:36:27 -050051/* Until these are reliably available in linux/prctl.h */
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080052#ifndef PR_SET_SECCOMP
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -070053# define PR_SET_SECCOMP 22
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080054#endif
55
56/* For seccomp_filter using BPF. */
57#ifndef PR_SET_NO_NEW_PRIVS
58# define PR_SET_NO_NEW_PRIVS 38
59#endif
60#ifndef SECCOMP_MODE_FILTER
61# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
Will Drewry32ac9f52011-08-18 21:36:27 -050062#endif
63
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -070064#ifdef USE_SECCOMP_SOFTFAIL
65# define SECCOMP_SOFTFAIL 1
66#else
67# define SECCOMP_SOFTFAIL 0
68#endif
69
/*
 * One bind mount requested for the jail. Nodes form a singly linked list
 * anchored at minijail.bindings_head / minijail.bindings_tail.
 */
struct binding {
	char *src;		/* source path handed to mount(2) */
	char *dest;		/* absolute path; appended to the chroot dir */
	int writeable;		/* zero: the bind is remounted read-only */
	struct binding *next;	/* following binding, NULL at the tail */
};
76
/*
 * struct minijail: the restrictions requested for a jailed process.
 *
 * The structure is copied byte-for-byte by minijail_marshal() and
 * minijail_unmarshal(), so the pointer members are only meaningful inside the
 * process that filled them in.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are deliberately unsigned: whether a plain "int" one-bit
	 * field is signed is implementation-defined, and where it is signed it
	 * can only hold 0 and -1, so a flag set to 1 would read back as -1.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int net:1;
		unsigned int seccomp:1;
		unsigned int readonly:1;
		unsigned int usergroups:1;
		unsigned int ptrace:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int mount_tmp:1;
	} flags;
	uid_t uid;			/* target of setresuid() */
	gid_t gid;			/* target of setresgid() */
	gid_t usergid;			/* primary gid of |user|, for initgroups() */
	char *user;			/* heap copy of the user name, or NULL */
	uint64_t caps;			/* bit i set: keep capability i */
	pid_t initpid;
	int mountns_fd;			/* fd passed to setns(CLONE_NEWNS) */
	int filter_len;			/* instruction count of |filter_prog| */
	int binding_count;		/* number of nodes in the bindings list */
	char *chrootdir;		/* heap copy of the chroot path, or NULL */
	struct sock_fprog *filter_prog;	/* compiled seccomp BPF program */
	struct binding *bindings_head;
	struct binding *bindings_tail;
};
114
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700115/*
116 * Strip out flags meant for the parent.
117 * We keep things that are not inherited across execve(2) (e.g. capabilities),
118 * or are easier to set after execve(2) (e.g. seccomp filters).
119 */
120void minijail_preenter(struct minijail *j)
121{
122 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700123 j->flags.enter_vfs = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700124 j->flags.readonly = 0;
125 j->flags.pids = 0;
126}
127
128/*
129 * Strip out flags meant for the child.
130 * We keep things that are inherited across execve(2).
131 */
132void minijail_preexec(struct minijail *j)
133{
134 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700135 int enter_vfs = j->flags.enter_vfs;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700136 int readonly = j->flags.readonly;
137 if (j->user)
138 free(j->user);
139 j->user = NULL;
140 memset(&j->flags, 0, sizeof(j->flags));
141 /* Now restore anything we meant to keep. */
142 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700143 j->flags.enter_vfs = enter_vfs;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700144 j->flags.readonly = readonly;
145 /* Note, |pids| will already have been used before this call. */
146}
147
148/* Minijail API. */
149
Will Drewry6ac91122011-10-21 16:38:58 -0500150struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400151{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400152 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400153}
154
Will Drewry6ac91122011-10-21 16:38:58 -0500155void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400156{
157 if (uid == 0)
158 die("useless change to uid 0");
159 j->uid = uid;
160 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400161}
162
Will Drewry6ac91122011-10-21 16:38:58 -0500163void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400164{
165 if (gid == 0)
166 die("useless change to gid 0");
167 j->gid = gid;
168 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400169}
170
Will Drewry6ac91122011-10-21 16:38:58 -0500171int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400172{
173 char *buf = NULL;
174 struct passwd pw;
175 struct passwd *ppw = NULL;
176 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
177 if (sz == -1)
178 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400179
Elly Jonesdd3e8512012-01-23 15:13:38 -0500180 /*
181 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400182 * the maximum needed size of the buffer, so we don't have to search.
183 */
184 buf = malloc(sz);
185 if (!buf)
186 return -ENOMEM;
187 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500188 /*
189 * We're safe to free the buffer here. The strings inside pw point
190 * inside buf, but we don't use any of them; this leaves the pointers
191 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
192 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400193 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700194 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400195 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700196 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400197 minijail_change_uid(j, ppw->pw_uid);
198 j->user = strdup(user);
199 if (!j->user)
200 return -ENOMEM;
201 j->usergid = ppw->pw_gid;
202 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400203}
204
Will Drewry6ac91122011-10-21 16:38:58 -0500205int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400206{
207 char *buf = NULL;
208 struct group gr;
209 struct group *pgr = NULL;
210 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
211 if (sz == -1)
212 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400213
Elly Jonesdd3e8512012-01-23 15:13:38 -0500214 /*
215 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400216 * the maximum needed size of the buffer, so we don't have to search.
217 */
218 buf = malloc(sz);
219 if (!buf)
220 return -ENOMEM;
221 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500222 /*
223 * We're safe to free the buffer here. The strings inside gr point
224 * inside buf, but we don't use any of them; this leaves the pointers
225 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
226 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400227 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700228 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400229 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700230 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400231 minijail_change_gid(j, pgr->gr_gid);
232 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400233}
234
Will Drewry6ac91122011-10-21 16:38:58 -0500235void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400236{
237 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400238}
239
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700240void API minijail_no_new_privs(struct minijail *j)
241{
242 j->flags.no_new_privs = 1;
243}
244
Will Drewry6ac91122011-10-21 16:38:58 -0500245void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400246{
247 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500248}
249
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700250void API minijail_log_seccomp_filter_failures(struct minijail *j)
251{
252 j->flags.log_seccomp_filter = 1;
253}
254
Will Drewry6ac91122011-10-21 16:38:58 -0500255void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400256{
257 j->caps = capmask;
258 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400259}
260
Will Drewry6ac91122011-10-21 16:38:58 -0500261void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400262{
263 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400264}
265
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700266void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
267{
268 int ns_fd = open(ns_path, O_RDONLY);
269 if (ns_fd < 0) {
270 pdie("failed to open namespace '%s'", ns_path);
271 }
272 j->mountns_fd = ns_fd;
273 j->flags.enter_vfs = 1;
274}
275
Will Drewry6ac91122011-10-21 16:38:58 -0500276void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400277{
Elly Jonese58176c2012-01-23 11:46:17 -0500278 j->flags.vfs = 1;
279 j->flags.readonly = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400280 j->flags.pids = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400281}
282
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400283void API minijail_namespace_net(struct minijail *j)
284{
285 j->flags.net = 1;
286}
287
Will Drewry6ac91122011-10-21 16:38:58 -0500288void API minijail_remount_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400289{
290 j->flags.vfs = 1;
291 j->flags.readonly = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400292}
293
Will Drewry6ac91122011-10-21 16:38:58 -0500294void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400295{
296 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400297}
298
Will Drewry6ac91122011-10-21 16:38:58 -0500299void API minijail_disable_ptrace(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400300{
301 j->flags.ptrace = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400302}
303
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700304int API minijail_enter_chroot(struct minijail *j, const char *dir)
305{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400306 if (j->chrootdir)
307 return -EINVAL;
308 j->chrootdir = strdup(dir);
309 if (!j->chrootdir)
310 return -ENOMEM;
311 j->flags.chroot = 1;
312 return 0;
313}
314
Lee Campbell11af0622014-05-22 12:36:04 -0700315void API minijail_mount_tmp(struct minijail *j)
316{
317 j->flags.mount_tmp = 1;
318}
319
Will Drewry6ac91122011-10-21 16:38:58 -0500320int API minijail_bind(struct minijail *j, const char *src, const char *dest,
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700321 int writeable)
322{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400323 struct binding *b;
324
325 if (*dest != '/')
326 return -EINVAL;
327 b = calloc(1, sizeof(*b));
328 if (!b)
329 return -ENOMEM;
330 b->dest = strdup(dest);
331 if (!b->dest)
332 goto error;
333 b->src = strdup(src);
334 if (!b->src)
335 goto error;
336 b->writeable = writeable;
337
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700338 info("bind %s -> %s", src, dest);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400339
Elly Jonesdd3e8512012-01-23 15:13:38 -0500340 /*
341 * Force vfs namespacing so the bind mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400342 * containing vfs namespace.
343 */
344 minijail_namespace_vfs(j);
345
346 if (j->bindings_tail)
347 j->bindings_tail->next = b;
348 else
349 j->bindings_head = b;
350 j->bindings_tail = b;
351 j->binding_count++;
352
353 return 0;
354
355error:
356 free(b->src);
357 free(b->dest);
358 free(b);
359 return -ENOMEM;
360}
361
Will Drewry6ac91122011-10-21 16:38:58 -0500362void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
Elly Jonese1749eb2011-10-07 13:54:59 -0400363{
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700364 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
365 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
366 warn("not loading seccomp filter, seccomp not supported");
367 return;
368 }
369 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400370 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800371 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700372 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400373 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800374
375 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700376 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
377 die("failed to compile seccomp filter BPF program in '%s'",
378 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800379 }
380
381 j->filter_len = fprog->len;
382 j->filter_prog = fprog;
383
Elly Jonese1749eb2011-10-07 13:54:59 -0400384 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500385}
386
/* Cursor used while serializing a struct minijail into a flat buffer. */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};
392
Will Drewry6ac91122011-10-21 16:38:58 -0500393void marshal_state_init(struct marshal_state *state,
394 char *buf, size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400395{
396 state->available = available;
397 state->buf = buf;
398 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500399}
400
Will Drewry6ac91122011-10-21 16:38:58 -0500401void marshal_append(struct marshal_state *state,
402 char *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400403{
404 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500405
Elly Jonese1749eb2011-10-07 13:54:59 -0400406 /* Up to |available| will be written. */
407 if (copy_len) {
408 memcpy(state->buf, src, copy_len);
409 state->buf += copy_len;
410 state->available -= copy_len;
411 }
412 /* |total| will contain the expected length. */
413 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500414}
415
Will Drewry6ac91122011-10-21 16:38:58 -0500416void minijail_marshal_helper(struct marshal_state *state,
417 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400418{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400419 struct binding *b = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400420 marshal_append(state, (char *)j, sizeof(*j));
421 if (j->user)
422 marshal_append(state, j->user, strlen(j->user) + 1);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400423 if (j->chrootdir)
424 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800425 if (j->flags.seccomp_filter && j->filter_prog) {
426 struct sock_fprog *fp = j->filter_prog;
427 marshal_append(state, (char *)fp->filter,
428 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400429 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400430 for (b = j->bindings_head; b; b = b->next) {
431 marshal_append(state, b->src, strlen(b->src) + 1);
432 marshal_append(state, b->dest, strlen(b->dest) + 1);
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700433 marshal_append(state, (char *)&b->writeable,
434 sizeof(b->writeable));
Elly Jones51a5b6c2011-10-12 19:09:26 -0400435 }
Will Drewryf89aef52011-09-16 16:48:57 -0500436}
437
Will Drewry6ac91122011-10-21 16:38:58 -0500438size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400439{
440 struct marshal_state state;
441 marshal_state_init(&state, NULL, 0);
442 minijail_marshal_helper(&state, j);
443 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500444}
445
Elly Jonese1749eb2011-10-07 13:54:59 -0400446int minijail_marshal(const struct minijail *j, char *buf, size_t available)
447{
448 struct marshal_state state;
449 marshal_state_init(&state, buf, available);
450 minijail_marshal_helper(&state, j);
451 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500452}
453
/* consumebytes: consumes @length bytes from a buffer @buf of length @buflength
 * @length    Number of bytes to consume
 * @buf       Buffer to consume from; advanced past the consumed bytes
 * @buflength Size of @buf; reduced by @length
 *
 * Returns a pointer to the base of the bytes, or NULL (leaving @buf and
 * @buflength untouched) when fewer than @length bytes remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *base;

	if (*buflength < length)
		return NULL;

	base = *buf;
	*buf = base + length;
	*buflength -= length;
	return base;
}
470
/* consumestr: consumes a C string from a buffer @buf of length @buflength
 * @buf       Buffer to consume; advanced past the string and its NUL
 * @buflength Length of buffer; reduced by the bytes consumed
 *
 * Returns a pointer to the base of the string, or NULL when no
 * NUL terminator is found within @buflength bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const size_t limit = *buflength;
	const size_t text_len = strnlen(*buf, limit);

	/* strnlen() hitting the limit means there's no null-terminator. */
	if (text_len == limit)
		return NULL;

	/* Consume the terminator together with the text. */
	return consumebytes(text_len + 1, buf, buflength);
}
485
Elly Jonese1749eb2011-10-07 13:54:59 -0400486int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
487{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400488 int i;
489 int count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500490 int ret = -EINVAL;
491
Elly Jonese1749eb2011-10-07 13:54:59 -0400492 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500493 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400494 memcpy((void *)j, serialized, sizeof(*j));
495 serialized += sizeof(*j);
496 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500497
Will Drewrybee7ba72011-10-21 20:47:01 -0500498 /* Potentially stale pointers not used as signals. */
499 j->bindings_head = NULL;
500 j->bindings_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800501 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500502
Elly Jonese1749eb2011-10-07 13:54:59 -0400503 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400504 char *user = consumestr(&serialized, &length);
505 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500506 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400507 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500508 if (!j->user)
509 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400510 }
Will Drewryf89aef52011-09-16 16:48:57 -0500511
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400512 if (j->chrootdir) { /* stale pointer */
513 char *chrootdir = consumestr(&serialized, &length);
514 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500515 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400516 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500517 if (!j->chrootdir)
518 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400519 }
520
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800521 if (j->flags.seccomp_filter && j->filter_len > 0) {
522 size_t ninstrs = j->filter_len;
523 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
524 ninstrs > USHRT_MAX)
525 goto bad_filters;
526
527 size_t program_len = ninstrs * sizeof(struct sock_filter);
528 void *program = consumebytes(program_len, &serialized, &length);
529 if (!program)
530 goto bad_filters;
531
532 j->filter_prog = malloc(sizeof(struct sock_fprog));
533 j->filter_prog->len = ninstrs;
534 j->filter_prog->filter = malloc(program_len);
535 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400536 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400537
538 count = j->binding_count;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400539 j->binding_count = 0;
540 for (i = 0; i < count; ++i) {
541 int *writeable;
542 const char *dest;
543 const char *src = consumestr(&serialized, &length);
544 if (!src)
Will Drewrybee7ba72011-10-21 20:47:01 -0500545 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400546 dest = consumestr(&serialized, &length);
547 if (!dest)
Will Drewrybee7ba72011-10-21 20:47:01 -0500548 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400549 writeable = consumebytes(sizeof(*writeable), &serialized, &length);
550 if (!writeable)
Will Drewrybee7ba72011-10-21 20:47:01 -0500551 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400552 if (minijail_bind(j, src, dest, *writeable))
Will Drewrybee7ba72011-10-21 20:47:01 -0500553 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400554 }
555
Elly Jonese1749eb2011-10-07 13:54:59 -0400556 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500557
558bad_bindings:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800559 if (j->flags.seccomp_filter && j->filter_len > 0) {
560 free(j->filter_prog->filter);
561 free(j->filter_prog);
562 }
Will Drewrybee7ba72011-10-21 20:47:01 -0500563bad_filters:
564 if (j->chrootdir)
565 free(j->chrootdir);
566bad_chrootdir:
567 if (j->user)
568 free(j->user);
569clear_pointers:
570 j->user = NULL;
571 j->chrootdir = NULL;
572out:
573 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500574}
575
Elly Jones51a5b6c2011-10-12 19:09:26 -0400576/* bind_one: Applies bindings from @b for @j, recursing as needed.
577 * @j Minijail these bindings are for
578 * @b Head of list of bindings
579 *
580 * Returns 0 for success.
581 */
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700582int bind_one(const struct minijail *j, struct binding *b)
583{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400584 int ret = 0;
585 char *dest = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400586 if (ret)
587 return ret;
588 /* dest has a leading "/" */
589 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
590 return -ENOMEM;
Elly Jonesa1059632011-12-15 15:17:07 -0500591 ret = mount(b->src, dest, NULL, MS_BIND, NULL);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400592 if (ret)
593 pdie("bind: %s -> %s", b->src, dest);
Elly Jonesa1059632011-12-15 15:17:07 -0500594 if (!b->writeable) {
595 ret = mount(b->src, dest, NULL,
Jorge Lucangeli Obes2f61ee42014-06-16 11:08:18 -0700596 MS_BIND | MS_REMOUNT | MS_RDONLY, NULL);
Elly Jonesa1059632011-12-15 15:17:07 -0500597 if (ret)
598 pdie("bind ro: %s -> %s", b->src, dest);
599 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400600 free(dest);
601 if (b->next)
602 return bind_one(j, b->next);
603 return ret;
604}
605
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700606int enter_chroot(const struct minijail *j)
607{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400608 int ret;
609 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
610 return ret;
611
612 if (chroot(j->chrootdir))
613 return -errno;
614
615 if (chdir("/"))
616 return -errno;
617
618 return 0;
619}
620
/*
 * Mounts a tmpfs on /tmp with the options below.
 * Returns the mount(2) result: 0 on success, -1 with errno set on failure.
 */
int mount_tmp(void)
{
	static const char tmpfs_options[] = "size=128M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_options);
}
625
/*
 * Replaces the /proc mount in the current namespace with a fresh read-only
 * one that is also nodev, noexec and nosuid.
 *
 * Returns 0 on success or -errno from umount(2) / mount(2).
 */
int remount_readonly(void)
{
	static const char proc_path[] = "/proc";
	const unsigned int safe_flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;

	/*
	 * We still hold a reference to our parent's old mount of /proc in our
	 * namespace, so MS_REMOUNT would mutate the parent's mount as well,
	 * even though we're in a VFS namespace (!). Drop their mount from
	 * our namespace and make our own instead.
	 */
	if (umount(proc_path) != 0)
		return -errno;

	if (mount("", proc_path, "proc", safe_flags | MS_RDONLY, "") != 0)
		return -errno;

	return 0;
}
643
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700644void drop_ugid(const struct minijail *j)
645{
646 if (j->flags.usergroups) {
647 if (initgroups(j->user, j->usergid))
648 pdie("initgroups");
649 } else {
650 /* Only attempt to clear supplemental groups if we are changing
651 * users. */
652 if ((j->uid || j->gid) && setgroups(0, NULL))
653 pdie("setgroups");
654 }
655
656 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
657 pdie("setresgid");
658
659 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
660 pdie("setresuid");
661}
662
/*
 * Returns non-zero when |cap| is a capability number known to the running
 * kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel). So suck up the answer via /proc; it is read once and cached.
 * Failing to open or parse the file is fatal.
 */
static int run_cap_valid(unsigned int cap)
{
	static unsigned int last_cap;

	if (!last_cap) {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* fscanf() on a NULL stream would crash; report it instead. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}

	return cap <= last_cap;
}
684
Will Drewry6ac91122011-10-21 16:38:58 -0500685void drop_caps(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400686{
687 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -0800688 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -0800689 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400690 unsigned int i;
691 if (!caps)
692 die("can't get process caps");
693 if (cap_clear_flag(caps, CAP_INHERITABLE))
694 die("can't clear inheritable caps");
695 if (cap_clear_flag(caps, CAP_EFFECTIVE))
696 die("can't clear effective caps");
697 if (cap_clear_flag(caps, CAP_PERMITTED))
698 die("can't clear permitted caps");
Mike Frysinger3adfef72013-05-09 17:19:08 -0400699 for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
Kees Cook323878a2013-02-05 15:35:24 -0800700 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800701 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -0400702 continue;
Kees Cook323878a2013-02-05 15:35:24 -0800703 flag[0] = i;
704 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400705 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -0800706 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400707 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -0800708 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400709 die("can't add inheritable cap");
710 }
711 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -0800712 die("can't apply initial cleaned capset");
713
714 /*
715 * Instead of dropping bounding set first, do it here in case
716 * the caller had a more permissive bounding set which could
717 * have been used above to raise a capability that wasn't already
718 * present. This requires CAP_SETPCAP, so we raised/kept it above.
719 */
Mike Frysinger3adfef72013-05-09 17:19:08 -0400720 for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
Kees Cooke5609ac2013-02-06 14:12:41 -0800721 if (j->caps & (one << i))
Elly Jonese1749eb2011-10-07 13:54:59 -0400722 continue;
723 if (prctl(PR_CAPBSET_DROP, i))
724 pdie("prctl(PR_CAPBSET_DROP)");
725 }
Kees Cook323878a2013-02-05 15:35:24 -0800726
727 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800728 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -0800729 flag[0] = CAP_SETPCAP;
730 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
731 die("can't clear effective cap");
732 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
733 die("can't clear permitted cap");
734 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
735 die("can't clear inheritable cap");
736 }
737
738 if (cap_set_proc(caps))
739 die("can't apply final cleaned capset");
740
741 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -0400742}
743
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700744void set_seccomp_filter(const struct minijail *j)
745{
746 /*
747 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
748 * in the kernel source tree for an explanation of the parameters.
749 */
750 if (j->flags.no_new_privs) {
751 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
752 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
753 }
754
755 /*
756 * If we're logging seccomp filter failures,
757 * install the SIGSYS handler first.
758 */
759 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
760 if (install_sigsys_handler())
761 pdie("install SIGSYS handler");
762 warn("logging seccomp filter failures");
763 }
764
765 /*
766 * Install the syscall filter.
767 */
768 if (j->flags.seccomp_filter) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700769 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) {
770 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
771 warn("seccomp not supported");
772 return;
773 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700774 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700775 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700776 }
777}
778
Will Drewry6ac91122011-10-21 16:38:58 -0500779void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400780{
781 if (j->flags.pids)
782 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700783 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -0400784
Elly Jonese1749eb2011-10-07 13:54:59 -0400785 if (j->flags.usergroups && !j->user)
786 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -0400787
Elly Jonesdd3e8512012-01-23 15:13:38 -0500788 /*
789 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -0400790 * so we don't even try. If any of our operations fail, we abort() the
791 * entire process.
792 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700793 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
794 pdie("setns(CLONE_NEWNS)");
795
Elly Jonese1749eb2011-10-07 13:54:59 -0400796 if (j->flags.vfs && unshare(CLONE_NEWNS))
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400797 pdie("unshare(vfs)");
798
799 if (j->flags.net && unshare(CLONE_NEWNET))
800 pdie("unshare(net)");
Elly Jonescd7a9042011-07-22 13:56:51 -0400801
Elly Jones51a5b6c2011-10-12 19:09:26 -0400802 if (j->flags.chroot && enter_chroot(j))
803 pdie("chroot");
804
Lee Campbell11af0622014-05-22 12:36:04 -0700805 if (j->flags.chroot && j->flags.mount_tmp && mount_tmp())
806 pdie("mount_tmp");
807
Elly Jonese1749eb2011-10-07 13:54:59 -0400808 if (j->flags.readonly && remount_readonly())
809 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -0400810
Elly Jonese1749eb2011-10-07 13:54:59 -0400811 if (j->flags.caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -0500812 /*
813 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -0400814 * capability to change uids, our attempt to use setuid()
815 * below will fail. Hang on to root caps across setuid(), then
816 * lock securebits.
817 */
818 if (prctl(PR_SET_KEEPCAPS, 1))
819 pdie("prctl(PR_SET_KEEPCAPS)");
820 if (prctl
821 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
822 pdie("prctl(PR_SET_SECUREBITS)");
823 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400824
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700825 /*
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700826 * If we're setting no_new_privs, we can drop privileges
827 * before setting seccomp filter. This way filter policies
828 * don't need to allow privilege-dropping syscalls.
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700829 */
830 if (j->flags.no_new_privs) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700831 drop_ugid(j);
832 if (j->flags.caps)
833 drop_caps(j);
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700834
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700835 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -0400836 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700837 /*
838 * If we're not setting no_new_privs,
839 * we need to set seccomp filter *before* dropping privileges.
840 * WARNING: this means that filter policies *must* allow
841 * setgroups()/setresgid()/setresuid() for dropping root and
842 * capget()/capset()/prctl() for dropping caps.
843 */
844 set_seccomp_filter(j);
845
846 drop_ugid(j);
847 if (j->flags.caps)
848 drop_caps(j);
Elly Jonese1749eb2011-10-07 13:54:59 -0400849 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400850
Elly Jonesdd3e8512012-01-23 15:13:38 -0500851 /*
852 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -0400853 * privilege-dropping syscalls :)
854 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700855 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
856 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
857 warn("seccomp not supported");
858 return;
859 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400860 pdie("prctl(PR_SET_SECCOMP)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700861 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400862}
863
/*
 * Wait status of the jailed root process. It is written by init() and read
 * by the SIGTERM handler init_term(), so it is declared volatile
 * sig_atomic_t: that is the object type a signal handler may portably access.
 */
/* TODO(wad) will visibility affect this variable? */
static volatile sig_atomic_t init_exitstatus = 0;
866
Will Drewry6ac91122011-10-21 16:38:58 -0500867void init_term(int __attribute__ ((unused)) sig)
Elly Jonese1749eb2011-10-07 13:54:59 -0400868{
869 _exit(init_exitstatus);
Elly Jonescd7a9042011-07-22 13:56:51 -0400870}
871
Will Drewry6ac91122011-10-21 16:38:58 -0500872int init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400873{
874 pid_t pid;
875 int status;
876 /* so that we exit with the right status */
877 signal(SIGTERM, init_term);
878 /* TODO(wad) self jail with seccomp_filters here. */
879 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -0500880 /*
881 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -0400882 * left inside our pid namespace or we get a signal.
883 */
884 if (pid == rootpid)
885 init_exitstatus = status;
886 }
887 if (!WIFEXITED(init_exitstatus))
888 _exit(MINIJAIL_ERR_INIT);
889 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -0400890}
891
Will Drewry6ac91122011-10-21 16:38:58 -0500892int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400893{
894 size_t sz = 0;
895 size_t bytes = read(fd, &sz, sizeof(sz));
896 char *buf;
897 int r;
898 if (sizeof(sz) != bytes)
899 return -EINVAL;
900 if (sz > USHRT_MAX) /* Arbitrary sanity check */
901 return -E2BIG;
902 buf = malloc(sz);
903 if (!buf)
904 return -ENOMEM;
905 bytes = read(fd, buf, sz);
906 if (bytes != sz) {
907 free(buf);
908 return -EINVAL;
909 }
910 r = minijail_unmarshal(j, buf, sz);
911 free(buf);
912 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500913}
914
Will Drewry6ac91122011-10-21 16:38:58 -0500915int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -0400916{
917 char *buf;
918 size_t sz = minijail_size(j);
919 ssize_t written;
920 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -0400921
Elly Jonese1749eb2011-10-07 13:54:59 -0400922 if (!sz)
923 return -EINVAL;
924 buf = malloc(sz);
925 r = minijail_marshal(j, buf, sz);
926 if (r) {
927 free(buf);
928 return r;
929 }
930 /* Sends [size][minijail]. */
931 written = write(fd, &sz, sizeof(sz));
932 if (written != sizeof(sz)) {
933 free(buf);
934 return -EFAULT;
935 }
936 written = write(fd, buf, sz);
937 if (written < 0 || (size_t) written != sz) {
938 free(buf);
939 return -EFAULT;
940 }
941 free(buf);
942 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500943}
Elly Jonescd7a9042011-07-22 13:56:51 -0400944
Will Drewry6ac91122011-10-21 16:38:58 -0500945int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400946{
947 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
948 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
949 if (!newenv)
950 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -0400951
Elly Jonese1749eb2011-10-07 13:54:59 -0400952 /* Only insert a separating space if we have something to separate... */
953 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
954 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -0400955
Elly Jonese1749eb2011-10-07 13:54:59 -0400956 /* setenv() makes a copy of the string we give it */
957 setenv(kLdPreloadEnvVar, newenv, 1);
958 free(newenv);
959 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400960}
961
Will Drewry6ac91122011-10-21 16:38:58 -0500962int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -0400963{
964 int r = pipe(fds);
965 char fd_buf[11];
966 if (r)
967 return r;
968 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
969 if (r <= 0)
970 return -EINVAL;
971 setenv(kFdEnvVar, fd_buf, 1);
972 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500973}
974
/*
 * setup_pipe_end: keeps one end of the pipe |fds| and closes the other.
 * |index| selects the end to keep: 0 for the read end, 1 for the write end.
 *
 * Returns the kept descriptor, or -1 (closing nothing) for any other |index|.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t unused_end;

	if (index != 0 && index != 1)
		return -1;

	unused_end = 1 - index;
	close(fds[unused_end]);
	return fds[index];
}
983
/*
 * setup_and_dupe_pipe_end: closes the unused end of the pipe |fds| and
 * duplicates the end selected by |index| (0 = read end, 1 = write end)
 * onto |fd|.
 *
 * After a successful dup2(2) the original descriptor is closed as well, so
 * only |fd| keeps referring to the pipe and no extra descriptor is left open.
 * Returns the dup2(2) result, or -1 (touching nothing) if |index| > 1.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	int ret;

	if (index > 1)
		return -1;

	close(fds[1 - index]);
	/* dup2(2) the corresponding end of the pipe into |fd|. */
	ret = dup2(fds[index], fd);
	/*
	 * Drop the now-redundant original. If the pipe end already was |fd|,
	 * dup2() did nothing and closing it would close |fd| itself.
	 */
	if (ret >= 0 && fds[index] != fd)
		close(fds[index]);
	return ret;
}
993
Will Drewry6ac91122011-10-21 16:38:58 -0500994int API minijail_run(struct minijail *j, const char *filename,
995 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -0400996{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -0800997 return minijail_run_pid_pipes(j, filename, argv,
998 NULL, NULL, NULL, NULL);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -0700999}
1000
1001int API minijail_run_pid(struct minijail *j, const char *filename,
1002 char *const argv[], pid_t *pchild_pid)
1003{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001004 return minijail_run_pid_pipes(j, filename, argv, pchild_pid,
1005 NULL, NULL, NULL);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001006}
1007
1008int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001009 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001010{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001011 return minijail_run_pid_pipes(j, filename, argv, NULL, pstdin_fd,
1012 NULL, NULL);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001013}
1014
1015int API minijail_run_pid_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001016 char *const argv[], pid_t *pchild_pid,
1017 int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001018{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001019 return minijail_run_pid_pipes(j, filename, argv, pchild_pid, pstdin_fd,
1020 NULL, NULL);
1021}
1022
1023int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001024 char *const argv[], pid_t *pchild_pid,
1025 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001026{
Elly Jonese1749eb2011-10-07 13:54:59 -04001027 char *oldenv, *oldenv_copy = NULL;
1028 pid_t child_pid;
1029 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001030 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001031 int stdout_fds[2];
1032 int stderr_fds[2];
Elly Jonese1749eb2011-10-07 13:54:59 -04001033 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001034 /* We need to remember this across the minijail_preexec() call. */
1035 int pid_namespace = j->flags.pids;
Ben Chan541c7e52011-08-26 14:55:53 -07001036
Elly Jonese1749eb2011-10-07 13:54:59 -04001037 oldenv = getenv(kLdPreloadEnvVar);
1038 if (oldenv) {
1039 oldenv_copy = strdup(oldenv);
1040 if (!oldenv_copy)
1041 return -ENOMEM;
1042 }
Will Drewryf89aef52011-09-16 16:48:57 -05001043
Elly Jonese1749eb2011-10-07 13:54:59 -04001044 if (setup_preload())
1045 return -EFAULT;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001046
Elly Jonesdd3e8512012-01-23 15:13:38 -05001047 /*
1048 * Before we fork(2) and execve(2) the child process, we need to open
Elly Jonese1749eb2011-10-07 13:54:59 -04001049 * a pipe(2) to send the minijail configuration over.
1050 */
1051 if (setup_pipe(pipe_fds))
1052 return -EFAULT;
Elly Jonescd7a9042011-07-22 13:56:51 -04001053
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001054 /*
1055 * If we want to write to the child process' standard input,
1056 * create the pipe(2) now.
1057 */
1058 if (pstdin_fd) {
1059 if (pipe(stdin_fds))
1060 return -EFAULT;
1061 }
1062
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001063 /*
1064 * If we want to read from the child process' standard output,
1065 * create the pipe(2) now.
1066 */
1067 if (pstdout_fd) {
1068 if (pipe(stdout_fds))
1069 return -EFAULT;
1070 }
1071
1072 /*
1073 * If we want to read from the child process' standard error,
1074 * create the pipe(2) now.
1075 */
1076 if (pstderr_fd) {
1077 if (pipe(stderr_fds))
1078 return -EFAULT;
1079 }
1080
Elly Jones761b7412012-06-13 15:49:52 -04001081 /* Use sys_clone() if and only if we're creating a pid namespace.
1082 *
1083 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1084 *
1085 * In multithreaded programs, there are a bunch of locks inside libc,
1086 * some of which may be held by other threads at the time that we call
1087 * minijail_run_pid(). If we call fork(), glibc does its level best to
1088 * ensure that we hold all of these locks before it calls clone()
1089 * internally and drop them after clone() returns, but when we call
1090 * sys_clone(2) directly, all that gets bypassed and we end up with a
1091 * child address space where some of libc's important locks are held by
1092 * other threads (which did not get cloned, and hence will never release
1093 * those locks). This is okay so long as we call exec() immediately
1094 * after, but a bunch of seemingly-innocent libc functions like setenv()
1095 * take locks.
1096 *
1097 * Hence, only call sys_clone() if we need to, in order to get at pid
1098 * namespacing. If we follow this path, the child's address space might
1099 * have broken locks; you may only call functions that do not acquire
1100 * any locks.
1101 *
1102 * Unfortunately, fork() acquires every lock it can get its hands on, as
1103 * previously detailed, so this function is highly likely to deadlock
1104 * later on (see "deadlock here") if we're multithreaded.
1105 *
1106 * We might hack around this by having the clone()d child (init of the
1107 * pid namespace) return directly, rather than leaving the clone()d
1108 * process hanging around to be init for the new namespace (and having
1109 * its fork()ed child return in turn), but that process would be crippled
1110 * with its libc locks potentially broken. We might try fork()ing in the
1111 * parent before we clone() to ensure that we own all the locks, but
1112 * then we have to have the forked child hanging around consuming
1113 * resources (and possibly having file descriptors / shared memory
1114 * regions / etc attached). We'd need to keep the child around to avoid
1115 * having its children get reparented to init.
1116 *
1117 * TODO(ellyjones): figure out if the "forked child hanging around"
1118 * problem is fixable or not. It would be nice if we worked in this
1119 * case.
1120 */
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001121 if (pid_namespace)
Elly Jones761b7412012-06-13 15:49:52 -04001122 child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1123 else
1124 child_pid = fork();
1125
Elly Jonese1749eb2011-10-07 13:54:59 -04001126 if (child_pid < 0) {
1127 free(oldenv_copy);
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001128 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04001129 }
Will Drewryf89aef52011-09-16 16:48:57 -05001130
Elly Jonese1749eb2011-10-07 13:54:59 -04001131 if (child_pid) {
1132 /* Restore parent's LD_PRELOAD. */
1133 if (oldenv_copy) {
1134 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1135 free(oldenv_copy);
1136 } else {
1137 unsetenv(kLdPreloadEnvVar);
1138 }
1139 unsetenv(kFdEnvVar);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001140
Elly Jonese1749eb2011-10-07 13:54:59 -04001141 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001142
1143 /* Send marshalled minijail. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001144 close(pipe_fds[0]); /* read endpoint */
1145 ret = minijail_to_fd(j, pipe_fds[1]);
1146 close(pipe_fds[1]); /* write endpoint */
1147 if (ret) {
1148 kill(j->initpid, SIGKILL);
1149 die("failed to send marshalled minijail");
1150 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001151
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001152 if (pchild_pid)
1153 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001154
1155 /*
1156 * If we want to write to the child process' standard input,
1157 * set up the write end of the pipe.
1158 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001159 if (pstdin_fd)
1160 *pstdin_fd = setup_pipe_end(stdin_fds,
1161 1 /* write end */);
1162
1163 /*
1164 * If we want to read from the child process' standard output,
1165 * set up the read end of the pipe.
1166 */
1167 if (pstdout_fd)
1168 *pstdout_fd = setup_pipe_end(stdout_fds,
1169 0 /* read end */);
1170
1171 /*
1172 * If we want to read from the child process' standard error,
1173 * set up the read end of the pipe.
1174 */
1175 if (pstderr_fd)
1176 *pstderr_fd = setup_pipe_end(stderr_fds,
1177 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001178
Elly Jonese1749eb2011-10-07 13:54:59 -04001179 return 0;
1180 }
1181 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07001182
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001183 /*
1184 * If we want to write to the jailed process' standard input,
1185 * set up the read end of the pipe.
1186 */
1187 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001188 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1189 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001190 die("failed to set up stdin pipe");
1191 }
1192
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001193 /*
1194 * If we want to read from the jailed process' standard output,
1195 * set up the write end of the pipe.
1196 */
1197 if (pstdout_fd) {
1198 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1199 STDOUT_FILENO) < 0)
1200 die("failed to set up stdout pipe");
1201 }
1202
1203 /*
1204 * If we want to read from the jailed process' standard error,
1205 * set up the write end of the pipe.
1206 */
1207 if (pstderr_fd) {
1208 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1209 STDERR_FILENO) < 0)
1210 die("failed to set up stderr pipe");
1211 }
1212
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001213 /* Strip out flags that cannot be inherited across execve. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001214 minijail_preexec(j);
1215 /* Jail this process and its descendants... */
1216 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001217
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001218 if (pid_namespace) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001219 /*
1220 * pid namespace: this process will become init inside the new
Elly Jonese1749eb2011-10-07 13:54:59 -04001221 * namespace, so fork off a child to actually run the program
1222 * (we don't want all programs we might exec to have to know
1223 * how to be init).
Elly Jones761b7412012-06-13 15:49:52 -04001224 *
1225 * If we're multithreaded, we'll probably deadlock here. See
1226 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04001227 */
1228 child_pid = fork();
1229 if (child_pid < 0)
1230 _exit(child_pid);
1231 else if (child_pid > 0)
1232 init(child_pid); /* never returns */
1233 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001234
Elly Jonesdd3e8512012-01-23 15:13:38 -05001235 /*
1236 * If we aren't pid-namespaced:
Elly Jonese1749eb2011-10-07 13:54:59 -04001237 * calling process
1238 * -> execve()-ing process
1239 * If we are:
1240 * calling process
1241 * -> init()-ing process
1242 * -> execve()-ing process
1243 */
1244 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -04001245}
1246
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001247int API minijail_run_static(struct minijail *j, const char *filename,
1248 char *const argv[])
1249{
1250 pid_t child_pid;
1251 int pid_namespace = j->flags.pids;
1252
1253 if (j->flags.caps)
1254 die("caps not supported with static targets");
1255
1256 if (pid_namespace)
1257 child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
1258 else
1259 child_pid = fork();
1260
1261 if (child_pid < 0) {
1262 die("failed to fork child");
1263 }
1264 if (child_pid > 0 ) {
1265 j->initpid = child_pid;
1266 return 0;
1267 }
1268
1269 /*
1270 * We can now drop this child into the sandbox
1271 * then execve the target.
1272 */
1273
1274 j->flags.pids = 0;
1275 minijail_enter(j);
1276
1277 if (pid_namespace) {
1278 /*
1279 * pid namespace: this process will become init inside the new
1280 * namespace, so fork off a child to actually run the program
1281 * (we don't want all programs we might exec to have to know
1282 * how to be init).
1283 *
1284 * If we're multithreaded, we'll probably deadlock here. See
1285 * WARNING above.
1286 */
1287 child_pid = fork();
1288 if (child_pid < 0)
1289 _exit(child_pid);
1290 else if (child_pid > 0)
1291 init(child_pid); /* never returns */
1292 }
1293
1294 _exit(execve(filename, argv, environ));
1295}
1296
Will Drewry6ac91122011-10-21 16:38:58 -05001297int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001298{
1299 int st;
1300 if (kill(j->initpid, SIGTERM))
1301 return -errno;
1302 if (waitpid(j->initpid, &st, 0) < 0)
1303 return -errno;
1304 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04001305}
1306
Will Drewry6ac91122011-10-21 16:38:58 -05001307int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001308{
1309 int st;
1310 if (waitpid(j->initpid, &st, 0) < 0)
1311 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001312
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001313 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001314 int error_status = st;
1315 if (WIFSIGNALED(st)) {
1316 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07001317 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001318 j->initpid, signum);
1319 /*
1320 * We return MINIJAIL_ERR_JAIL if the process received
1321 * SIGSYS, which happens when a syscall is blocked by
1322 * seccomp filters.
1323 * If not, we do what bash(1) does:
1324 * $? = 128 + signum
1325 */
1326 if (signum == SIGSYS) {
1327 error_status = MINIJAIL_ERR_JAIL;
1328 } else {
1329 error_status = 128 + signum;
1330 }
1331 }
1332 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001333 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001334
1335 int exit_status = WEXITSTATUS(st);
1336 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07001337 info("child process %d exited with status %d",
1338 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001339
1340 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04001341}
1342
Will Drewry6ac91122011-10-21 16:38:58 -05001343void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001344{
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001345 if (j->flags.seccomp_filter && j->filter_prog) {
1346 free(j->filter_prog->filter);
1347 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04001348 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001349 while (j->bindings_head) {
1350 struct binding *b = j->bindings_head;
1351 j->bindings_head = j->bindings_head->next;
1352 free(b->dest);
1353 free(b->src);
1354 free(b);
1355 }
1356 j->bindings_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001357 if (j->user)
1358 free(j->user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001359 if (j->chrootdir)
1360 free(j->chrootdir);
Elly Jonese1749eb2011-10-07 13:54:59 -04001361 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001362}