blob: b170db033ab594f11e8cd9fc13de94bf2ad5e228 [file] [log] [blame]
/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07008
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08009#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050010#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040011#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070012#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <grp.h>
14#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050015#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050020#include <stdarg.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080021#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040022#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
25#include <syscall.h>
26#include <sys/capability.h>
27#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050028#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040029#include <sys/prctl.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070030#include <sys/stat.h>
31#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080032#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040033#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040034#include <unistd.h>
35
36#include "libminijail.h"
37#include "libminijail-private.h"
38
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070039#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080040#include "syscall_filter.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070041#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080042
Lei Zhangeee31552012-10-17 21:27:10 -070043#ifdef HAVE_SECUREBITS_H
44#include <linux/securebits.h>
45#else
46#define SECURE_ALL_BITS 0x15
47#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
48#endif
49
Will Drewry32ac9f52011-08-18 21:36:27 -050050/* Until these are reliably available in linux/prctl.h */
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080051#ifndef PR_SET_SECCOMP
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -070052# define PR_SET_SECCOMP 22
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080053#endif
54
55/* For seccomp_filter using BPF. */
56#ifndef PR_SET_NO_NEW_PRIVS
57# define PR_SET_NO_NEW_PRIVS 38
58#endif
59#ifndef SECCOMP_MODE_FILTER
60# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
Will Drewry32ac9f52011-08-18 21:36:27 -050061#endif
62
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -070063#ifdef USE_SECCOMP_SOFTFAIL
64# define SECCOMP_SOFTFAIL 1
65#else
66# define SECCOMP_SOFTFAIL 0
67#endif
68
/* One requested bind mount; bindings form a singly linked list. */
struct binding {
	char *src;		/* source path handed to mount(2) */
	char *dest;		/* destination; minijail_bind() requires a leading '/' */
	int writeable;		/* zero: bind_one() remounts the bind read-only */
	struct binding *next;	/* next binding, NULL at the tail */
};
75
/* Complete description of a jail: requested features plus their parameters. */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are unsigned: a signed one-bit bit-field cannot portably
	 * hold the value 1 that the setters store into it.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int net:1;
		unsigned int userns:1;
		unsigned int seccomp:1;
		unsigned int readonly:1;
		unsigned int usergroups:1;
		unsigned int ptrace:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int mount_tmp:1;
		unsigned int do_init:1;
	} flags;
	uid_t uid;		/* uid to switch to when flags.uid is set */
	gid_t gid;		/* gid to switch to when flags.gid is set */
	gid_t usergid;		/* pw_gid of |user|, passed to initgroups(3) */
	char *user;		/* heap copy of the user name, or NULL */
	uint64_t caps;		/* capability mask given to minijail_use_caps() */
	pid_t initpid;		/* pid used to build /proc/<pid>/{uid,gid}_map */
	int mountns_fd;		/* fd opened by minijail_namespace_enter_vfs() */
	int filter_len;		/* instruction count of |filter_prog| */
	int binding_count;	/* number of entries in the bindings list */
	char *chrootdir;	/* heap copy of the chroot directory, or NULL */
	char *uidmap;		/* heap copy of the uid_map contents, or NULL */
	char *gidmap;		/* heap copy of the gid_map contents, or NULL */
	struct sock_fprog *filter_prog;	/* compiled seccomp BPF program */
	struct binding *bindings_head;
	struct binding *bindings_tail;
};
117
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700118/*
119 * Strip out flags meant for the parent.
120 * We keep things that are not inherited across execve(2) (e.g. capabilities),
121 * or are easier to set after execve(2) (e.g. seccomp filters).
122 */
123void minijail_preenter(struct minijail *j)
124{
125 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700126 j->flags.enter_vfs = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700127 j->flags.readonly = 0;
128 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800129 j->flags.do_init = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700130}
131
132/*
133 * Strip out flags meant for the child.
134 * We keep things that are inherited across execve(2).
135 */
136void minijail_preexec(struct minijail *j)
137{
138 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700139 int enter_vfs = j->flags.enter_vfs;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700140 int readonly = j->flags.readonly;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800141 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700142 if (j->user)
143 free(j->user);
144 j->user = NULL;
145 memset(&j->flags, 0, sizeof(j->flags));
146 /* Now restore anything we meant to keep. */
147 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700148 j->flags.enter_vfs = enter_vfs;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700149 j->flags.readonly = readonly;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800150 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700151 /* Note, |pids| will already have been used before this call. */
152}
153
154/* Minijail API. */
155
Will Drewry6ac91122011-10-21 16:38:58 -0500156struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400157{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400158 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400159}
160
Will Drewry6ac91122011-10-21 16:38:58 -0500161void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400162{
163 if (uid == 0)
164 die("useless change to uid 0");
165 j->uid = uid;
166 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400167}
168
Will Drewry6ac91122011-10-21 16:38:58 -0500169void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400170{
171 if (gid == 0)
172 die("useless change to gid 0");
173 j->gid = gid;
174 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400175}
176
Will Drewry6ac91122011-10-21 16:38:58 -0500177int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400178{
179 char *buf = NULL;
180 struct passwd pw;
181 struct passwd *ppw = NULL;
182 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
183 if (sz == -1)
184 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400185
Elly Jonesdd3e8512012-01-23 15:13:38 -0500186 /*
187 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400188 * the maximum needed size of the buffer, so we don't have to search.
189 */
190 buf = malloc(sz);
191 if (!buf)
192 return -ENOMEM;
193 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500194 /*
195 * We're safe to free the buffer here. The strings inside pw point
196 * inside buf, but we don't use any of them; this leaves the pointers
197 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
198 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400199 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700200 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400201 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700202 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400203 minijail_change_uid(j, ppw->pw_uid);
204 j->user = strdup(user);
205 if (!j->user)
206 return -ENOMEM;
207 j->usergid = ppw->pw_gid;
208 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400209}
210
Will Drewry6ac91122011-10-21 16:38:58 -0500211int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400212{
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -0700213 char *buf = NULL;
Yabin Cui1b21c8f2015-07-22 10:34:45 -0700214 struct group gr;
215 struct group *pgr = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400216 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
217 if (sz == -1)
218 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400219
Elly Jonesdd3e8512012-01-23 15:13:38 -0500220 /*
221 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400222 * the maximum needed size of the buffer, so we don't have to search.
223 */
224 buf = malloc(sz);
225 if (!buf)
226 return -ENOMEM;
227 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500228 /*
229 * We're safe to free the buffer here. The strings inside gr point
230 * inside buf, but we don't use any of them; this leaves the pointers
231 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
232 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400233 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700234 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400235 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700236 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400237 minijail_change_gid(j, pgr->gr_gid);
238 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400239}
240
Will Drewry6ac91122011-10-21 16:38:58 -0500241void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400242{
243 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400244}
245
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700246void API minijail_no_new_privs(struct minijail *j)
247{
248 j->flags.no_new_privs = 1;
249}
250
Will Drewry6ac91122011-10-21 16:38:58 -0500251void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400252{
253 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500254}
255
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700256void API minijail_log_seccomp_filter_failures(struct minijail *j)
257{
258 j->flags.log_seccomp_filter = 1;
259}
260
Will Drewry6ac91122011-10-21 16:38:58 -0500261void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400262{
263 j->caps = capmask;
264 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400265}
266
Will Drewry6ac91122011-10-21 16:38:58 -0500267void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400268{
269 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400270}
271
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700272void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
273{
274 int ns_fd = open(ns_path, O_RDONLY);
275 if (ns_fd < 0) {
276 pdie("failed to open namespace '%s'", ns_path);
277 }
278 j->mountns_fd = ns_fd;
279 j->flags.enter_vfs = 1;
280}
281
Will Drewry6ac91122011-10-21 16:38:58 -0500282void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400283{
Elly Jonese58176c2012-01-23 11:46:17 -0500284 j->flags.vfs = 1;
285 j->flags.readonly = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400286 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800287 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400288}
289
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400290void API minijail_namespace_net(struct minijail *j)
291{
292 j->flags.net = 1;
293}
294
Will Drewry6ac91122011-10-21 16:38:58 -0500295void API minijail_remount_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400296{
297 j->flags.vfs = 1;
298 j->flags.readonly = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400299}
300
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800301void API minijail_namespace_user(struct minijail *j)
302{
303 j->flags.userns = 1;
304}
305
306int API minijail_uidmap(struct minijail *j, const char *uidmap)
307{
308 j->uidmap = strdup(uidmap);
309 if (!j->uidmap)
310 return -ENOMEM;
311 return 0;
312}
313
314int API minijail_gidmap(struct minijail *j, const char *gidmap)
315{
316 j->gidmap = strdup(gidmap);
317 if (!j->gidmap)
318 return -ENOMEM;
319 return 0;
320}
321
Will Drewry6ac91122011-10-21 16:38:58 -0500322void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400323{
324 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400325}
326
Will Drewry6ac91122011-10-21 16:38:58 -0500327void API minijail_disable_ptrace(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400328{
329 j->flags.ptrace = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400330}
331
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800332void API minijail_run_as_init(struct minijail *j)
333{
334 /*
335 * Since the jailed program will become 'init' in the new PID namespace,
336 * Minijail does not need to fork an 'init' process.
337 */
338 j->flags.do_init = 0;
339}
340
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700341int API minijail_enter_chroot(struct minijail *j, const char *dir)
342{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400343 if (j->chrootdir)
344 return -EINVAL;
345 j->chrootdir = strdup(dir);
346 if (!j->chrootdir)
347 return -ENOMEM;
348 j->flags.chroot = 1;
349 return 0;
350}
351
Lee Campbell11af0622014-05-22 12:36:04 -0700352void API minijail_mount_tmp(struct minijail *j)
353{
354 j->flags.mount_tmp = 1;
355}
356
Will Drewry6ac91122011-10-21 16:38:58 -0500357int API minijail_bind(struct minijail *j, const char *src, const char *dest,
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700358 int writeable)
359{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400360 struct binding *b;
361
362 if (*dest != '/')
363 return -EINVAL;
364 b = calloc(1, sizeof(*b));
365 if (!b)
366 return -ENOMEM;
367 b->dest = strdup(dest);
368 if (!b->dest)
369 goto error;
370 b->src = strdup(src);
371 if (!b->src)
372 goto error;
373 b->writeable = writeable;
374
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700375 info("bind %s -> %s", src, dest);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400376
Elly Jonesdd3e8512012-01-23 15:13:38 -0500377 /*
378 * Force vfs namespacing so the bind mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400379 * containing vfs namespace.
380 */
381 minijail_namespace_vfs(j);
382
383 if (j->bindings_tail)
384 j->bindings_tail->next = b;
385 else
386 j->bindings_head = b;
387 j->bindings_tail = b;
388 j->binding_count++;
389
390 return 0;
391
392error:
393 free(b->src);
394 free(b->dest);
395 free(b);
396 return -ENOMEM;
397}
398
Will Drewry6ac91122011-10-21 16:38:58 -0500399void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
Elly Jonese1749eb2011-10-07 13:54:59 -0400400{
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700401 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
402 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
403 warn("not loading seccomp filter, seccomp not supported");
404 return;
405 }
406 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400407 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800408 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700409 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400410 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800411
412 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700413 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
414 die("failed to compile seccomp filter BPF program in '%s'",
415 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800416 }
417
418 j->filter_len = fprog->len;
419 j->filter_prog = fprog;
420
Elly Jonese1749eb2011-10-07 13:54:59 -0400421 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500422}
423
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};
429
Will Drewry6ac91122011-10-21 16:38:58 -0500430void marshal_state_init(struct marshal_state *state,
431 char *buf, size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400432{
433 state->available = available;
434 state->buf = buf;
435 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500436}
437
Will Drewry6ac91122011-10-21 16:38:58 -0500438void marshal_append(struct marshal_state *state,
439 char *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400440{
441 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500442
Elly Jonese1749eb2011-10-07 13:54:59 -0400443 /* Up to |available| will be written. */
444 if (copy_len) {
445 memcpy(state->buf, src, copy_len);
446 state->buf += copy_len;
447 state->available -= copy_len;
448 }
449 /* |total| will contain the expected length. */
450 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500451}
452
Will Drewry6ac91122011-10-21 16:38:58 -0500453void minijail_marshal_helper(struct marshal_state *state,
454 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400455{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400456 struct binding *b = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400457 marshal_append(state, (char *)j, sizeof(*j));
458 if (j->user)
459 marshal_append(state, j->user, strlen(j->user) + 1);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400460 if (j->chrootdir)
461 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800462 if (j->flags.seccomp_filter && j->filter_prog) {
463 struct sock_fprog *fp = j->filter_prog;
464 marshal_append(state, (char *)fp->filter,
465 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400466 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400467 for (b = j->bindings_head; b; b = b->next) {
468 marshal_append(state, b->src, strlen(b->src) + 1);
469 marshal_append(state, b->dest, strlen(b->dest) + 1);
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700470 marshal_append(state, (char *)&b->writeable,
471 sizeof(b->writeable));
Elly Jones51a5b6c2011-10-12 19:09:26 -0400472 }
Will Drewryf89aef52011-09-16 16:48:57 -0500473}
474
Will Drewry6ac91122011-10-21 16:38:58 -0500475size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400476{
477 struct marshal_state state;
478 marshal_state_init(&state, NULL, 0);
479 minijail_marshal_helper(&state, j);
480 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500481}
482
Elly Jonese1749eb2011-10-07 13:54:59 -0400483int minijail_marshal(const struct minijail *j, char *buf, size_t available)
484{
485 struct marshal_state state;
486 marshal_state_init(&state, buf, available);
487 minijail_marshal_helper(&state, j);
488 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500489}
490
/* consumebytes: consumes @length bytes from a buffer @buf of length @buflength
 * @length    Number of bytes to consume
 * @buf       Buffer to consume from
 * @buflength Size of @buf
 *
 * On success @buf is advanced and @buflength reduced by @length; on failure
 * both are left untouched.
 *
 * Returns a pointer to the base of the bytes, or NULL for errors.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *p = *buf;
	if (length > *buflength)
		return NULL;
	*buf += length;
	*buflength -= length;
	return p;
}
507
/* consumestr: consumes a C string from a buffer @buf of length @buflength
 * @buf       Buffer to consume
 * @buflength Length of buffer
 *
 * The terminating NUL is consumed together with the string.
 *
 * Returns a pointer to the base of the string, or NULL for errors.
 */
char *consumestr(char **buf, size_t *buflength)
{
	size_t len = strnlen(*buf, *buflength);
	if (len == *buflength)
		/* There's no null-terminator */
		return NULL;
	return consumebytes(len + 1, buf, buflength);
}
522
Elly Jonese1749eb2011-10-07 13:54:59 -0400523int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
524{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400525 int i;
526 int count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500527 int ret = -EINVAL;
528
Elly Jonese1749eb2011-10-07 13:54:59 -0400529 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500530 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400531 memcpy((void *)j, serialized, sizeof(*j));
532 serialized += sizeof(*j);
533 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500534
Will Drewrybee7ba72011-10-21 20:47:01 -0500535 /* Potentially stale pointers not used as signals. */
536 j->bindings_head = NULL;
537 j->bindings_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800538 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500539
Elly Jonese1749eb2011-10-07 13:54:59 -0400540 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400541 char *user = consumestr(&serialized, &length);
542 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500543 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400544 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500545 if (!j->user)
546 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400547 }
Will Drewryf89aef52011-09-16 16:48:57 -0500548
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400549 if (j->chrootdir) { /* stale pointer */
550 char *chrootdir = consumestr(&serialized, &length);
551 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500552 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400553 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500554 if (!j->chrootdir)
555 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400556 }
557
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800558 if (j->flags.seccomp_filter && j->filter_len > 0) {
559 size_t ninstrs = j->filter_len;
560 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
561 ninstrs > USHRT_MAX)
562 goto bad_filters;
563
564 size_t program_len = ninstrs * sizeof(struct sock_filter);
565 void *program = consumebytes(program_len, &serialized, &length);
566 if (!program)
567 goto bad_filters;
568
569 j->filter_prog = malloc(sizeof(struct sock_fprog));
570 j->filter_prog->len = ninstrs;
571 j->filter_prog->filter = malloc(program_len);
572 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400573 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400574
575 count = j->binding_count;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400576 j->binding_count = 0;
577 for (i = 0; i < count; ++i) {
578 int *writeable;
579 const char *dest;
580 const char *src = consumestr(&serialized, &length);
581 if (!src)
Will Drewrybee7ba72011-10-21 20:47:01 -0500582 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400583 dest = consumestr(&serialized, &length);
584 if (!dest)
Will Drewrybee7ba72011-10-21 20:47:01 -0500585 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400586 writeable = consumebytes(sizeof(*writeable), &serialized, &length);
587 if (!writeable)
Will Drewrybee7ba72011-10-21 20:47:01 -0500588 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400589 if (minijail_bind(j, src, dest, *writeable))
Will Drewrybee7ba72011-10-21 20:47:01 -0500590 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400591 }
592
Elly Jonese1749eb2011-10-07 13:54:59 -0400593 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500594
595bad_bindings:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800596 if (j->flags.seccomp_filter && j->filter_len > 0) {
597 free(j->filter_prog->filter);
598 free(j->filter_prog);
599 }
Will Drewrybee7ba72011-10-21 20:47:01 -0500600bad_filters:
601 if (j->chrootdir)
602 free(j->chrootdir);
603bad_chrootdir:
604 if (j->user)
605 free(j->user);
606clear_pointers:
607 j->user = NULL;
608 j->chrootdir = NULL;
609out:
610 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500611}
612
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800613static void write_ugid_mappings(const struct minijail *j, int *pipe_fds)
614{
615 int fd, ret, len;
616 size_t sz;
617 char fname[32];
618 close(pipe_fds[0]);
619
620 sz = sizeof(fname);
621 if (j->uidmap) {
622 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
623 if (ret < 0 || ret >= sz)
624 die("failed to write file name of uid_map");
625 fd = open(fname, O_WRONLY);
626 if (fd < 0)
627 pdie("failed to open '%s'", fname);
628 len = strlen(j->uidmap);
629 if (write(fd, j->uidmap, len) < len)
630 die("failed to set uid_map");
631 close(fd);
632 }
633 if (j->gidmap) {
634 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
635 if (ret < 0 || ret >= sz)
636 die("failed to write file name of gid_map");
637 fd = open(fname, O_WRONLY);
638 if (fd < 0)
639 pdie("failed to open '%s'", fname);
640 len = strlen(j->gidmap);
641 if (write(fd, j->gidmap, len) < len)
642 die("failed to set gid_map");
643 close(fd);
644 }
645
646 close(pipe_fds[1]);
647}
648
649static void enter_user_namespace(const struct minijail *j, int *pipe_fds)
650{
651 char buf;
652
653 close(pipe_fds[1]);
654
655 /* Wait for parent to set up uid/gid mappings. */
656 if (read(pipe_fds[0], &buf, 1) != 0)
657 die("failed to sync with parent");
658 close(pipe_fds[0]);
659
660 if (j->uidmap && setresuid(0, 0, 0))
661 pdie("setresuid");
662 if (j->gidmap && setresgid(0, 0, 0))
663 pdie("setresgid");
664}
665
Elly Jones51a5b6c2011-10-12 19:09:26 -0400666/* bind_one: Applies bindings from @b for @j, recursing as needed.
667 * @j Minijail these bindings are for
668 * @b Head of list of bindings
669 *
670 * Returns 0 for success.
671 */
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700672int bind_one(const struct minijail *j, struct binding *b)
673{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400674 int ret = 0;
675 char *dest = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400676 if (ret)
677 return ret;
678 /* dest has a leading "/" */
679 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
680 return -ENOMEM;
Elly Jonesa1059632011-12-15 15:17:07 -0500681 ret = mount(b->src, dest, NULL, MS_BIND, NULL);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400682 if (ret)
683 pdie("bind: %s -> %s", b->src, dest);
Elly Jonesa1059632011-12-15 15:17:07 -0500684 if (!b->writeable) {
685 ret = mount(b->src, dest, NULL,
Jorge Lucangeli Obes2f61ee42014-06-16 11:08:18 -0700686 MS_BIND | MS_REMOUNT | MS_RDONLY, NULL);
Elly Jonesa1059632011-12-15 15:17:07 -0500687 if (ret)
688 pdie("bind ro: %s -> %s", b->src, dest);
689 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400690 free(dest);
691 if (b->next)
692 return bind_one(j, b->next);
693 return ret;
694}
695
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700696int enter_chroot(const struct minijail *j)
697{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400698 int ret;
699 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
700 return ret;
701
702 if (chroot(j->chrootdir))
703 return -errno;
704
705 if (chdir("/"))
706 return -errno;
707
708 return 0;
709}
710
/*
 * Mounts a tmpfs on /tmp with the options "size=64M,mode=777".
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777");
}
715
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800716int remount_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400717{
718 const char *kProcPath = "/proc";
719 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -0500720 /*
721 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -0400722 * /proc in our namespace, which means using MS_REMOUNT here would
723 * mutate our parent's mount as well, even though we're in a VFS
724 * namespace (!). Instead, remove their mount from our namespace
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800725 * and make our own. However, if we are in a new user namespace, /proc
726 * is not seen as mounted, so don't return error if umount() fails.
Elly Jonese1749eb2011-10-07 13:54:59 -0400727 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800728 if (umount(kProcPath) && !j->flags.userns)
Elly Jonese1749eb2011-10-07 13:54:59 -0400729 return -errno;
730 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
731 return -errno;
732 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400733}
734
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700735void drop_ugid(const struct minijail *j)
736{
737 if (j->flags.usergroups) {
738 if (initgroups(j->user, j->usergid))
739 pdie("initgroups");
740 } else {
741 /* Only attempt to clear supplemental groups if we are changing
742 * users. */
743 if ((j->uid || j->gid) && setgroups(0, NULL))
744 pdie("setgroups");
745 }
746
747 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
748 pdie("setresgid");
749
750 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
751 pdie("setresuid");
752}
753
/*
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel). So suck up the answer via /proc.
 *
 * Returns non-zero when @cap is no greater than the value read from
 * /proc/sys/kernel/cap_last_cap. The value is kept in a static variable
 * and re-read only while that variable is still 0. Failure to open or
 * parse the file terminates the process via pdie().
 */
static int run_cap_valid(unsigned int cap)
{
	static unsigned int last_cap;

	if (!last_cap) {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* fscanf() must not be handed a NULL stream. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}

	return cap <= last_cap;
}
775
Will Drewry6ac91122011-10-21 16:38:58 -0500776void drop_caps(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400777{
778 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -0800779 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -0800780 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400781 unsigned int i;
782 if (!caps)
783 die("can't get process caps");
784 if (cap_clear_flag(caps, CAP_INHERITABLE))
785 die("can't clear inheritable caps");
786 if (cap_clear_flag(caps, CAP_EFFECTIVE))
787 die("can't clear effective caps");
788 if (cap_clear_flag(caps, CAP_PERMITTED))
789 die("can't clear permitted caps");
Mike Frysinger3adfef72013-05-09 17:19:08 -0400790 for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
Kees Cook323878a2013-02-05 15:35:24 -0800791 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800792 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -0400793 continue;
Kees Cook323878a2013-02-05 15:35:24 -0800794 flag[0] = i;
795 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400796 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -0800797 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400798 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -0800799 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400800 die("can't add inheritable cap");
801 }
802 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -0800803 die("can't apply initial cleaned capset");
804
805 /*
806 * Instead of dropping bounding set first, do it here in case
807 * the caller had a more permissive bounding set which could
808 * have been used above to raise a capability that wasn't already
809 * present. This requires CAP_SETPCAP, so we raised/kept it above.
810 */
Mike Frysinger3adfef72013-05-09 17:19:08 -0400811 for (i = 0; i < sizeof(j->caps) * 8 && run_cap_valid(i); ++i) {
Kees Cooke5609ac2013-02-06 14:12:41 -0800812 if (j->caps & (one << i))
Elly Jonese1749eb2011-10-07 13:54:59 -0400813 continue;
814 if (prctl(PR_CAPBSET_DROP, i))
815 pdie("prctl(PR_CAPBSET_DROP)");
816 }
Kees Cook323878a2013-02-05 15:35:24 -0800817
818 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800819 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -0800820 flag[0] = CAP_SETPCAP;
821 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
822 die("can't clear effective cap");
823 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
824 die("can't clear permitted cap");
825 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
826 die("can't clear inheritable cap");
827 }
828
829 if (cap_set_proc(caps))
830 die("can't apply final cleaned capset");
831
832 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -0400833}
834
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700835void set_seccomp_filter(const struct minijail *j)
836{
837 /*
838 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
839 * in the kernel source tree for an explanation of the parameters.
840 */
841 if (j->flags.no_new_privs) {
842 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
843 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
844 }
845
846 /*
847 * If we're logging seccomp filter failures,
848 * install the SIGSYS handler first.
849 */
850 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
851 if (install_sigsys_handler())
852 pdie("install SIGSYS handler");
853 warn("logging seccomp filter failures");
854 }
855
856 /*
857 * Install the syscall filter.
858 */
859 if (j->flags.seccomp_filter) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700860 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) {
861 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
862 warn("seccomp not supported");
863 return;
864 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700865 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700866 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700867 }
868}
869
Will Drewry6ac91122011-10-21 16:38:58 -0500870void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400871{
872 if (j->flags.pids)
873 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700874 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -0400875
Elly Jonese1749eb2011-10-07 13:54:59 -0400876 if (j->flags.usergroups && !j->user)
877 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -0400878
Elly Jonesdd3e8512012-01-23 15:13:38 -0500879 /*
880 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -0400881 * so we don't even try. If any of our operations fail, we abort() the
882 * entire process.
883 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700884 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
885 pdie("setns(CLONE_NEWNS)");
886
Elly Jonese1749eb2011-10-07 13:54:59 -0400887 if (j->flags.vfs && unshare(CLONE_NEWNS))
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400888 pdie("unshare(vfs)");
889
890 if (j->flags.net && unshare(CLONE_NEWNET))
891 pdie("unshare(net)");
Elly Jonescd7a9042011-07-22 13:56:51 -0400892
Elly Jones51a5b6c2011-10-12 19:09:26 -0400893 if (j->flags.chroot && enter_chroot(j))
894 pdie("chroot");
895
Jorge Lucangeli Obes3901da62015-03-03 13:55:11 -0800896 if (j->flags.mount_tmp && mount_tmp())
Lee Campbell11af0622014-05-22 12:36:04 -0700897 pdie("mount_tmp");
898
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800899 if (j->flags.readonly && remount_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -0400900 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -0400901
Elly Jonese1749eb2011-10-07 13:54:59 -0400902 if (j->flags.caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -0500903 /*
904 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -0400905 * capability to change uids, our attempt to use setuid()
906 * below will fail. Hang on to root caps across setuid(), then
907 * lock securebits.
908 */
909 if (prctl(PR_SET_KEEPCAPS, 1))
910 pdie("prctl(PR_SET_KEEPCAPS)");
911 if (prctl
912 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
913 pdie("prctl(PR_SET_SECUREBITS)");
914 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400915
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700916 /*
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700917 * If we're setting no_new_privs, we can drop privileges
918 * before setting seccomp filter. This way filter policies
919 * don't need to allow privilege-dropping syscalls.
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700920 */
921 if (j->flags.no_new_privs) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700922 drop_ugid(j);
923 if (j->flags.caps)
924 drop_caps(j);
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700925
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700926 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -0400927 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700928 /*
929 * If we're not setting no_new_privs,
930 * we need to set seccomp filter *before* dropping privileges.
931 * WARNING: this means that filter policies *must* allow
932 * setgroups()/setresgid()/setresuid() for dropping root and
933 * capget()/capset()/prctl() for dropping caps.
934 */
935 set_seccomp_filter(j);
936
937 drop_ugid(j);
938 if (j->flags.caps)
939 drop_caps(j);
Elly Jonese1749eb2011-10-07 13:54:59 -0400940 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400941
Elly Jonesdd3e8512012-01-23 15:13:38 -0500942 /*
943 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -0400944 * privilege-dropping syscalls :)
945 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700946 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
947 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
948 warn("seccomp not supported");
949 return;
950 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400951 pdie("prctl(PR_SET_SECCOMP)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700952 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400953}
954
/*
 * Wait status recorded by init() for the root process of the jail and read
 * by the SIGTERM handler init_term().  Because it is shared between normal
 * control flow and a signal handler it is a volatile sig_atomic_t.
 */
/* TODO(wad) will visibility affect this variable? */
static volatile sig_atomic_t init_exitstatus = 0;
957
Will Drewry6ac91122011-10-21 16:38:58 -0500958void init_term(int __attribute__ ((unused)) sig)
Elly Jonese1749eb2011-10-07 13:54:59 -0400959{
960 _exit(init_exitstatus);
Elly Jonescd7a9042011-07-22 13:56:51 -0400961}
962
Will Drewry6ac91122011-10-21 16:38:58 -0500963int init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400964{
965 pid_t pid;
966 int status;
967 /* so that we exit with the right status */
968 signal(SIGTERM, init_term);
969 /* TODO(wad) self jail with seccomp_filters here. */
970 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -0500971 /*
972 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -0400973 * left inside our pid namespace or we get a signal.
974 */
975 if (pid == rootpid)
976 init_exitstatus = status;
977 }
978 if (!WIFEXITED(init_exitstatus))
979 _exit(MINIJAIL_ERR_INIT);
980 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -0400981}
982
Will Drewry6ac91122011-10-21 16:38:58 -0500983int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400984{
985 size_t sz = 0;
986 size_t bytes = read(fd, &sz, sizeof(sz));
987 char *buf;
988 int r;
989 if (sizeof(sz) != bytes)
990 return -EINVAL;
991 if (sz > USHRT_MAX) /* Arbitrary sanity check */
992 return -E2BIG;
993 buf = malloc(sz);
994 if (!buf)
995 return -ENOMEM;
996 bytes = read(fd, buf, sz);
997 if (bytes != sz) {
998 free(buf);
999 return -EINVAL;
1000 }
1001 r = minijail_unmarshal(j, buf, sz);
1002 free(buf);
1003 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001004}
1005
Will Drewry6ac91122011-10-21 16:38:58 -05001006int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04001007{
1008 char *buf;
1009 size_t sz = minijail_size(j);
1010 ssize_t written;
1011 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -04001012
Elly Jonese1749eb2011-10-07 13:54:59 -04001013 if (!sz)
1014 return -EINVAL;
1015 buf = malloc(sz);
1016 r = minijail_marshal(j, buf, sz);
1017 if (r) {
1018 free(buf);
1019 return r;
1020 }
1021 /* Sends [size][minijail]. */
1022 written = write(fd, &sz, sizeof(sz));
1023 if (written != sizeof(sz)) {
1024 free(buf);
1025 return -EFAULT;
1026 }
1027 written = write(fd, buf, sz);
1028 if (written < 0 || (size_t) written != sz) {
1029 free(buf);
1030 return -EFAULT;
1031 }
1032 free(buf);
1033 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001034}
Elly Jonescd7a9042011-07-22 13:56:51 -04001035
Will Drewry6ac91122011-10-21 16:38:58 -05001036int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -04001037{
Daniel Erat5b7a3182015-08-19 16:06:22 -06001038#if defined(__ANDROID__)
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001039 /* Don't use LDPRELOAD on Brillo. */
1040 return 0;
1041#else
Elly Jonese1749eb2011-10-07 13:54:59 -04001042 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1043 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1044 if (!newenv)
1045 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -04001046
Elly Jonese1749eb2011-10-07 13:54:59 -04001047 /* Only insert a separating space if we have something to separate... */
1048 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1049 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -04001050
Elly Jonese1749eb2011-10-07 13:54:59 -04001051 /* setenv() makes a copy of the string we give it */
1052 setenv(kLdPreloadEnvVar, newenv, 1);
1053 free(newenv);
1054 return 0;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001055#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04001056}
1057
Will Drewry6ac91122011-10-21 16:38:58 -05001058int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04001059{
1060 int r = pipe(fds);
1061 char fd_buf[11];
1062 if (r)
1063 return r;
1064 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1065 if (r <= 0)
1066 return -EINVAL;
1067 setenv(kFdEnvVar, fd_buf, 1);
1068 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -05001069}
1070
/*
 * setup_pipe_end: keep end |index| of the pipe |fds| and close the other.
 *
 * Returns the descriptor of the kept end, or -1 if |index| is not 0 or 1
 * (in which case nothing is closed).
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other;

	if (index >= 2)
		return -1;

	other = 1 - index;
	close(fds[other]);
	return fds[index];
}
1079
/*
 * setup_and_dupe_pipe_end: close the end of pipe |fds| that is not |index|
 * and duplicate end |index| onto descriptor |fd|.
 *
 * Returns the result of dup2(), or -1 if |index| is not 0 or 1 (in which
 * case nothing is closed).
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	if (index >= 2)
		return -1;

	/* Drop the end this process is not going to use. */
	close(fds[index ? 0 : 1]);
	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
1089
Will Drewry6ac91122011-10-21 16:38:58 -05001090int API minijail_run(struct minijail *j, const char *filename,
1091 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04001092{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001093 return minijail_run_pid_pipes(j, filename, argv,
1094 NULL, NULL, NULL, NULL);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001095}
1096
1097int API minijail_run_pid(struct minijail *j, const char *filename,
1098 char *const argv[], pid_t *pchild_pid)
1099{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001100 return minijail_run_pid_pipes(j, filename, argv, pchild_pid,
1101 NULL, NULL, NULL);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001102}
1103
1104int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001105 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001106{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001107 return minijail_run_pid_pipes(j, filename, argv, NULL, pstdin_fd,
1108 NULL, NULL);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001109}
1110
1111int API minijail_run_pid_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001112 char *const argv[], pid_t *pchild_pid,
1113 int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001114{
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001115 return minijail_run_pid_pipes(j, filename, argv, pchild_pid, pstdin_fd,
1116 NULL, NULL);
1117}
1118
1119int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001120 char *const argv[], pid_t *pchild_pid,
1121 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001122{
Elly Jonese1749eb2011-10-07 13:54:59 -04001123 char *oldenv, *oldenv_copy = NULL;
1124 pid_t child_pid;
1125 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001126 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001127 int stdout_fds[2];
1128 int stderr_fds[2];
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001129 int userns_pipe_fds[2];
Elly Jonese1749eb2011-10-07 13:54:59 -04001130 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001131 /* We need to remember this across the minijail_preexec() call. */
1132 int pid_namespace = j->flags.pids;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001133 int do_init = j->flags.do_init;
Ben Chan541c7e52011-08-26 14:55:53 -07001134
Elly Jonese1749eb2011-10-07 13:54:59 -04001135 oldenv = getenv(kLdPreloadEnvVar);
1136 if (oldenv) {
1137 oldenv_copy = strdup(oldenv);
1138 if (!oldenv_copy)
1139 return -ENOMEM;
1140 }
Will Drewryf89aef52011-09-16 16:48:57 -05001141
Elly Jonese1749eb2011-10-07 13:54:59 -04001142 if (setup_preload())
1143 return -EFAULT;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001144
Elly Jonesdd3e8512012-01-23 15:13:38 -05001145 /*
Jorge Lucangeli Obes3c84df12015-05-14 17:37:58 -07001146 * Make the process group ID of this process equal to its PID, so that
1147 * both the Minijail process and the jailed process can be killed
1148 * together.
1149 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1150 * the process is already a process group leader.
1151 */
1152 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1153 if (errno != EPERM) {
1154 pdie("setpgid(0, 0)");
1155 }
1156 }
1157
1158 /*
Elly Jonesdd3e8512012-01-23 15:13:38 -05001159 * Before we fork(2) and execve(2) the child process, we need to open
Elly Jonese1749eb2011-10-07 13:54:59 -04001160 * a pipe(2) to send the minijail configuration over.
1161 */
1162 if (setup_pipe(pipe_fds))
1163 return -EFAULT;
Elly Jonescd7a9042011-07-22 13:56:51 -04001164
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001165 /*
1166 * If we want to write to the child process' standard input,
1167 * create the pipe(2) now.
1168 */
1169 if (pstdin_fd) {
1170 if (pipe(stdin_fds))
1171 return -EFAULT;
1172 }
1173
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001174 /*
1175 * If we want to read from the child process' standard output,
1176 * create the pipe(2) now.
1177 */
1178 if (pstdout_fd) {
1179 if (pipe(stdout_fds))
1180 return -EFAULT;
1181 }
1182
1183 /*
1184 * If we want to read from the child process' standard error,
1185 * create the pipe(2) now.
1186 */
1187 if (pstderr_fd) {
1188 if (pipe(stderr_fds))
1189 return -EFAULT;
1190 }
1191
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001192 /*
1193 * If we want to set up a new uid/gid mapping in the user namespace,
1194 * create the pipe(2) to sync between parent and child.
1195 */
1196 if (j->flags.userns) {
1197 if (pipe(userns_pipe_fds))
1198 return -EFAULT;
1199 }
1200
Elly Jones761b7412012-06-13 15:49:52 -04001201 /* Use sys_clone() if and only if we're creating a pid namespace.
1202 *
1203 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1204 *
1205 * In multithreaded programs, there are a bunch of locks inside libc,
1206 * some of which may be held by other threads at the time that we call
1207 * minijail_run_pid(). If we call fork(), glibc does its level best to
1208 * ensure that we hold all of these locks before it calls clone()
1209 * internally and drop them after clone() returns, but when we call
1210 * sys_clone(2) directly, all that gets bypassed and we end up with a
1211 * child address space where some of libc's important locks are held by
1212 * other threads (which did not get cloned, and hence will never release
1213 * those locks). This is okay so long as we call exec() immediately
1214 * after, but a bunch of seemingly-innocent libc functions like setenv()
1215 * take locks.
1216 *
1217 * Hence, only call sys_clone() if we need to, in order to get at pid
1218 * namespacing. If we follow this path, the child's address space might
1219 * have broken locks; you may only call functions that do not acquire
1220 * any locks.
1221 *
1222 * Unfortunately, fork() acquires every lock it can get its hands on, as
1223 * previously detailed, so this function is highly likely to deadlock
1224 * later on (see "deadlock here") if we're multithreaded.
1225 *
1226 * We might hack around this by having the clone()d child (init of the
1227 * pid namespace) return directly, rather than leaving the clone()d
1228 * process hanging around to be init for the new namespace (and having
1229 * its fork()ed child return in turn), but that process would be crippled
1230 * with its libc locks potentially broken. We might try fork()ing in the
1231 * parent before we clone() to ensure that we own all the locks, but
1232 * then we have to have the forked child hanging around consuming
1233 * resources (and possibly having file descriptors / shared memory
1234 * regions / etc attached). We'd need to keep the child around to avoid
1235 * having its children get reparented to init.
1236 *
1237 * TODO(ellyjones): figure out if the "forked child hanging around"
1238 * problem is fixable or not. It would be nice if we worked in this
1239 * case.
1240 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001241 if (pid_namespace) {
1242 int clone_flags = CLONE_NEWPID | SIGCHLD;
1243 if (j->flags.userns)
1244 clone_flags |= CLONE_NEWUSER;
1245 child_pid = syscall(SYS_clone, clone_flags, NULL);
1246 }
Elly Jones761b7412012-06-13 15:49:52 -04001247 else
1248 child_pid = fork();
1249
Elly Jonese1749eb2011-10-07 13:54:59 -04001250 if (child_pid < 0) {
1251 free(oldenv_copy);
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001252 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04001253 }
Will Drewryf89aef52011-09-16 16:48:57 -05001254
Elly Jonese1749eb2011-10-07 13:54:59 -04001255 if (child_pid) {
1256 /* Restore parent's LD_PRELOAD. */
1257 if (oldenv_copy) {
1258 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1259 free(oldenv_copy);
1260 } else {
1261 unsetenv(kLdPreloadEnvVar);
1262 }
1263 unsetenv(kFdEnvVar);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001264
Elly Jonese1749eb2011-10-07 13:54:59 -04001265 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001266
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001267 if (j->flags.userns)
1268 write_ugid_mappings(j, userns_pipe_fds);
1269
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001270 /* Send marshalled minijail. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001271 close(pipe_fds[0]); /* read endpoint */
1272 ret = minijail_to_fd(j, pipe_fds[1]);
1273 close(pipe_fds[1]); /* write endpoint */
1274 if (ret) {
1275 kill(j->initpid, SIGKILL);
1276 die("failed to send marshalled minijail");
1277 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001278
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001279 if (pchild_pid)
1280 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001281
1282 /*
1283 * If we want to write to the child process' standard input,
1284 * set up the write end of the pipe.
1285 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001286 if (pstdin_fd)
1287 *pstdin_fd = setup_pipe_end(stdin_fds,
1288 1 /* write end */);
1289
1290 /*
1291 * If we want to read from the child process' standard output,
1292 * set up the read end of the pipe.
1293 */
1294 if (pstdout_fd)
1295 *pstdout_fd = setup_pipe_end(stdout_fds,
1296 0 /* read end */);
1297
1298 /*
1299 * If we want to read from the child process' standard error,
1300 * set up the read end of the pipe.
1301 */
1302 if (pstderr_fd)
1303 *pstderr_fd = setup_pipe_end(stderr_fds,
1304 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001305
Elly Jonese1749eb2011-10-07 13:54:59 -04001306 return 0;
1307 }
1308 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07001309
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001310
1311 if (j->flags.userns)
1312 enter_user_namespace(j, userns_pipe_fds);
1313
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001314 /*
1315 * If we want to write to the jailed process' standard input,
1316 * set up the read end of the pipe.
1317 */
1318 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001319 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1320 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001321 die("failed to set up stdin pipe");
1322 }
1323
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001324 /*
1325 * If we want to read from the jailed process' standard output,
1326 * set up the write end of the pipe.
1327 */
1328 if (pstdout_fd) {
1329 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1330 STDOUT_FILENO) < 0)
1331 die("failed to set up stdout pipe");
1332 }
1333
1334 /*
1335 * If we want to read from the jailed process' standard error,
1336 * set up the write end of the pipe.
1337 */
1338 if (pstderr_fd) {
1339 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1340 STDERR_FILENO) < 0)
1341 die("failed to set up stderr pipe");
1342 }
1343
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001344 /* Strip out flags that cannot be inherited across execve. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001345 minijail_preexec(j);
1346 /* Jail this process and its descendants... */
1347 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001348
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001349 if (pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001350 /*
1351 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001352 * namespace. We don't want all programs we might exec to have
1353 * to know how to be init. Normally |do_init == 1| we fork off
1354 * a child to actually run the program. If |do_init == 0|, we
1355 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04001356 *
1357 * If we're multithreaded, we'll probably deadlock here. See
1358 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04001359 */
1360 child_pid = fork();
1361 if (child_pid < 0)
1362 _exit(child_pid);
1363 else if (child_pid > 0)
1364 init(child_pid); /* never returns */
1365 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001366
Elly Jonesdd3e8512012-01-23 15:13:38 -05001367 /*
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001368 * If we aren't pid-namespaced, or jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04001369 * calling process
1370 * -> execve()-ing process
1371 * If we are:
1372 * calling process
1373 * -> init()-ing process
1374 * -> execve()-ing process
1375 */
1376 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -04001377}
1378
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001379int API minijail_run_static(struct minijail *j, const char *filename,
1380 char *const argv[])
1381{
1382 pid_t child_pid;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001383 int userns_pipe_fds[2];
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001384 int pid_namespace = j->flags.pids;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001385 int do_init = j->flags.do_init;
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001386
1387 if (j->flags.caps)
1388 die("caps not supported with static targets");
1389
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001390 /*
1391 * If we want to set up a new uid/gid mapping in the user namespace,
1392 * create the pipe(2) to sync between parent and child.
1393 */
1394 if (j->flags.userns) {
1395 if (pipe(userns_pipe_fds))
1396 return -EFAULT;
1397 }
1398
1399 if (pid_namespace) {
1400 int clone_flags = CLONE_NEWPID | SIGCHLD;
1401 if (j->flags.userns)
1402 clone_flags |= CLONE_NEWUSER;
1403 child_pid = syscall(SYS_clone, clone_flags, NULL);
1404 }
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001405 else
1406 child_pid = fork();
1407
1408 if (child_pid < 0) {
1409 die("failed to fork child");
1410 }
1411 if (child_pid > 0 ) {
1412 j->initpid = child_pid;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001413
1414 if (j->flags.userns)
1415 write_ugid_mappings(j, userns_pipe_fds);
1416
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001417 return 0;
1418 }
1419
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001420 if (j->flags.userns)
1421 enter_user_namespace(j, userns_pipe_fds);
1422
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001423 /*
1424 * We can now drop this child into the sandbox
1425 * then execve the target.
1426 */
1427
1428 j->flags.pids = 0;
1429 minijail_enter(j);
1430
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001431 if (pid_namespace && do_init) {
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001432 /*
1433 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001434 * namespace. We don't want all programs we might exec to have
1435 * to know how to be init. Normally |do_init == 1| we fork off
1436 * a child to actually run the program. If |do_init == 0|, we
1437 * let the program keep pid 1 and be init.
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001438 *
1439 * If we're multithreaded, we'll probably deadlock here. See
1440 * WARNING above.
1441 */
1442 child_pid = fork();
1443 if (child_pid < 0)
1444 _exit(child_pid);
1445 else if (child_pid > 0)
1446 init(child_pid); /* never returns */
1447 }
1448
1449 _exit(execve(filename, argv, environ));
1450}
1451
Will Drewry6ac91122011-10-21 16:38:58 -05001452int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001453{
1454 int st;
1455 if (kill(j->initpid, SIGTERM))
1456 return -errno;
1457 if (waitpid(j->initpid, &st, 0) < 0)
1458 return -errno;
1459 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04001460}
1461
Will Drewry6ac91122011-10-21 16:38:58 -05001462int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001463{
1464 int st;
1465 if (waitpid(j->initpid, &st, 0) < 0)
1466 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001467
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001468 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001469 int error_status = st;
1470 if (WIFSIGNALED(st)) {
1471 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07001472 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001473 j->initpid, signum);
1474 /*
1475 * We return MINIJAIL_ERR_JAIL if the process received
1476 * SIGSYS, which happens when a syscall is blocked by
1477 * seccomp filters.
1478 * If not, we do what bash(1) does:
1479 * $? = 128 + signum
1480 */
1481 if (signum == SIGSYS) {
1482 error_status = MINIJAIL_ERR_JAIL;
1483 } else {
1484 error_status = 128 + signum;
1485 }
1486 }
1487 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001488 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001489
1490 int exit_status = WEXITSTATUS(st);
1491 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07001492 info("child process %d exited with status %d",
1493 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001494
1495 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04001496}
1497
Will Drewry6ac91122011-10-21 16:38:58 -05001498void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001499{
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001500 if (j->flags.seccomp_filter && j->filter_prog) {
1501 free(j->filter_prog->filter);
1502 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04001503 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001504 while (j->bindings_head) {
1505 struct binding *b = j->bindings_head;
1506 j->bindings_head = j->bindings_head->next;
1507 free(b->dest);
1508 free(b->src);
1509 free(b);
1510 }
1511 j->bindings_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001512 if (j->user)
1513 free(j->user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001514 if (j->chrootdir)
1515 free(j->chrootdir);
Elly Jonese1749eb2011-10-07 13:54:59 -04001516 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001517}