blob: 2cc655757b0ea316fb3235339aec636f1e2af7f3 [file] [log] [blame]
Jorge Lucangeli Obesd613ab22015-03-03 14:22:50 -08001/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Elly Jonescd7a9042011-07-22 13:56:51 -04002 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07008
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08009#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050010#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040011#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070012#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <grp.h>
14#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050015#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050020#include <stdarg.h>
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -070021#include <stdbool.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080022#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040023#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <syscall.h>
27#include <sys/capability.h>
28#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050029#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040030#include <sys/prctl.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070031#include <sys/stat.h>
32#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080033#include <sys/user.h>
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -080034#include <sys/utsname.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040035#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040036#include <unistd.h>
37
38#include "libminijail.h"
39#include "libminijail-private.h"
40
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070041#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080042#include "syscall_filter.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070043#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080044
/*
 * Securebits: prefer the kernel header when the build says it exists,
 * otherwise fall back to locally defined values.
 */
#ifdef HAVE_SECUREBITS_H
#include <linux/securebits.h>
#else
#define SECURE_ALL_BITS 0x15
#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif

/* Fallback until this is reliably provided by linux/prctl.h. */
#ifndef PR_SET_SECCOMP
# define PR_SET_SECCOMP 22
#endif

/* prctl() option used to select an alternate syscall table. */
#ifndef PR_ALT_SYSCALL
# define PR_ALT_SYSCALL 0x43724f53
#endif

/* Needed for seccomp_filter using BPF. */
#ifndef PR_SET_NO_NEW_PRIVS
# define PR_SET_NO_NEW_PRIVS 38
#endif
#ifndef SECCOMP_MODE_FILTER
# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
#endif

/* Normalize the build-time USE_SECCOMP_SOFTFAIL switch to a 0/1 value. */
#ifdef USE_SECCOMP_SOFTFAIL
# define SECCOMP_SOFTFAIL 1
#else
# define SECCOMP_SOFTFAIL 0
#endif

#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
76
/*
 * One requested mount, kept as a node in a singly linked list.
 * The strings are private copies made by minijail_mount().
 */
struct mountpoint {
	char *src;			/* Source as given to minijail_mount(). */
	char *dest;			/* Destination; always begins with '/'. */
	char *type;			/* Type string ("" for bind mounts). */
	unsigned long flags;		/* Mount flags, e.g. MS_BIND. */
	struct mountpoint *next;	/* Following node, NULL at the tail. */
};
84
Will Drewryf89aef52011-09-16 16:48:57 -050085struct minijail {
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -070086 /*
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -070087 * WARNING: if you add a flag here you need to make sure it's
88 * accounted for in minijail_pre{enter|exec}() below.
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -070089 */
Elly Jonese1749eb2011-10-07 13:54:59 -040090 struct {
91 int uid:1;
92 int gid:1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -080093 int usergroups:1;
94 int suppl_gids:1;
Elly Jonese1749eb2011-10-07 13:54:59 -040095 int caps:1;
96 int vfs:1;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070097 int enter_vfs:1;
Elly Jonese1749eb2011-10-07 13:54:59 -040098 int pids:1;
Dylan Reidf7942472015-11-18 17:55:26 -080099 int ipc:1;
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400100 int net:1;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700101 int enter_net:1;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800102 int userns:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400103 int seccomp:1;
Dylan Reid791f5772015-09-14 20:02:42 -0700104 int remount_proc_ro:1;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700105 int no_new_privs:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400106 int seccomp_filter:1;
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700107 int log_seccomp_filter:1;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400108 int chroot:1;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800109 int pivot_root:1;
Lee Campbell11af0622014-05-22 12:36:04 -0700110 int mount_tmp:1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800111 int do_init:1;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800112 int pid_file:1;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800113 int cgroups:1;
Andrew Brestickereac28942015-11-11 16:04:46 -0800114 int alt_syscall:1;
Peter Qiu2860c462015-12-16 15:13:06 -0800115 int reset_signal_mask:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400116 } flags;
117 uid_t uid;
118 gid_t gid;
119 gid_t usergid;
120 char *user;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800121 size_t suppl_gid_count;
122 gid_t *suppl_gid_list;
Elly Jonese1749eb2011-10-07 13:54:59 -0400123 uint64_t caps;
124 pid_t initpid;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700125 int mountns_fd;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700126 int netns_fd;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400127 char *chrootdir;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800128 char *pid_file_path;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800129 char *uidmap;
130 char *gidmap;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800131 size_t filter_len;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800132 struct sock_fprog *filter_prog;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800133 char *alt_syscall_table;
Dylan Reid648b2202015-10-23 00:50:00 -0700134 struct mountpoint *mounts_head;
135 struct mountpoint *mounts_tail;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800136 size_t mounts_count;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800137 char *cgroups[MAX_CGROUPS];
138 size_t cgroup_count;
Will Drewryf89aef52011-09-16 16:48:57 -0500139};
140
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700141/*
142 * Strip out flags meant for the parent.
143 * We keep things that are not inherited across execve(2) (e.g. capabilities),
144 * or are easier to set after execve(2) (e.g. seccomp filters).
145 */
146void minijail_preenter(struct minijail *j)
147{
148 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700149 j->flags.enter_vfs = 0;
Dylan Reid791f5772015-09-14 20:02:42 -0700150 j->flags.remount_proc_ro = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700151 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800152 j->flags.do_init = 0;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800153 j->flags.pid_file = 0;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800154 j->flags.cgroups = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700155}
156
157/*
158 * Strip out flags meant for the child.
159 * We keep things that are inherited across execve(2).
160 */
161void minijail_preexec(struct minijail *j)
162{
163 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700164 int enter_vfs = j->flags.enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700165 int remount_proc_ro = j->flags.remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800166 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700167 if (j->user)
168 free(j->user);
169 j->user = NULL;
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -0800170 if (j->suppl_gid_list)
171 free(j->suppl_gid_list);
172 j->suppl_gid_list = NULL;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700173 memset(&j->flags, 0, sizeof(j->flags));
174 /* Now restore anything we meant to keep. */
175 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700176 j->flags.enter_vfs = enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700177 j->flags.remount_proc_ro = remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800178 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700179 /* Note, |pids| will already have been used before this call. */
180}
181
/*
 * Returns 1 if the running kernel reports a version older than 3.8.
 * Returns 0 otherwise, including when the version cannot be determined.
 */
int seccomp_kernel_support_not_required()
{
	struct utsname uts;
	int major, minor;

	if (uname(&uts) == -1)
		return 0;
	if (sscanf(uts.release, "%d.%d", &major, &minor) != 2)
		return 0;

	/* Any 2.x (or older) kernel qualifies; 3.x only below 3.8. */
	if (major != 3)
		return major < 3;
	return minor < 8;
}
191
/*
 * Reports whether a missing seccomp implementation may be tolerated.
 * With SECCOMP_SOFTFAIL enabled, non-Android builds always may; Android
 * builds may only on kernels older than 3.8. Without it, never.
 */
int can_softfail()
{
#if SECCOMP_SOFTFAIL
	if (!is_android())
		return 1;
	return seccomp_kernel_support_not_required() ? 1 : 0;
#else
	return 0;
#endif
}
207
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700208/* Minijail API. */
209
Will Drewry6ac91122011-10-21 16:38:58 -0500210struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400211{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400212 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400213}
214
Will Drewry6ac91122011-10-21 16:38:58 -0500215void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400216{
217 if (uid == 0)
218 die("useless change to uid 0");
219 j->uid = uid;
220 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400221}
222
Will Drewry6ac91122011-10-21 16:38:58 -0500223void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400224{
225 if (gid == 0)
226 die("useless change to gid 0");
227 j->gid = gid;
228 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400229}
230
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800231void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
232 const gid_t *list)
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800233{
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800234 size_t i;
235
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800236 if (j->flags.usergroups)
237 die("cannot inherit *and* set supplementary groups");
238
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800239 if (size == 0) {
240 /* Clear supplementary groups. */
241 j->suppl_gid_list = NULL;
242 j->suppl_gid_count = 0;
243 j->flags.suppl_gids = 1;
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800244 return;
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800245 }
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800246
247 /* Copy the gid_t array. */
248 j->suppl_gid_list = calloc(size, sizeof(gid_t));
249 if (!j->suppl_gid_list) {
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800250 die("failed to allocate internal supplementary group array");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800251 }
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800252 for (i = 0; i < size; i++) {
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800253 j->suppl_gid_list[i] = list[i];
254 }
255 j->suppl_gid_count = size;
256 j->flags.suppl_gids = 1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800257}
258
Will Drewry6ac91122011-10-21 16:38:58 -0500259int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400260{
261 char *buf = NULL;
262 struct passwd pw;
263 struct passwd *ppw = NULL;
264 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
265 if (sz == -1)
266 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400267
Elly Jonesdd3e8512012-01-23 15:13:38 -0500268 /*
269 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400270 * the maximum needed size of the buffer, so we don't have to search.
271 */
272 buf = malloc(sz);
273 if (!buf)
274 return -ENOMEM;
275 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500276 /*
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800277 * We're safe to free the buffer here. The strings inside |pw| point
278 * inside |buf|, but we don't use any of them; this leaves the pointers
279 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3) succeeded.
Elly Jonesdd3e8512012-01-23 15:13:38 -0500280 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400281 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700282 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400283 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700284 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400285 minijail_change_uid(j, ppw->pw_uid);
286 j->user = strdup(user);
287 if (!j->user)
288 return -ENOMEM;
289 j->usergid = ppw->pw_gid;
290 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400291}
292
Will Drewry6ac91122011-10-21 16:38:58 -0500293int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400294{
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -0700295 char *buf = NULL;
Yabin Cui1b21c8f2015-07-22 10:34:45 -0700296 struct group gr;
297 struct group *pgr = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400298 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
299 if (sz == -1)
300 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400301
Elly Jonesdd3e8512012-01-23 15:13:38 -0500302 /*
303 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400304 * the maximum needed size of the buffer, so we don't have to search.
305 */
306 buf = malloc(sz);
307 if (!buf)
308 return -ENOMEM;
309 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500310 /*
311 * We're safe to free the buffer here. The strings inside gr point
312 * inside buf, but we don't use any of them; this leaves the pointers
313 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
314 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400315 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700316 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400317 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700318 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400319 minijail_change_gid(j, pgr->gr_gid);
320 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400321}
322
Will Drewry6ac91122011-10-21 16:38:58 -0500323void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400324{
325 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400326}
327
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700328void API minijail_no_new_privs(struct minijail *j)
329{
330 j->flags.no_new_privs = 1;
331}
332
Will Drewry6ac91122011-10-21 16:38:58 -0500333void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400334{
335 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500336}
337
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700338void API minijail_log_seccomp_filter_failures(struct minijail *j)
339{
340 j->flags.log_seccomp_filter = 1;
341}
342
Will Drewry6ac91122011-10-21 16:38:58 -0500343void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400344{
345 j->caps = capmask;
346 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400347}
348
Peter Qiu2860c462015-12-16 15:13:06 -0800349void API minijail_reset_signal_mask(struct minijail* j) {
350 j->flags.reset_signal_mask = 1;
351}
352
Will Drewry6ac91122011-10-21 16:38:58 -0500353void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400354{
355 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400356}
357
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700358void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
359{
360 int ns_fd = open(ns_path, O_RDONLY);
361 if (ns_fd < 0) {
362 pdie("failed to open namespace '%s'", ns_path);
363 }
364 j->mountns_fd = ns_fd;
365 j->flags.enter_vfs = 1;
366}
367
Will Drewry6ac91122011-10-21 16:38:58 -0500368void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400369{
Elly Jonese58176c2012-01-23 11:46:17 -0500370 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700371 j->flags.remount_proc_ro = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400372 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800373 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400374}
375
Dylan Reidf7942472015-11-18 17:55:26 -0800376void API minijail_namespace_ipc(struct minijail *j)
377{
378 j->flags.ipc = 1;
379}
380
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400381void API minijail_namespace_net(struct minijail *j)
382{
383 j->flags.net = 1;
384}
385
Dylan Reid1102f5a2015-09-15 11:52:20 -0700386void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
387{
388 int ns_fd = open(ns_path, O_RDONLY);
389 if (ns_fd < 0) {
390 pdie("failed to open namespace '%s'", ns_path);
391 }
392 j->netns_fd = ns_fd;
393 j->flags.enter_net = 1;
394}
395
Dylan Reid791f5772015-09-14 20:02:42 -0700396void API minijail_remount_proc_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400397{
398 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700399 j->flags.remount_proc_ro = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400400}
401
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800402void API minijail_namespace_user(struct minijail *j)
403{
404 j->flags.userns = 1;
405}
406
407int API minijail_uidmap(struct minijail *j, const char *uidmap)
408{
409 j->uidmap = strdup(uidmap);
410 if (!j->uidmap)
411 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800412 char *ch;
413 for (ch = j->uidmap; *ch; ch++) {
414 if (*ch == ',')
415 *ch = '\n';
416 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800417 return 0;
418}
419
420int API minijail_gidmap(struct minijail *j, const char *gidmap)
421{
422 j->gidmap = strdup(gidmap);
423 if (!j->gidmap)
424 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800425 char *ch;
426 for (ch = j->gidmap; *ch; ch++) {
427 if (*ch == ',')
428 *ch = '\n';
429 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800430 return 0;
431}
432
Will Drewry6ac91122011-10-21 16:38:58 -0500433void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400434{
435 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400436}
437
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800438void API minijail_run_as_init(struct minijail *j)
439{
440 /*
441 * Since the jailed program will become 'init' in the new PID namespace,
442 * Minijail does not need to fork an 'init' process.
443 */
444 j->flags.do_init = 0;
445}
446
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700447int API minijail_enter_chroot(struct minijail *j, const char *dir)
448{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400449 if (j->chrootdir)
450 return -EINVAL;
451 j->chrootdir = strdup(dir);
452 if (!j->chrootdir)
453 return -ENOMEM;
454 j->flags.chroot = 1;
455 return 0;
456}
457
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800458int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
459{
460 if (j->chrootdir)
461 return -EINVAL;
462 j->chrootdir = strdup(dir);
463 if (!j->chrootdir)
464 return -ENOMEM;
465 j->flags.pivot_root = 1;
466 return 0;
467}
468
/*
 * append_external_path: joins two path fragments with a '/' in between
 * @external_path      Leading part of the result
 * @path_inside_chroot Trailing part of the result
 *
 * Returns a newly allocated "<external_path>/<path_inside_chroot>" string
 * that the caller must free(), or NULL if the allocation fails.
 */
static char *append_external_path(const char *external_path,
				  const char *path_inside_chroot)
{
	char *path;
	size_t pathlen;

	/* One extra char for '/' and one for '\0', hence + 2. */
	pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
	path = malloc(pathlen);
	/* Report OOM to the caller instead of formatting into NULL. */
	if (!path)
		return NULL;
	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);

	return path;
}
482
483char API *minijail_get_original_path(struct minijail *j,
484 const char *path_inside_chroot)
485{
Dylan Reid648b2202015-10-23 00:50:00 -0700486 struct mountpoint *b;
Dylan Reida14e08d2015-10-22 21:05:29 -0700487
Dylan Reid648b2202015-10-23 00:50:00 -0700488 b = j->mounts_head;
Dylan Reida14e08d2015-10-22 21:05:29 -0700489 while (b) {
490 /*
491 * If |path_inside_chroot| is the exact destination of a
Dylan Reid648b2202015-10-23 00:50:00 -0700492 * mount, then the original path is exactly the source of
493 * the mount.
Dylan Reida14e08d2015-10-22 21:05:29 -0700494 * for example: "-b /some/path/exe,/chroot/path/exe"
Dylan Reid648b2202015-10-23 00:50:00 -0700495 * mount source = /some/path/exe, mount dest =
496 * /chroot/path/exe Then when getting the original path of
497 * "/chroot/path/exe", the source of that mount,
498 * "/some/path/exe" is what should be returned.
Dylan Reida14e08d2015-10-22 21:05:29 -0700499 */
500 if (!strcmp(b->dest, path_inside_chroot))
501 return strdup(b->src);
502
503 /*
504 * If |path_inside_chroot| is within the destination path of a
Dylan Reid648b2202015-10-23 00:50:00 -0700505 * mount, take the suffix of the chroot path relative to the
506 * mount destination path, and append it to the mount source
507 * path.
Dylan Reida14e08d2015-10-22 21:05:29 -0700508 */
509 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
510 const char *relative_path =
511 path_inside_chroot + strlen(b->dest);
512 return append_external_path(b->src, relative_path);
513 }
514 b = b->next;
515 }
516
517 /* If there is a chroot path, append |path_inside_chroot| to that. */
518 if (j->chrootdir)
519 return append_external_path(j->chrootdir, path_inside_chroot);
520
521 /* No chroot, so the path outside is the same as it is inside. */
522 return strdup(path_inside_chroot);
Dylan Reid08946cc2015-09-16 19:10:57 -0700523}
524
Lee Campbell11af0622014-05-22 12:36:04 -0700525void API minijail_mount_tmp(struct minijail *j)
526{
527 j->flags.mount_tmp = 1;
528}
529
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800530int API minijail_write_pid_file(struct minijail *j, const char *path)
531{
532 j->pid_file_path = strdup(path);
533 if (!j->pid_file_path)
534 return -ENOMEM;
535 j->flags.pid_file = 1;
536 return 0;
537}
538
Dylan Reid605ce7f2016-01-19 19:21:00 -0800539int API minijail_add_to_cgroup(struct minijail *j, const char *path)
540{
541 if (j->cgroup_count >= MAX_CGROUPS)
542 return -ENOMEM;
543 j->cgroups[j->cgroup_count] = strdup(path);
544 if (!j->cgroups[j->cgroup_count])
545 return -ENOMEM;
546 j->cgroup_count++;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800547 j->flags.cgroups = 1;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800548 return 0;
549}
550
Dylan Reid648b2202015-10-23 00:50:00 -0700551int API minijail_mount(struct minijail *j, const char *src, const char *dest,
552 const char *type, unsigned long flags)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700553{
Dylan Reid648b2202015-10-23 00:50:00 -0700554 struct mountpoint *m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400555
556 if (*dest != '/')
557 return -EINVAL;
Dylan Reid648b2202015-10-23 00:50:00 -0700558 m = calloc(1, sizeof(*m));
559 if (!m)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400560 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -0700561 m->dest = strdup(dest);
562 if (!m->dest)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400563 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700564 m->src = strdup(src);
565 if (!m->src)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400566 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700567 m->type = strdup(type);
568 if (!m->type)
569 goto error;
570 m->flags = flags;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400571
Dylan Reid648b2202015-10-23 00:50:00 -0700572 info("mount %s -> %s type %s", src, dest, type);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400573
Elly Jonesdd3e8512012-01-23 15:13:38 -0500574 /*
Dylan Reid648b2202015-10-23 00:50:00 -0700575 * Force vfs namespacing so the mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400576 * containing vfs namespace.
577 */
578 minijail_namespace_vfs(j);
579
Dylan Reid648b2202015-10-23 00:50:00 -0700580 if (j->mounts_tail)
581 j->mounts_tail->next = m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400582 else
Dylan Reid648b2202015-10-23 00:50:00 -0700583 j->mounts_head = m;
584 j->mounts_tail = m;
585 j->mounts_count++;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400586
587 return 0;
588
589error:
Dylan Reid648b2202015-10-23 00:50:00 -0700590 free(m->src);
591 free(m->dest);
592 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400593 return -ENOMEM;
594}
595
Dylan Reid648b2202015-10-23 00:50:00 -0700596int API minijail_bind(struct minijail *j, const char *src, const char *dest,
597 int writeable)
598{
599 unsigned long flags = MS_BIND;
600
601 if (!writeable)
602 flags |= MS_RDONLY;
603
604 return minijail_mount(j, src, dest, "", flags);
605}
606
Will Drewry6ac91122011-10-21 16:38:58 -0500607void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
Elly Jonese1749eb2011-10-07 13:54:59 -0400608{
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700609 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -0800610 if ((errno == EINVAL) && can_softfail()) {
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -0800611 warn("not loading seccomp filter,"
612 " seccomp not supported");
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -0800613 j->flags.seccomp_filter = 0;
614 j->flags.log_seccomp_filter = 0;
615 j->filter_len = 0;
616 j->filter_prog = NULL;
617 j->flags.no_new_privs = 0;
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700618 }
619 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400620 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800621 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700622 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400623 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800624
625 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700626 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
627 die("failed to compile seccomp filter BPF program in '%s'",
628 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800629 }
630
631 j->filter_len = fprog->len;
632 j->filter_prog = fprog;
633
Elly Jonese1749eb2011-10-07 13:54:59 -0400634 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500635}
636
Andrew Brestickereac28942015-11-11 16:04:46 -0800637int API minijail_use_alt_syscall(struct minijail *j, const char *table)
638{
639 j->alt_syscall_table = strdup(table);
640 if (!j->alt_syscall_table)
641 return -ENOMEM;
642 j->flags.alt_syscall = 1;
643 return 0;
644}
645
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* Bytes still free at |buf|. */
	size_t total;		/* Bytes requested so far, written or not. */
	char *buf;		/* Next write position. */
};
651
Will Drewry6ac91122011-10-21 16:38:58 -0500652void marshal_state_init(struct marshal_state *state,
653 char *buf, size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400654{
655 state->available = available;
656 state->buf = buf;
657 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500658}
659
Will Drewry6ac91122011-10-21 16:38:58 -0500660void marshal_append(struct marshal_state *state,
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800661 void *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400662{
663 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500664
Elly Jonese1749eb2011-10-07 13:54:59 -0400665 /* Up to |available| will be written. */
666 if (copy_len) {
667 memcpy(state->buf, src, copy_len);
668 state->buf += copy_len;
669 state->available -= copy_len;
670 }
671 /* |total| will contain the expected length. */
672 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500673}
674
Will Drewry6ac91122011-10-21 16:38:58 -0500675void minijail_marshal_helper(struct marshal_state *state,
676 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400677{
Dylan Reid648b2202015-10-23 00:50:00 -0700678 struct mountpoint *m = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800679 size_t i;
680
Elly Jonese1749eb2011-10-07 13:54:59 -0400681 marshal_append(state, (char *)j, sizeof(*j));
682 if (j->user)
683 marshal_append(state, j->user, strlen(j->user) + 1);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800684 if (j->suppl_gid_list) {
685 marshal_append(state, j->suppl_gid_list,
686 j->suppl_gid_count * sizeof(gid_t));
687 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400688 if (j->chrootdir)
689 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Andrew Brestickereac28942015-11-11 16:04:46 -0800690 if (j->alt_syscall_table) {
691 marshal_append(state, j->alt_syscall_table,
692 strlen(j->alt_syscall_table) + 1);
693 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800694 if (j->flags.seccomp_filter && j->filter_prog) {
695 struct sock_fprog *fp = j->filter_prog;
696 marshal_append(state, (char *)fp->filter,
697 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400698 }
Dylan Reid648b2202015-10-23 00:50:00 -0700699 for (m = j->mounts_head; m; m = m->next) {
700 marshal_append(state, m->src, strlen(m->src) + 1);
701 marshal_append(state, m->dest, strlen(m->dest) + 1);
702 marshal_append(state, m->type, strlen(m->type) + 1);
703 marshal_append(state, (char *)&m->flags, sizeof(m->flags));
Elly Jones51a5b6c2011-10-12 19:09:26 -0400704 }
Dylan Reid605ce7f2016-01-19 19:21:00 -0800705 for (i = 0; i < j->cgroup_count; ++i)
706 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
Will Drewryf89aef52011-09-16 16:48:57 -0500707}
708
Will Drewry6ac91122011-10-21 16:38:58 -0500709size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400710{
711 struct marshal_state state;
712 marshal_state_init(&state, NULL, 0);
713 minijail_marshal_helper(&state, j);
714 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500715}
716
Elly Jonese1749eb2011-10-07 13:54:59 -0400717int minijail_marshal(const struct minijail *j, char *buf, size_t available)
718{
719 struct marshal_state state;
720 marshal_state_init(&state, buf, available);
721 minijail_marshal_helper(&state, j);
722 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500723}
724
/*
 * consumebytes: takes @length bytes off the front of a buffer
 * @length    Number of bytes to consume
 * @buf       In/out: buffer position; advanced past the consumed bytes
 * @buflength In/out: bytes remaining at @buf; reduced by @length
 *
 * Returns a pointer to the first consumed byte, or NULL (leaving @buf and
 * @buflength untouched) if fewer than @length bytes remain.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *base;

	if (*buflength < length)
		return NULL;

	base = *buf;
	*buf = base + length;
	*buflength -= length;
	return base;
}
742
/*
 * consumestr: takes one NUL-terminated string off the front of a buffer.
 * @buf       Cursor into the buffer; advanced past the string and its NUL
 * @buflength Bytes remaining at @buf; reduced by the string length plus one
 *
 * Returns a pointer to the start of the string, or NULL when no terminator
 * is found within the remaining @buflength bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const char *terminator = memchr(*buf, '\0', *buflength);

	if (!terminator)
		/* There's no null-terminator. */
		return NULL;

	return consumebytes((size_t)(terminator - *buf) + 1, buf, buflength);
}
758
Elly Jonese1749eb2011-10-07 13:54:59 -0400759int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
760{
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800761 size_t i;
762 size_t count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500763 int ret = -EINVAL;
764
Elly Jonese1749eb2011-10-07 13:54:59 -0400765 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500766 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400767 memcpy((void *)j, serialized, sizeof(*j));
768 serialized += sizeof(*j);
769 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500770
Will Drewrybee7ba72011-10-21 20:47:01 -0500771 /* Potentially stale pointers not used as signals. */
Dylan Reid648b2202015-10-23 00:50:00 -0700772 j->mounts_head = NULL;
773 j->mounts_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800774 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500775
Elly Jonese1749eb2011-10-07 13:54:59 -0400776 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400777 char *user = consumestr(&serialized, &length);
778 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500779 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400780 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500781 if (!j->user)
782 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400783 }
Will Drewryf89aef52011-09-16 16:48:57 -0500784
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800785 if (j->suppl_gid_list) { /* stale pointer */
786 if (j->suppl_gid_count > NGROUPS_MAX) {
787 goto bad_gid_list;
788 }
789 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
790 void *gid_list_bytes =
791 consumebytes(gid_list_size, &serialized, &length);
792 if (!gid_list_bytes)
793 goto bad_gid_list;
794
795 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
796 if (!j->suppl_gid_list)
797 goto bad_gid_list;
798
799 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
800 }
801
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400802 if (j->chrootdir) { /* stale pointer */
803 char *chrootdir = consumestr(&serialized, &length);
804 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500805 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400806 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500807 if (!j->chrootdir)
808 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400809 }
810
Andrew Brestickereac28942015-11-11 16:04:46 -0800811 if (j->alt_syscall_table) { /* stale pointer */
812 char *alt_syscall_table = consumestr(&serialized, &length);
813 if (!alt_syscall_table)
814 goto bad_syscall_table;
815 j->alt_syscall_table = strdup(alt_syscall_table);
816 if (!j->alt_syscall_table)
817 goto bad_syscall_table;
818 }
819
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800820 if (j->flags.seccomp_filter && j->filter_len > 0) {
821 size_t ninstrs = j->filter_len;
822 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
823 ninstrs > USHRT_MAX)
824 goto bad_filters;
825
826 size_t program_len = ninstrs * sizeof(struct sock_filter);
827 void *program = consumebytes(program_len, &serialized, &length);
828 if (!program)
829 goto bad_filters;
830
831 j->filter_prog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800832 if (!j->filter_prog)
833 goto bad_filters;
834
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800835 j->filter_prog->len = ninstrs;
836 j->filter_prog->filter = malloc(program_len);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800837 if (!j->filter_prog->filter)
838 goto bad_filter_prog_instrs;
839
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800840 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400841 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400842
Dylan Reid648b2202015-10-23 00:50:00 -0700843 count = j->mounts_count;
844 j->mounts_count = 0;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400845 for (i = 0; i < count; ++i) {
Dylan Reid648b2202015-10-23 00:50:00 -0700846 unsigned long *flags;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400847 const char *dest;
Dylan Reid648b2202015-10-23 00:50:00 -0700848 const char *type;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400849 const char *src = consumestr(&serialized, &length);
850 if (!src)
Dylan Reid648b2202015-10-23 00:50:00 -0700851 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400852 dest = consumestr(&serialized, &length);
853 if (!dest)
Dylan Reid648b2202015-10-23 00:50:00 -0700854 goto bad_mounts;
855 type = consumestr(&serialized, &length);
856 if (!type)
857 goto bad_mounts;
858 flags = consumebytes(sizeof(*flags), &serialized, &length);
859 if (!flags)
860 goto bad_mounts;
861 if (minijail_mount(j, src, dest, type, *flags))
862 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400863 }
864
Dylan Reid605ce7f2016-01-19 19:21:00 -0800865 count = j->cgroup_count;
866 j->cgroup_count = 0;
867 for (i = 0; i < count; ++i) {
868 char *cgroup = consumestr(&serialized, &length);
869 if (!cgroup)
870 goto bad_cgroups;
871 j->cgroups[i] = strdup(cgroup);
872 if (!j->cgroups[i])
873 goto bad_cgroups;
874 ++j->cgroup_count;
875 }
876
Elly Jonese1749eb2011-10-07 13:54:59 -0400877 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500878
Dylan Reid605ce7f2016-01-19 19:21:00 -0800879bad_cgroups:
880 while (j->mounts_head) {
881 struct mountpoint *m = j->mounts_head;
882 j->mounts_head = j->mounts_head->next;
883 free(m->type);
884 free(m->dest);
885 free(m->src);
886 free(m);
887 }
888 for (i = 0; i < j->cgroup_count; ++i)
889 free(j->cgroups[i]);
Dylan Reid648b2202015-10-23 00:50:00 -0700890bad_mounts:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800891 if (j->flags.seccomp_filter && j->filter_len > 0) {
892 free(j->filter_prog->filter);
893 free(j->filter_prog);
894 }
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800895bad_filter_prog_instrs:
896 if (j->filter_prog)
897 free(j->filter_prog);
Will Drewrybee7ba72011-10-21 20:47:01 -0500898bad_filters:
Andrew Brestickereac28942015-11-11 16:04:46 -0800899 if (j->alt_syscall_table)
900 free(j->alt_syscall_table);
901bad_syscall_table:
Will Drewrybee7ba72011-10-21 20:47:01 -0500902 if (j->chrootdir)
903 free(j->chrootdir);
904bad_chrootdir:
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800905 if (j->suppl_gid_list)
906 free(j->suppl_gid_list);
907bad_gid_list:
Will Drewrybee7ba72011-10-21 20:47:01 -0500908 if (j->user)
909 free(j->user);
910clear_pointers:
911 j->user = NULL;
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800912 j->suppl_gid_list = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500913 j->chrootdir = NULL;
Andrew Brestickereac28942015-11-11 16:04:46 -0800914 j->alt_syscall_table = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800915 j->cgroup_count = 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500916out:
917 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500918}
919
Dylan Reidce5b55e2016-01-13 11:04:16 -0800920static void write_ugid_mappings(const struct minijail *j)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800921{
922 int fd, ret, len;
923 size_t sz;
924 char fname[32];
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800925
926 sz = sizeof(fname);
927 if (j->uidmap) {
928 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700929 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800930 die("failed to write file name of uid_map");
931 fd = open(fname, O_WRONLY);
932 if (fd < 0)
933 pdie("failed to open '%s'", fname);
934 len = strlen(j->uidmap);
935 if (write(fd, j->uidmap, len) < len)
936 die("failed to set uid_map");
937 close(fd);
938 }
939 if (j->gidmap) {
940 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700941 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800942 die("failed to write file name of gid_map");
943 fd = open(fname, O_WRONLY);
944 if (fd < 0)
945 pdie("failed to open '%s'", fname);
946 len = strlen(j->gidmap);
947 if (write(fd, j->gidmap, len) < len)
948 die("failed to set gid_map");
949 close(fd);
950 }
Dylan Reidce5b55e2016-01-13 11:04:16 -0800951}
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800952
/*
 * parent_setup_complete: closes both ends of the sync pipe @pipe_fds.
 * wait_for_parent_setup() reads from the same pipe and treats end-of-file
 * as the signal to continue.
 */
static void parent_setup_complete(int *pipe_fds)
{
	int end;

	for (end = 0; end < 2; ++end)
		close(pipe_fds[end]);
}
958
/*
 * wait_for_parent_setup: Called by the child process to block until the
 * parent has finished its side of the setup.
 * @pipe_fds Sync pipe; index 0 is read from, index 1 is closed first
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char unused_byte;
	ssize_t nread;

	close(pipe_fds[1]);

	/*
	 * The parent signals completion by closing the pipe, so the only
	 * acceptable result is end-of-file (a read of zero bytes).
	 */
	nread = read(pipe_fds[0], &unused_byte, 1);
	if (nread != 0)
		die("failed to sync with parent");

	close(pipe_fds[0]);
}
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800974
Dylan Reidce5b55e2016-01-13 11:04:16 -0800975static void enter_user_namespace(const struct minijail *j)
976{
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800977 if (j->uidmap && setresuid(0, 0, 0))
978 pdie("setresuid");
979 if (j->gidmap && setresgid(0, 0, 0))
980 pdie("setresgid");
981}
982
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -0800983/*
984 * mount_one: Applies mounts from @m for @j, recursing as needed.
Dylan Reid648b2202015-10-23 00:50:00 -0700985 * @j Minijail these mounts are for
986 * @m Head of list of mounts
Elly Jones51a5b6c2011-10-12 19:09:26 -0400987 *
988 * Returns 0 for success.
989 */
Dylan Reid648b2202015-10-23 00:50:00 -0700990static int mount_one(const struct minijail *j, struct mountpoint *m)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700991{
Dylan Reid648b2202015-10-23 00:50:00 -0700992 int ret;
993 char *dest;
994 int remount_ro = 0;
995
Elly Jones51a5b6c2011-10-12 19:09:26 -0400996 /* dest has a leading "/" */
Dylan Reid648b2202015-10-23 00:50:00 -0700997 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400998 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -0700999
1000 /*
1001 * R/O bind mounts have to be remounted since bind and ro can't both be
1002 * specified in the original bind mount. Remount R/O after the initial
1003 * mount.
1004 */
1005 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1006 remount_ro = 1;
1007 m->flags &= ~MS_RDONLY;
Elly Jonesa1059632011-12-15 15:17:07 -05001008 }
Dylan Reid648b2202015-10-23 00:50:00 -07001009
1010 ret = mount(m->src, dest, m->type, m->flags, NULL);
1011 if (ret)
1012 pdie("mount: %s -> %s", m->src, dest);
1013
1014 if (remount_ro) {
1015 m->flags |= MS_RDONLY;
1016 ret = mount(m->src, dest, NULL,
1017 m->flags | MS_REMOUNT, NULL);
1018 if (ret)
1019 pdie("bind ro: %s -> %s", m->src, dest);
1020 }
1021
Elly Jones51a5b6c2011-10-12 19:09:26 -04001022 free(dest);
Dylan Reid648b2202015-10-23 00:50:00 -07001023 if (m->next)
1024 return mount_one(j, m->next);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001025 return ret;
1026}
1027
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001028int enter_chroot(const struct minijail *j)
1029{
Elly Jones51a5b6c2011-10-12 19:09:26 -04001030 int ret;
Dylan Reid648b2202015-10-23 00:50:00 -07001031
1032 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
Elly Jones51a5b6c2011-10-12 19:09:26 -04001033 return ret;
1034
1035 if (chroot(j->chrootdir))
1036 return -errno;
1037
1038 if (chdir("/"))
1039 return -errno;
1040
1041 return 0;
1042}
1043
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001044int enter_pivot_root(const struct minijail *j)
1045{
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001046 int ret, oldroot, newroot;
Dylan Reid648b2202015-10-23 00:50:00 -07001047
1048 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001049 return ret;
1050
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001051 /*
1052 * Keep the fd for both old and new root.
1053 * It will be used in fchdir later.
1054 */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001055 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1056 if (oldroot < 0)
1057 pdie("failed to open / for fchdir");
1058 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY);
1059 if (newroot < 0)
1060 pdie("failed to open %s for fchdir", j->chrootdir);
1061
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001062 /*
1063 * To ensure chrootdir is the root of a file system,
1064 * do a self bind mount.
1065 */
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001066 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1067 pdie("failed to bind mount '%s'", j->chrootdir);
1068 if (chdir(j->chrootdir))
1069 return -errno;
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001070 if (syscall(SYS_pivot_root, ".", "."))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001071 pdie("pivot_root");
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001072
1073 /*
1074 * Now the old root is mounted on top of the new root. Use fchdir to
1075 * change to the old root and unmount it.
1076 */
1077 if (fchdir(oldroot))
1078 pdie("failed to fchdir to old /");
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001079 /* The old root might be busy, so use lazy unmount. */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001080 if (umount2(".", MNT_DETACH))
1081 pdie("umount(/)");
1082 /* Change back to the new root. */
1083 if (fchdir(newroot))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001084 return -errno;
1085 if (chroot("/"))
1086 return -errno;
Jorge Lucangeli Obes46a55092015-10-12 15:31:59 -07001087 /* Set correct CWD for getcwd(3). */
1088 if (chdir("/"))
1089 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001090
1091 return 0;
1092}
1093
/*
 * mount_tmp: mounts a tmpfs on /tmp with the options "size=64M,mode=777".
 *
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	const char *tmpfs_options = "size=64M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_options);
}
1098
Dylan Reid791f5772015-09-14 20:02:42 -07001099int remount_proc_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001100{
1101 const char *kProcPath = "/proc";
1102 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -05001103 /*
1104 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -04001105 * /proc in our namespace, which means using MS_REMOUNT here would
1106 * mutate our parent's mount as well, even though we're in a VFS
1107 * namespace (!). Instead, remove their mount from our namespace
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001108 * and make our own. However, if we are in a new user namespace, /proc
1109 * is not seen as mounted, so don't return error if umount() fails.
Elly Jonese1749eb2011-10-07 13:54:59 -04001110 */
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001111 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
Elly Jonese1749eb2011-10-07 13:54:59 -04001112 return -errno;
1113 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1114 return -errno;
1115 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -04001116}
1117
/*
 * write_pid_to_path: writes @pid as a decimal number followed by a newline
 * to the file at @path, opened with mode "w".
 *
 * Failure to open, write or close the file is reported through pdie().
 */
static void write_pid_to_path(pid_t pid, const char *path)
{
	FILE *out = fopen(path, "w");

	if (!out)
		pdie("failed to open '%s'", path);

	if (fprintf(out, "%d\n", (int)pid) < 0)
		pdie("fprintf(%s)", path);

	if (fclose(out))
		pdie("fclose(%s)", path);
}
1129
1130static void write_pid_file(const struct minijail *j)
1131{
1132 write_pid_to_path(j->initpid, j->pid_file_path);
1133}
1134
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08001135static void add_to_cgroups(const struct minijail *j)
Dylan Reid605ce7f2016-01-19 19:21:00 -08001136{
1137 size_t i;
1138
1139 for (i = 0; i < j->cgroup_count; ++i)
1140 write_pid_to_path(j->initpid, j->cgroups[i]);
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001141}
1142
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001143void drop_ugid(const struct minijail *j)
1144{
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001145 if (j->flags.usergroups && j->flags.suppl_gids) {
1146 die("tried to inherit *and* set supplementary groups;"
1147 " can only do one");
1148 }
1149
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001150 if (j->flags.usergroups) {
1151 if (initgroups(j->user, j->usergid))
1152 pdie("initgroups");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001153 } else if (j->flags.suppl_gids) {
1154 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1155 pdie("setgroups");
1156 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001157 } else {
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001158 /*
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001159 * Only attempt to clear supplementary groups if we are changing
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001160 * users.
1161 */
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001162 if ((j->uid || j->gid) && setgroups(0, NULL))
1163 pdie("setgroups");
1164 }
1165
1166 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1167 pdie("setresgid");
1168
1169 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1170 pdie("setresuid");
1171}
1172
/*
 * get_last_valid_cap: returns the number of the highest capability the
 * running kernel knows about.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we suck up the answer via /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Failure to open or parse the /proc file is reported through pdie().
 */
static unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;

	if (is_android()) {
		/* Probe upwards until the kernel rejects the cap number. */
		while (prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0)
			++last_valid_cap;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");

		/* fscanf() on a NULL stream would crash; fail loudly. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_valid_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}
	return last_valid_cap;
}
1203
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -07001204void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
Elly Jonese1749eb2011-10-07 13:54:59 -04001205{
1206 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -08001207 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -08001208 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -04001209 unsigned int i;
1210 if (!caps)
1211 die("can't get process caps");
1212 if (cap_clear_flag(caps, CAP_INHERITABLE))
1213 die("can't clear inheritable caps");
1214 if (cap_clear_flag(caps, CAP_EFFECTIVE))
1215 die("can't clear effective caps");
1216 if (cap_clear_flag(caps, CAP_PERMITTED))
1217 die("can't clear permitted caps");
Dylan Reidf682d472015-09-17 21:39:07 -07001218 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cook323878a2013-02-05 15:35:24 -08001219 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001220 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -04001221 continue;
Kees Cook323878a2013-02-05 15:35:24 -08001222 flag[0] = i;
1223 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001224 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -08001225 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001226 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -08001227 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001228 die("can't add inheritable cap");
1229 }
1230 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -08001231 die("can't apply initial cleaned capset");
1232
1233 /*
1234 * Instead of dropping bounding set first, do it here in case
1235 * the caller had a more permissive bounding set which could
1236 * have been used above to raise a capability that wasn't already
1237 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1238 */
Dylan Reidf682d472015-09-17 21:39:07 -07001239 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cooke5609ac2013-02-06 14:12:41 -08001240 if (j->caps & (one << i))
Elly Jonese1749eb2011-10-07 13:54:59 -04001241 continue;
1242 if (prctl(PR_CAPBSET_DROP, i))
1243 pdie("prctl(PR_CAPBSET_DROP)");
1244 }
Kees Cook323878a2013-02-05 15:35:24 -08001245
1246 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001247 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -08001248 flag[0] = CAP_SETPCAP;
1249 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1250 die("can't clear effective cap");
1251 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1252 die("can't clear permitted cap");
1253 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1254 die("can't clear inheritable cap");
1255 }
1256
1257 if (cap_set_proc(caps))
1258 die("can't apply final cleaned capset");
1259
1260 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -04001261}
1262
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001263void set_seccomp_filter(const struct minijail *j)
1264{
1265 /*
1266 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1267 * in the kernel source tree for an explanation of the parameters.
1268 */
1269 if (j->flags.no_new_privs) {
1270 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1271 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1272 }
1273
1274 /*
1275 * If we're logging seccomp filter failures,
1276 * install the SIGSYS handler first.
1277 */
1278 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1279 if (install_sigsys_handler())
1280 pdie("install SIGSYS handler");
1281 warn("logging seccomp filter failures");
1282 }
1283
1284 /*
1285 * Install the syscall filter.
1286 */
1287 if (j->flags.seccomp_filter) {
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001288 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1289 j->filter_prog)) {
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -08001290 if ((errno == EINVAL) && can_softfail()) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001291 warn("seccomp not supported");
1292 return;
1293 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001294 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001295 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001296 }
1297}
1298
Will Drewry6ac91122011-10-21 16:38:58 -05001299void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001300{
Dylan Reidf682d472015-09-17 21:39:07 -07001301 /*
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001302 * If we're dropping caps, get the last valid cap from /proc now,
1303 * since /proc can be unmounted before drop_caps() is called.
Dylan Reidf682d472015-09-17 21:39:07 -07001304 */
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001305 unsigned int last_valid_cap = 0;
1306 if (j->flags.caps)
1307 last_valid_cap = get_last_valid_cap();
Dylan Reidf682d472015-09-17 21:39:07 -07001308
Elly Jonese1749eb2011-10-07 13:54:59 -04001309 if (j->flags.pids)
1310 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001311 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -04001312
Elly Jonese1749eb2011-10-07 13:54:59 -04001313 if (j->flags.usergroups && !j->user)
1314 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -04001315
Elly Jonesdd3e8512012-01-23 15:13:38 -05001316 /*
1317 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -04001318 * so we don't even try. If any of our operations fail, we abort() the
1319 * entire process.
1320 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001321 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1322 pdie("setns(CLONE_NEWNS)");
1323
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001324 if (j->flags.vfs) {
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001325 if (unshare(CLONE_NEWNS))
1326 pdie("unshare(vfs)");
1327 /*
1328 * Remount all filesystems as private. If they are shared
1329 * new bind mounts will creep out of our namespace.
1330 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1331 */
1332 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1333 pdie("mount(/, private)");
1334 }
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001335
Dylan Reidf7942472015-11-18 17:55:26 -08001336 if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1337 pdie("unshare(ipc)");
1338 }
1339
Dylan Reid1102f5a2015-09-15 11:52:20 -07001340 if (j->flags.enter_net) {
1341 if (setns(j->netns_fd, CLONE_NEWNET))
1342 pdie("setns(CLONE_NEWNET)");
1343 } else if (j->flags.net && unshare(CLONE_NEWNET)) {
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001344 pdie("unshare(net)");
Dylan Reid1102f5a2015-09-15 11:52:20 -07001345 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001346
Elly Jones51a5b6c2011-10-12 19:09:26 -04001347 if (j->flags.chroot && enter_chroot(j))
1348 pdie("chroot");
1349
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001350 if (j->flags.pivot_root && enter_pivot_root(j))
1351 pdie("pivot_root");
1352
Jorge Lucangeli Obes3901da62015-03-03 13:55:11 -08001353 if (j->flags.mount_tmp && mount_tmp())
Lee Campbell11af0622014-05-22 12:36:04 -07001354 pdie("mount_tmp");
1355
Dylan Reid791f5772015-09-14 20:02:42 -07001356 if (j->flags.remount_proc_ro && remount_proc_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -04001357 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -04001358
Elly Jonese1749eb2011-10-07 13:54:59 -04001359 if (j->flags.caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001360 /*
1361 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -04001362 * capability to change uids, our attempt to use setuid()
1363 * below will fail. Hang on to root caps across setuid(), then
1364 * lock securebits.
1365 */
1366 if (prctl(PR_SET_KEEPCAPS, 1))
1367 pdie("prctl(PR_SET_KEEPCAPS)");
1368 if (prctl
1369 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
1370 pdie("prctl(PR_SET_SECUREBITS)");
1371 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001372
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001373 /*
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001374 * If we're setting no_new_privs, we can drop privileges
1375 * before setting seccomp filter. This way filter policies
1376 * don't need to allow privilege-dropping syscalls.
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001377 */
1378 if (j->flags.no_new_privs) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001379 drop_ugid(j);
1380 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001381 drop_caps(j, last_valid_cap);
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001382
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001383 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04001384 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001385 /*
1386 * If we're not setting no_new_privs,
1387 * we need to set seccomp filter *before* dropping privileges.
1388 * WARNING: this means that filter policies *must* allow
1389 * setgroups()/setresgid()/setresuid() for dropping root and
1390 * capget()/capset()/prctl() for dropping caps.
1391 */
1392 set_seccomp_filter(j);
1393
1394 drop_ugid(j);
1395 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001396 drop_caps(j, last_valid_cap);
Elly Jonese1749eb2011-10-07 13:54:59 -04001397 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001398
Elly Jonesdd3e8512012-01-23 15:13:38 -05001399 /*
Andrew Brestickereac28942015-11-11 16:04:46 -08001400 * Select the specified alternate syscall table. The table must not
1401 * block prctl(2) if we're using seccomp as well.
1402 */
1403 if (j->flags.alt_syscall) {
1404 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1405 pdie("prctl(PR_ALT_SYSCALL)");
1406 }
1407
1408 /*
Elly Jonesdd3e8512012-01-23 15:13:38 -05001409 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -04001410 * privilege-dropping syscalls :)
1411 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001412 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -08001413 if ((errno == EINVAL) && can_softfail()) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001414 warn("seccomp not supported");
1415 return;
1416 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001417 pdie("prctl(PR_SET_SECCOMP)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001418 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001419}
1420
/*
 * Exit status used by init_term(); init() stores the wait status of the
 * root pid here.
 * TODO(wad) will visibility affect this variable?
 */
static int init_exitstatus = 0;

/*
 * init_term: signal handler that exits immediately with the value currently
 * held in init_exitstatus. The signal number is ignored.
 */
void init_term(int __attribute__ ((unused)) sig)
{
	const int exit_code = init_exitstatus;

	_exit(exit_code);
}
1428
Will Drewry6ac91122011-10-21 16:38:58 -05001429int init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -04001430{
1431 pid_t pid;
1432 int status;
1433 /* so that we exit with the right status */
1434 signal(SIGTERM, init_term);
1435 /* TODO(wad) self jail with seccomp_filters here. */
1436 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001437 /*
1438 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -04001439 * left inside our pid namespace or we get a signal.
1440 */
1441 if (pid == rootpid)
1442 init_exitstatus = status;
1443 }
1444 if (!WIFEXITED(init_exitstatus))
1445 _exit(MINIJAIL_ERR_INIT);
1446 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -04001447}
1448
Will Drewry6ac91122011-10-21 16:38:58 -05001449int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001450{
1451 size_t sz = 0;
1452 size_t bytes = read(fd, &sz, sizeof(sz));
1453 char *buf;
1454 int r;
1455 if (sizeof(sz) != bytes)
1456 return -EINVAL;
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001457 if (sz > USHRT_MAX) /* arbitrary sanity check */
Elly Jonese1749eb2011-10-07 13:54:59 -04001458 return -E2BIG;
1459 buf = malloc(sz);
1460 if (!buf)
1461 return -ENOMEM;
1462 bytes = read(fd, buf, sz);
1463 if (bytes != sz) {
1464 free(buf);
1465 return -EINVAL;
1466 }
1467 r = minijail_unmarshal(j, buf, sz);
1468 free(buf);
1469 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001470}
1471
Will Drewry6ac91122011-10-21 16:38:58 -05001472int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04001473{
1474 char *buf;
1475 size_t sz = minijail_size(j);
1476 ssize_t written;
1477 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -04001478
Elly Jonese1749eb2011-10-07 13:54:59 -04001479 if (!sz)
1480 return -EINVAL;
1481 buf = malloc(sz);
1482 r = minijail_marshal(j, buf, sz);
1483 if (r) {
1484 free(buf);
1485 return r;
1486 }
1487 /* Sends [size][minijail]. */
1488 written = write(fd, &sz, sizeof(sz));
1489 if (written != sizeof(sz)) {
1490 free(buf);
1491 return -EFAULT;
1492 }
1493 written = write(fd, buf, sz);
1494 if (written < 0 || (size_t) written != sz) {
1495 free(buf);
1496 return -EFAULT;
1497 }
1498 free(buf);
1499 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001500}
Elly Jonescd7a9042011-07-22 13:56:51 -04001501
Will Drewry6ac91122011-10-21 16:38:58 -05001502int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -04001503{
Daniel Erat5b7a3182015-08-19 16:06:22 -06001504#if defined(__ANDROID__)
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001505 /* Don't use LDPRELOAD on Brillo. */
1506 return 0;
1507#else
Elly Jonese1749eb2011-10-07 13:54:59 -04001508 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1509 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1510 if (!newenv)
1511 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -04001512
Elly Jonese1749eb2011-10-07 13:54:59 -04001513 /* Only insert a separating space if we have something to separate... */
1514 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1515 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -04001516
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001517 /* setenv() makes a copy of the string we give it. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001518 setenv(kLdPreloadEnvVar, newenv, 1);
1519 free(newenv);
1520 return 0;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001521#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04001522}
1523
Will Drewry6ac91122011-10-21 16:38:58 -05001524int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04001525{
1526 int r = pipe(fds);
1527 char fd_buf[11];
1528 if (r)
1529 return r;
1530 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1531 if (r <= 0)
1532 return -EINVAL;
1533 setenv(kFdEnvVar, fd_buf, 1);
1534 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -05001535}
1536
/*
 * Keeps one end of the pipe |fds| and closes the other.
 *
 * |index| selects the end to keep (0 or 1). Returns the kept descriptor, or
 * -1 without touching the pipe when |index| is out of range.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other;
	int kept;

	if (index != 0 && index != 1)
		return -1;

	other = 1 - index;
	kept = fds[index];
	close(fds[other]);
	return kept;
}
1545
/*
 * Closes the unselected end of the pipe |fds| and duplicates the selected
 * end (|index|, 0 or 1) onto descriptor |fd|.
 *
 * Returns the result of dup2(2), or -1 without touching the pipe when
 * |index| is out of range.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	other = 1 - index;
	close(fds[other]);

	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
1555
/*
 * Forward declaration: the public minijail_run*() wrappers below are defined
 * before the shared implementation they all delegate to.
 */
int minijail_run_internal(struct minijail *j,
			  const char *filename,
			  char *const argv[],
			  pid_t *pchild_pid,
			  int *pstdin_fd,
			  int *pstdout_fd,
			  int *pstderr_fd,
			  int use_preload);
1560
Will Drewry6ac91122011-10-21 16:38:58 -05001561int API minijail_run(struct minijail *j, const char *filename,
1562 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04001563{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001564 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1565 true);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001566}
1567
1568int API minijail_run_pid(struct minijail *j, const char *filename,
1569 char *const argv[], pid_t *pchild_pid)
1570{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001571 return minijail_run_internal(j, filename, argv, pchild_pid,
1572 NULL, NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001573}
1574
1575int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001576 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001577{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001578 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1579 NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001580}
1581
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001582int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001583 char *const argv[], pid_t *pchild_pid,
1584 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001585{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001586 return minijail_run_internal(j, filename, argv, pchild_pid,
1587 pstdin_fd, pstdout_fd, pstderr_fd, true);
1588}
1589
1590int API minijail_run_no_preload(struct minijail *j, const char *filename,
1591 char *const argv[])
1592{
1593 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1594 false);
1595}
1596
Samuel Tan63187f42015-10-16 13:01:53 -07001597int API minijail_run_pid_pipes_no_preload(struct minijail *j,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001598 const char *filename,
1599 char *const argv[],
Samuel Tan63187f42015-10-16 13:01:53 -07001600 pid_t *pchild_pid,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001601 int *pstdin_fd, int *pstdout_fd,
1602 int *pstderr_fd) {
Samuel Tan63187f42015-10-16 13:01:53 -07001603 return minijail_run_internal(j, filename, argv, pchild_pid,
1604 pstdin_fd, pstdout_fd, pstderr_fd, false);
1605}
1606
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001607int minijail_run_internal(struct minijail *j, const char *filename,
1608 char *const argv[], pid_t *pchild_pid,
1609 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1610 int use_preload)
1611{
Elly Jonese1749eb2011-10-07 13:54:59 -04001612 char *oldenv, *oldenv_copy = NULL;
1613 pid_t child_pid;
1614 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001615 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001616 int stdout_fds[2];
1617 int stderr_fds[2];
Dylan Reidce5b55e2016-01-13 11:04:16 -08001618 int child_sync_pipe_fds[2];
1619 int sync_child = 0;
Elly Jonese1749eb2011-10-07 13:54:59 -04001620 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001621 /* We need to remember this across the minijail_preexec() call. */
1622 int pid_namespace = j->flags.pids;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001623 int do_init = j->flags.do_init;
Ben Chan541c7e52011-08-26 14:55:53 -07001624
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001625 if (use_preload) {
1626 oldenv = getenv(kLdPreloadEnvVar);
1627 if (oldenv) {
1628 oldenv_copy = strdup(oldenv);
1629 if (!oldenv_copy)
1630 return -ENOMEM;
1631 }
1632
1633 if (setup_preload())
1634 return -EFAULT;
Elly Jonese1749eb2011-10-07 13:54:59 -04001635 }
Will Drewryf89aef52011-09-16 16:48:57 -05001636
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001637 if (!use_preload) {
1638 if (j->flags.caps)
1639 die("Capabilities are not supported without "
1640 "LD_PRELOAD");
1641 }
Will Drewry2f54b6a2011-09-16 13:45:31 -05001642
Elly Jonesdd3e8512012-01-23 15:13:38 -05001643 /*
Jorge Lucangeli Obes3c84df12015-05-14 17:37:58 -07001644 * Make the process group ID of this process equal to its PID, so that
1645 * both the Minijail process and the jailed process can be killed
1646 * together.
1647 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1648 * the process is already a process group leader.
1649 */
1650 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1651 if (errno != EPERM) {
1652 pdie("setpgid(0, 0)");
1653 }
1654 }
1655
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001656 if (use_preload) {
1657 /*
1658 * Before we fork(2) and execve(2) the child process, we need
1659 * to open a pipe(2) to send the minijail configuration over.
1660 */
1661 if (setup_pipe(pipe_fds))
1662 return -EFAULT;
1663 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001664
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001665 /*
1666 * If we want to write to the child process' standard input,
1667 * create the pipe(2) now.
1668 */
1669 if (pstdin_fd) {
1670 if (pipe(stdin_fds))
1671 return -EFAULT;
1672 }
1673
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001674 /*
1675 * If we want to read from the child process' standard output,
1676 * create the pipe(2) now.
1677 */
1678 if (pstdout_fd) {
1679 if (pipe(stdout_fds))
1680 return -EFAULT;
1681 }
1682
1683 /*
1684 * If we want to read from the child process' standard error,
1685 * create the pipe(2) now.
1686 */
1687 if (pstderr_fd) {
1688 if (pipe(stderr_fds))
1689 return -EFAULT;
1690 }
1691
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001692 /*
1693 * If we want to set up a new uid/gid mapping in the user namespace,
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08001694 * or if we need to add the child process to cgroups, create the pipe(2)
1695 * to sync between parent and child.
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001696 */
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08001697 if (j->flags.userns || j->flags.cgroups) {
Dylan Reidce5b55e2016-01-13 11:04:16 -08001698 sync_child = 1;
1699 if (pipe(child_sync_pipe_fds))
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001700 return -EFAULT;
1701 }
1702
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001703 /*
1704 * Use sys_clone() if and only if we're creating a pid namespace.
Elly Jones761b7412012-06-13 15:49:52 -04001705 *
1706 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1707 *
1708 * In multithreaded programs, there are a bunch of locks inside libc,
1709 * some of which may be held by other threads at the time that we call
1710 * minijail_run_pid(). If we call fork(), glibc does its level best to
1711 * ensure that we hold all of these locks before it calls clone()
1712 * internally and drop them after clone() returns, but when we call
1713 * sys_clone(2) directly, all that gets bypassed and we end up with a
1714 * child address space where some of libc's important locks are held by
1715 * other threads (which did not get cloned, and hence will never release
1716 * those locks). This is okay so long as we call exec() immediately
1717 * after, but a bunch of seemingly-innocent libc functions like setenv()
1718 * take locks.
1719 *
1720 * Hence, only call sys_clone() if we need to, in order to get at pid
1721 * namespacing. If we follow this path, the child's address space might
1722 * have broken locks; you may only call functions that do not acquire
1723 * any locks.
1724 *
1725 * Unfortunately, fork() acquires every lock it can get its hands on, as
1726 * previously detailed, so this function is highly likely to deadlock
1727 * later on (see "deadlock here") if we're multithreaded.
1728 *
1729 * We might hack around this by having the clone()d child (init of the
1730 * pid namespace) return directly, rather than leaving the clone()d
1731 * process hanging around to be init for the new namespace (and having
1732 * its fork()ed child return in turn), but that process would be crippled
1733 * with its libc locks potentially broken. We might try fork()ing in the
1734 * parent before we clone() to ensure that we own all the locks, but
1735 * then we have to have the forked child hanging around consuming
1736 * resources (and possibly having file descriptors / shared memory
1737 * regions / etc attached). We'd need to keep the child around to avoid
1738 * having its children get reparented to init.
1739 *
1740 * TODO(ellyjones): figure out if the "forked child hanging around"
1741 * problem is fixable or not. It would be nice if we worked in this
1742 * case.
1743 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001744 if (pid_namespace) {
1745 int clone_flags = CLONE_NEWPID | SIGCHLD;
1746 if (j->flags.userns)
1747 clone_flags |= CLONE_NEWUSER;
1748 child_pid = syscall(SYS_clone, clone_flags, NULL);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001749 } else {
Elly Jones761b7412012-06-13 15:49:52 -04001750 child_pid = fork();
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001751 }
Elly Jones761b7412012-06-13 15:49:52 -04001752
Elly Jonese1749eb2011-10-07 13:54:59 -04001753 if (child_pid < 0) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001754 if (use_preload) {
1755 free(oldenv_copy);
1756 }
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001757 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04001758 }
Will Drewryf89aef52011-09-16 16:48:57 -05001759
Elly Jonese1749eb2011-10-07 13:54:59 -04001760 if (child_pid) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001761 if (use_preload) {
1762 /* Restore parent's LD_PRELOAD. */
1763 if (oldenv_copy) {
1764 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1765 free(oldenv_copy);
1766 } else {
1767 unsetenv(kLdPreloadEnvVar);
1768 }
1769 unsetenv(kFdEnvVar);
Elly Jonese1749eb2011-10-07 13:54:59 -04001770 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001771
Elly Jonese1749eb2011-10-07 13:54:59 -04001772 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001773
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001774 if (j->flags.pid_file)
1775 write_pid_file(j);
1776
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08001777 if (j->flags.cgroups)
1778 add_to_cgroups(j);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001779
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001780 if (j->flags.userns)
Dylan Reidce5b55e2016-01-13 11:04:16 -08001781 write_ugid_mappings(j);
1782
1783 if (sync_child)
1784 parent_setup_complete(child_sync_pipe_fds);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001785
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001786 if (use_preload) {
1787 /* Send marshalled minijail. */
1788 close(pipe_fds[0]); /* read endpoint */
1789 ret = minijail_to_fd(j, pipe_fds[1]);
1790 close(pipe_fds[1]); /* write endpoint */
1791 if (ret) {
1792 kill(j->initpid, SIGKILL);
1793 die("failed to send marshalled minijail");
1794 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001795 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001796
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001797 if (pchild_pid)
1798 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001799
1800 /*
1801 * If we want to write to the child process' standard input,
1802 * set up the write end of the pipe.
1803 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001804 if (pstdin_fd)
1805 *pstdin_fd = setup_pipe_end(stdin_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001806 1 /* write end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001807
1808 /*
1809 * If we want to read from the child process' standard output,
1810 * set up the read end of the pipe.
1811 */
1812 if (pstdout_fd)
1813 *pstdout_fd = setup_pipe_end(stdout_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001814 0 /* read end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001815
1816 /*
1817 * If we want to read from the child process' standard error,
1818 * set up the read end of the pipe.
1819 */
1820 if (pstderr_fd)
1821 *pstderr_fd = setup_pipe_end(stderr_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001822 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001823
Elly Jonese1749eb2011-10-07 13:54:59 -04001824 return 0;
1825 }
1826 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07001827
Peter Qiu2860c462015-12-16 15:13:06 -08001828 if (j->flags.reset_signal_mask) {
1829 sigset_t signal_mask;
1830 if (sigemptyset(&signal_mask) != 0)
1831 pdie("sigemptyset failed");
1832 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
1833 pdie("sigprocmask failed");
1834 }
1835
Dylan Reidce5b55e2016-01-13 11:04:16 -08001836 if (sync_child)
1837 wait_for_parent_setup(child_sync_pipe_fds);
1838
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001839 if (j->flags.userns)
Dylan Reidce5b55e2016-01-13 11:04:16 -08001840 enter_user_namespace(j);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001841
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001842 /*
1843 * If we want to write to the jailed process' standard input,
1844 * set up the read end of the pipe.
1845 */
1846 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001847 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1848 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001849 die("failed to set up stdin pipe");
1850 }
1851
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001852 /*
1853 * If we want to read from the jailed process' standard output,
1854 * set up the write end of the pipe.
1855 */
1856 if (pstdout_fd) {
1857 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1858 STDOUT_FILENO) < 0)
1859 die("failed to set up stdout pipe");
1860 }
1861
1862 /*
1863 * If we want to read from the jailed process' standard error,
1864 * set up the write end of the pipe.
1865 */
1866 if (pstderr_fd) {
1867 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1868 STDERR_FILENO) < 0)
1869 die("failed to set up stderr pipe");
1870 }
1871
Dylan Reid791f5772015-09-14 20:02:42 -07001872 /* If running an init program, let it decide when/how to mount /proc. */
1873 if (pid_namespace && !do_init)
1874 j->flags.remount_proc_ro = 0;
1875
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001876 if (use_preload) {
1877 /* Strip out flags that cannot be inherited across execve(2). */
1878 minijail_preexec(j);
1879 } else {
1880 j->flags.pids = 0;
1881 }
1882 /* Jail this process, then execve() the target. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001883 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001884
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001885 if (pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001886 /*
1887 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001888 * namespace. We don't want all programs we might exec to have
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001889 * to know how to be init. Normally (do_init == 1) we fork off
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001890 * a child to actually run the program. If |do_init == 0|, we
1891 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04001892 *
1893 * If we're multithreaded, we'll probably deadlock here. See
1894 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04001895 */
1896 child_pid = fork();
1897 if (child_pid < 0)
1898 _exit(child_pid);
1899 else if (child_pid > 0)
1900 init(child_pid); /* never returns */
1901 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001902
Elly Jonesdd3e8512012-01-23 15:13:38 -05001903 /*
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001904 * If we aren't pid-namespaced, or the jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04001905 * calling process
1906 * -> execve()-ing process
1907 * If we are:
1908 * calling process
1909 * -> init()-ing process
1910 * -> execve()-ing process
1911 */
1912 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -04001913}
1914
Will Drewry6ac91122011-10-21 16:38:58 -05001915int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001916{
1917 int st;
1918 if (kill(j->initpid, SIGTERM))
1919 return -errno;
1920 if (waitpid(j->initpid, &st, 0) < 0)
1921 return -errno;
1922 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04001923}
1924
Will Drewry6ac91122011-10-21 16:38:58 -05001925int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001926{
1927 int st;
1928 if (waitpid(j->initpid, &st, 0) < 0)
1929 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001930
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001931 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001932 int error_status = st;
1933 if (WIFSIGNALED(st)) {
1934 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07001935 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001936 j->initpid, signum);
1937 /*
1938 * We return MINIJAIL_ERR_JAIL if the process received
1939 * SIGSYS, which happens when a syscall is blocked by
1940 * seccomp filters.
1941 * If not, we do what bash(1) does:
1942 * $? = 128 + signum
1943 */
1944 if (signum == SIGSYS) {
1945 error_status = MINIJAIL_ERR_JAIL;
1946 } else {
1947 error_status = 128 + signum;
1948 }
1949 }
1950 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001951 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001952
1953 int exit_status = WEXITSTATUS(st);
1954 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07001955 info("child process %d exited with status %d",
1956 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001957
1958 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04001959}
1960
Will Drewry6ac91122011-10-21 16:38:58 -05001961void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001962{
Dylan Reid605ce7f2016-01-19 19:21:00 -08001963 size_t i;
1964
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001965 if (j->flags.seccomp_filter && j->filter_prog) {
1966 free(j->filter_prog->filter);
1967 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04001968 }
Dylan Reid648b2202015-10-23 00:50:00 -07001969 while (j->mounts_head) {
1970 struct mountpoint *m = j->mounts_head;
1971 j->mounts_head = j->mounts_head->next;
1972 free(m->type);
1973 free(m->dest);
1974 free(m->src);
1975 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001976 }
Dylan Reid648b2202015-10-23 00:50:00 -07001977 j->mounts_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001978 if (j->user)
1979 free(j->user);
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -08001980 if (j->suppl_gid_list)
1981 free(j->suppl_gid_list);
Will Drewrybee7ba72011-10-21 20:47:01 -05001982 if (j->chrootdir)
1983 free(j->chrootdir);
Andrew Brestickereac28942015-11-11 16:04:46 -08001984 if (j->alt_syscall_table)
1985 free(j->alt_syscall_table);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001986 for (i = 0; i < j->cgroup_count; ++i)
1987 free(j->cgroups[i]);
Elly Jonese1749eb2011-10-07 13:54:59 -04001988 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001989}