blob: b42ac72f463e90da1abe9e6574a107881fa8b4c3 [file] [log] [blame]
Jorge Lucangeli Obesd613ab22015-03-03 14:22:50 -08001/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Elly Jonescd7a9042011-07-22 13:56:51 -04002 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07008
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08009#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050010#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040011#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070012#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <grp.h>
14#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050015#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050020#include <stdarg.h>
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -070021#include <stdbool.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080022#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040023#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <syscall.h>
27#include <sys/capability.h>
28#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050029#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040030#include <sys/prctl.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070031#include <sys/stat.h>
32#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080033#include <sys/user.h>
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -080034#include <sys/utsname.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040035#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040036#include <unistd.h>
37
38#include "libminijail.h"
39#include "libminijail-private.h"
40
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070041#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080042#include "syscall_filter.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070043#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080044
Lei Zhangeee31552012-10-17 21:27:10 -070045#ifdef HAVE_SECUREBITS_H
46#include <linux/securebits.h>
47#else
48#define SECURE_ALL_BITS 0x15
49#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
50#endif
51
Will Drewry32ac9f52011-08-18 21:36:27 -050052/* Until these are reliably available in linux/prctl.h */
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080053#ifndef PR_SET_SECCOMP
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -070054# define PR_SET_SECCOMP 22
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080055#endif
56
Andrew Brestickereac28942015-11-11 16:04:46 -080057#ifndef PR_ALT_SYSCALL
58# define PR_ALT_SYSCALL 0x43724f53
59#endif
60
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080061/* For seccomp_filter using BPF. */
62#ifndef PR_SET_NO_NEW_PRIVS
63# define PR_SET_NO_NEW_PRIVS 38
64#endif
65#ifndef SECCOMP_MODE_FILTER
66# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
Will Drewry32ac9f52011-08-18 21:36:27 -050067#endif
68
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -070069#ifdef USE_SECCOMP_SOFTFAIL
70# define SECCOMP_SOFTFAIL 1
71#else
72# define SECCOMP_SOFTFAIL 0
73#endif
74
Dylan Reid605ce7f2016-01-19 19:21:00 -080075#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
76
/*
 * One node of the jail's singly linked list of requested mounts.
 * minijail_mount() fills the string members with strdup()ed copies.
 */
struct mountpoint {
	char *src;			/* Mount source. */
	char *dest;			/* Mount target; begins with '/'. */
	char *type;			/* Filesystem type; "" for binds. */
	unsigned long flags;		/* Mount flag bits (MS_*). */
	struct mountpoint *next;	/* Following node, NULL at tail. */
};
84
Will Drewryf89aef52011-09-16 16:48:57 -050085struct minijail {
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -070086 /*
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -070087 * WARNING: if you add a flag here you need to make sure it's
88 * accounted for in minijail_pre{enter|exec}() below.
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -070089 */
Elly Jonese1749eb2011-10-07 13:54:59 -040090 struct {
91 int uid:1;
92 int gid:1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -080093 int usergroups:1;
94 int suppl_gids:1;
Elly Jonese1749eb2011-10-07 13:54:59 -040095 int caps:1;
96 int vfs:1;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070097 int enter_vfs:1;
Elly Jonese1749eb2011-10-07 13:54:59 -040098 int pids:1;
Dylan Reidf7942472015-11-18 17:55:26 -080099 int ipc:1;
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400100 int net:1;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700101 int enter_net:1;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800102 int userns:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400103 int seccomp:1;
Dylan Reid791f5772015-09-14 20:02:42 -0700104 int remount_proc_ro:1;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700105 int no_new_privs:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400106 int seccomp_filter:1;
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700107 int log_seccomp_filter:1;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400108 int chroot:1;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800109 int pivot_root:1;
Lee Campbell11af0622014-05-22 12:36:04 -0700110 int mount_tmp:1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800111 int do_init:1;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800112 int pid_file:1;
Andrew Brestickereac28942015-11-11 16:04:46 -0800113 int alt_syscall:1;
Peter Qiu2860c462015-12-16 15:13:06 -0800114 int reset_signal_mask:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400115 } flags;
116 uid_t uid;
117 gid_t gid;
118 gid_t usergid;
119 char *user;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800120 size_t suppl_gid_count;
121 gid_t *suppl_gid_list;
Elly Jonese1749eb2011-10-07 13:54:59 -0400122 uint64_t caps;
123 pid_t initpid;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700124 int mountns_fd;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700125 int netns_fd;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400126 char *chrootdir;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800127 char *pid_file_path;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800128 char *uidmap;
129 char *gidmap;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800130 size_t filter_len;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800131 struct sock_fprog *filter_prog;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800132 char *alt_syscall_table;
Dylan Reid648b2202015-10-23 00:50:00 -0700133 struct mountpoint *mounts_head;
134 struct mountpoint *mounts_tail;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800135 size_t mounts_count;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800136 char *cgroups[MAX_CGROUPS];
137 size_t cgroup_count;
Will Drewryf89aef52011-09-16 16:48:57 -0500138};
139
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700140/*
141 * Strip out flags meant for the parent.
142 * We keep things that are not inherited across execve(2) (e.g. capabilities),
143 * or are easier to set after execve(2) (e.g. seccomp filters).
144 */
145void minijail_preenter(struct minijail *j)
146{
147 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700148 j->flags.enter_vfs = 0;
Dylan Reid791f5772015-09-14 20:02:42 -0700149 j->flags.remount_proc_ro = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700150 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800151 j->flags.do_init = 0;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800152 j->flags.pid_file = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700153}
154
155/*
156 * Strip out flags meant for the child.
157 * We keep things that are inherited across execve(2).
158 */
159void minijail_preexec(struct minijail *j)
160{
161 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700162 int enter_vfs = j->flags.enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700163 int remount_proc_ro = j->flags.remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800164 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700165 if (j->user)
166 free(j->user);
167 j->user = NULL;
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -0800168 if (j->suppl_gid_list)
169 free(j->suppl_gid_list);
170 j->suppl_gid_list = NULL;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700171 memset(&j->flags, 0, sizeof(j->flags));
172 /* Now restore anything we meant to keep. */
173 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700174 j->flags.enter_vfs = enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700175 j->flags.remount_proc_ro = remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800176 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700177 /* Note, |pids| will already have been used before this call. */
178}
179
/*
 * Returns 1 if the running kernel is older than 3.8.
 * Returns 0 otherwise, including when uname(2) fails or the release
 * string does not begin with "<major>.<minor>".
 */
int seccomp_kernel_support_not_required(void)
{
	int major, minor;
	struct utsname uts;

	if (uname(&uts) == -1)
		return 0;
	if (sscanf(uts.release, "%d.%d", &major, &minor) != 2)
		return 0;
	return (major < 3) || ((major == 3) && (minor < 8));
}
189
/*
 * Tells whether a missing seccomp implementation may be tolerated.
 *
 * Without SECCOMP_SOFTFAIL this is never allowed. With it, soft-fail is
 * always allowed off Android, and on Android only for kernel
 * versions < 3.8.
 *
 * Returns 1 if soft-fail is allowed, 0 otherwise.
 */
int can_softfail(void)
{
#if SECCOMP_SOFTFAIL
	if (is_android())
		return seccomp_kernel_support_not_required() ? 1 : 0;
	return 1;
#else
	return 0;
#endif
}
205
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700206/* Minijail API. */
207
Will Drewry6ac91122011-10-21 16:38:58 -0500208struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400209{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400210 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400211}
212
Will Drewry6ac91122011-10-21 16:38:58 -0500213void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400214{
215 if (uid == 0)
216 die("useless change to uid 0");
217 j->uid = uid;
218 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400219}
220
Will Drewry6ac91122011-10-21 16:38:58 -0500221void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400222{
223 if (gid == 0)
224 die("useless change to gid 0");
225 j->gid = gid;
226 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400227}
228
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800229void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
230 const gid_t *list)
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800231{
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800232 size_t i;
233
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800234 if (j->flags.usergroups)
235 die("cannot inherit *and* set supplementary groups");
236
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800237 if (size == 0) {
238 /* Clear supplementary groups. */
239 j->suppl_gid_list = NULL;
240 j->suppl_gid_count = 0;
241 j->flags.suppl_gids = 1;
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800242 return;
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800243 }
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800244
245 /* Copy the gid_t array. */
246 j->suppl_gid_list = calloc(size, sizeof(gid_t));
247 if (!j->suppl_gid_list) {
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800248 die("failed to allocate internal supplementary group array");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800249 }
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800250 for (i = 0; i < size; i++) {
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800251 j->suppl_gid_list[i] = list[i];
252 }
253 j->suppl_gid_count = size;
254 j->flags.suppl_gids = 1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800255}
256
Will Drewry6ac91122011-10-21 16:38:58 -0500257int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400258{
259 char *buf = NULL;
260 struct passwd pw;
261 struct passwd *ppw = NULL;
262 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
263 if (sz == -1)
264 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400265
Elly Jonesdd3e8512012-01-23 15:13:38 -0500266 /*
267 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400268 * the maximum needed size of the buffer, so we don't have to search.
269 */
270 buf = malloc(sz);
271 if (!buf)
272 return -ENOMEM;
273 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500274 /*
275 * We're safe to free the buffer here. The strings inside pw point
276 * inside buf, but we don't use any of them; this leaves the pointers
277 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
278 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400279 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700280 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400281 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700282 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400283 minijail_change_uid(j, ppw->pw_uid);
284 j->user = strdup(user);
285 if (!j->user)
286 return -ENOMEM;
287 j->usergid = ppw->pw_gid;
288 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400289}
290
Will Drewry6ac91122011-10-21 16:38:58 -0500291int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400292{
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -0700293 char *buf = NULL;
Yabin Cui1b21c8f2015-07-22 10:34:45 -0700294 struct group gr;
295 struct group *pgr = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400296 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
297 if (sz == -1)
298 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400299
Elly Jonesdd3e8512012-01-23 15:13:38 -0500300 /*
301 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400302 * the maximum needed size of the buffer, so we don't have to search.
303 */
304 buf = malloc(sz);
305 if (!buf)
306 return -ENOMEM;
307 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500308 /*
309 * We're safe to free the buffer here. The strings inside gr point
310 * inside buf, but we don't use any of them; this leaves the pointers
311 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
312 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400313 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700314 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400315 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700316 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400317 minijail_change_gid(j, pgr->gr_gid);
318 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400319}
320
Will Drewry6ac91122011-10-21 16:38:58 -0500321void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400322{
323 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400324}
325
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700326void API minijail_no_new_privs(struct minijail *j)
327{
328 j->flags.no_new_privs = 1;
329}
330
Will Drewry6ac91122011-10-21 16:38:58 -0500331void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400332{
333 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500334}
335
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700336void API minijail_log_seccomp_filter_failures(struct minijail *j)
337{
338 j->flags.log_seccomp_filter = 1;
339}
340
Will Drewry6ac91122011-10-21 16:38:58 -0500341void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400342{
343 j->caps = capmask;
344 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400345}
346
Peter Qiu2860c462015-12-16 15:13:06 -0800347void API minijail_reset_signal_mask(struct minijail* j) {
348 j->flags.reset_signal_mask = 1;
349}
350
Will Drewry6ac91122011-10-21 16:38:58 -0500351void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400352{
353 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400354}
355
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700356void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
357{
358 int ns_fd = open(ns_path, O_RDONLY);
359 if (ns_fd < 0) {
360 pdie("failed to open namespace '%s'", ns_path);
361 }
362 j->mountns_fd = ns_fd;
363 j->flags.enter_vfs = 1;
364}
365
Will Drewry6ac91122011-10-21 16:38:58 -0500366void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400367{
Elly Jonese58176c2012-01-23 11:46:17 -0500368 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700369 j->flags.remount_proc_ro = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400370 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800371 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400372}
373
Dylan Reidf7942472015-11-18 17:55:26 -0800374void API minijail_namespace_ipc(struct minijail *j)
375{
376 j->flags.ipc = 1;
377}
378
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400379void API minijail_namespace_net(struct minijail *j)
380{
381 j->flags.net = 1;
382}
383
Dylan Reid1102f5a2015-09-15 11:52:20 -0700384void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
385{
386 int ns_fd = open(ns_path, O_RDONLY);
387 if (ns_fd < 0) {
388 pdie("failed to open namespace '%s'", ns_path);
389 }
390 j->netns_fd = ns_fd;
391 j->flags.enter_net = 1;
392}
393
Dylan Reid791f5772015-09-14 20:02:42 -0700394void API minijail_remount_proc_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400395{
396 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700397 j->flags.remount_proc_ro = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400398}
399
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800400void API minijail_namespace_user(struct minijail *j)
401{
402 j->flags.userns = 1;
403}
404
405int API minijail_uidmap(struct minijail *j, const char *uidmap)
406{
407 j->uidmap = strdup(uidmap);
408 if (!j->uidmap)
409 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800410 char *ch;
411 for (ch = j->uidmap; *ch; ch++) {
412 if (*ch == ',')
413 *ch = '\n';
414 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800415 return 0;
416}
417
418int API minijail_gidmap(struct minijail *j, const char *gidmap)
419{
420 j->gidmap = strdup(gidmap);
421 if (!j->gidmap)
422 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800423 char *ch;
424 for (ch = j->gidmap; *ch; ch++) {
425 if (*ch == ',')
426 *ch = '\n';
427 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800428 return 0;
429}
430
Will Drewry6ac91122011-10-21 16:38:58 -0500431void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400432{
433 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400434}
435
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800436void API minijail_run_as_init(struct minijail *j)
437{
438 /*
439 * Since the jailed program will become 'init' in the new PID namespace,
440 * Minijail does not need to fork an 'init' process.
441 */
442 j->flags.do_init = 0;
443}
444
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700445int API minijail_enter_chroot(struct minijail *j, const char *dir)
446{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400447 if (j->chrootdir)
448 return -EINVAL;
449 j->chrootdir = strdup(dir);
450 if (!j->chrootdir)
451 return -ENOMEM;
452 j->flags.chroot = 1;
453 return 0;
454}
455
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800456int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
457{
458 if (j->chrootdir)
459 return -EINVAL;
460 j->chrootdir = strdup(dir);
461 if (!j->chrootdir)
462 return -ENOMEM;
463 j->flags.pivot_root = 1;
464 return 0;
465}
466
/*
 * Builds "<external_path>/<path_inside_chroot>" in a new heap buffer.
 *
 * Returns the buffer, which the caller must free(), or NULL if it
 * cannot be allocated.
 */
static char *append_external_path(const char *external_path,
				  const char *path_inside_chroot)
{
	char *path;
	size_t pathlen;

	/* One extra char for '/' and one for '\0', hence + 2. */
	pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
	path = malloc(pathlen);
	if (!path)
		return NULL;
	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);

	return path;
}
480
481char API *minijail_get_original_path(struct minijail *j,
482 const char *path_inside_chroot)
483{
Dylan Reid648b2202015-10-23 00:50:00 -0700484 struct mountpoint *b;
Dylan Reida14e08d2015-10-22 21:05:29 -0700485
Dylan Reid648b2202015-10-23 00:50:00 -0700486 b = j->mounts_head;
Dylan Reida14e08d2015-10-22 21:05:29 -0700487 while (b) {
488 /*
489 * If |path_inside_chroot| is the exact destination of a
Dylan Reid648b2202015-10-23 00:50:00 -0700490 * mount, then the original path is exactly the source of
491 * the mount.
Dylan Reida14e08d2015-10-22 21:05:29 -0700492 * for example: "-b /some/path/exe,/chroot/path/exe"
Dylan Reid648b2202015-10-23 00:50:00 -0700493 * mount source = /some/path/exe, mount dest =
494 * /chroot/path/exe Then when getting the original path of
495 * "/chroot/path/exe", the source of that mount,
496 * "/some/path/exe" is what should be returned.
Dylan Reida14e08d2015-10-22 21:05:29 -0700497 */
498 if (!strcmp(b->dest, path_inside_chroot))
499 return strdup(b->src);
500
501 /*
502 * If |path_inside_chroot| is within the destination path of a
Dylan Reid648b2202015-10-23 00:50:00 -0700503 * mount, take the suffix of the chroot path relative to the
504 * mount destination path, and append it to the mount source
505 * path.
Dylan Reida14e08d2015-10-22 21:05:29 -0700506 */
507 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
508 const char *relative_path =
509 path_inside_chroot + strlen(b->dest);
510 return append_external_path(b->src, relative_path);
511 }
512 b = b->next;
513 }
514
515 /* If there is a chroot path, append |path_inside_chroot| to that. */
516 if (j->chrootdir)
517 return append_external_path(j->chrootdir, path_inside_chroot);
518
519 /* No chroot, so the path outside is the same as it is inside. */
520 return strdup(path_inside_chroot);
Dylan Reid08946cc2015-09-16 19:10:57 -0700521}
522
Lee Campbell11af0622014-05-22 12:36:04 -0700523void API minijail_mount_tmp(struct minijail *j)
524{
525 j->flags.mount_tmp = 1;
526}
527
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800528int API minijail_write_pid_file(struct minijail *j, const char *path)
529{
530 j->pid_file_path = strdup(path);
531 if (!j->pid_file_path)
532 return -ENOMEM;
533 j->flags.pid_file = 1;
534 return 0;
535}
536
Dylan Reid605ce7f2016-01-19 19:21:00 -0800537int API minijail_add_to_cgroup(struct minijail *j, const char *path)
538{
539 if (j->cgroup_count >= MAX_CGROUPS)
540 return -ENOMEM;
541 j->cgroups[j->cgroup_count] = strdup(path);
542 if (!j->cgroups[j->cgroup_count])
543 return -ENOMEM;
544 j->cgroup_count++;
545 return 0;
546}
547
Dylan Reid648b2202015-10-23 00:50:00 -0700548int API minijail_mount(struct minijail *j, const char *src, const char *dest,
549 const char *type, unsigned long flags)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700550{
Dylan Reid648b2202015-10-23 00:50:00 -0700551 struct mountpoint *m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400552
553 if (*dest != '/')
554 return -EINVAL;
Dylan Reid648b2202015-10-23 00:50:00 -0700555 m = calloc(1, sizeof(*m));
556 if (!m)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400557 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -0700558 m->dest = strdup(dest);
559 if (!m->dest)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400560 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700561 m->src = strdup(src);
562 if (!m->src)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400563 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700564 m->type = strdup(type);
565 if (!m->type)
566 goto error;
567 m->flags = flags;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400568
Dylan Reid648b2202015-10-23 00:50:00 -0700569 info("mount %s -> %s type %s", src, dest, type);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400570
Elly Jonesdd3e8512012-01-23 15:13:38 -0500571 /*
Dylan Reid648b2202015-10-23 00:50:00 -0700572 * Force vfs namespacing so the mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400573 * containing vfs namespace.
574 */
575 minijail_namespace_vfs(j);
576
Dylan Reid648b2202015-10-23 00:50:00 -0700577 if (j->mounts_tail)
578 j->mounts_tail->next = m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400579 else
Dylan Reid648b2202015-10-23 00:50:00 -0700580 j->mounts_head = m;
581 j->mounts_tail = m;
582 j->mounts_count++;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400583
584 return 0;
585
586error:
Dylan Reid648b2202015-10-23 00:50:00 -0700587 free(m->src);
588 free(m->dest);
589 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400590 return -ENOMEM;
591}
592
Dylan Reid648b2202015-10-23 00:50:00 -0700593int API minijail_bind(struct minijail *j, const char *src, const char *dest,
594 int writeable)
595{
596 unsigned long flags = MS_BIND;
597
598 if (!writeable)
599 flags |= MS_RDONLY;
600
601 return minijail_mount(j, src, dest, "", flags);
602}
603
Will Drewry6ac91122011-10-21 16:38:58 -0500604void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
Elly Jonese1749eb2011-10-07 13:54:59 -0400605{
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700606 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -0800607 if ((errno == EINVAL) && can_softfail()) {
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -0800608 warn("not loading seccomp filter,"
609 " seccomp not supported");
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -0800610 j->flags.seccomp_filter = 0;
611 j->flags.log_seccomp_filter = 0;
612 j->filter_len = 0;
613 j->filter_prog = NULL;
614 j->flags.no_new_privs = 0;
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700615 }
616 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400617 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800618 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700619 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400620 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800621
622 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700623 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
624 die("failed to compile seccomp filter BPF program in '%s'",
625 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800626 }
627
628 j->filter_len = fprog->len;
629 j->filter_prog = fprog;
630
Elly Jonese1749eb2011-10-07 13:54:59 -0400631 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500632}
633
Andrew Brestickereac28942015-11-11 16:04:46 -0800634int API minijail_use_alt_syscall(struct minijail *j, const char *table)
635{
636 j->alt_syscall_table = strdup(table);
637 if (!j->alt_syscall_table)
638 return -ENOMEM;
639 j->flags.alt_syscall = 1;
640 return 0;
641}
642
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* Bytes still free at |buf|. */
	size_t total;		/* Bytes requested so far, written or not. */
	char *buf;		/* Next write position. */
};
648
Will Drewry6ac91122011-10-21 16:38:58 -0500649void marshal_state_init(struct marshal_state *state,
650 char *buf, size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400651{
652 state->available = available;
653 state->buf = buf;
654 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500655}
656
Will Drewry6ac91122011-10-21 16:38:58 -0500657void marshal_append(struct marshal_state *state,
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800658 void *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400659{
660 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500661
Elly Jonese1749eb2011-10-07 13:54:59 -0400662 /* Up to |available| will be written. */
663 if (copy_len) {
664 memcpy(state->buf, src, copy_len);
665 state->buf += copy_len;
666 state->available -= copy_len;
667 }
668 /* |total| will contain the expected length. */
669 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500670}
671
Will Drewry6ac91122011-10-21 16:38:58 -0500672void minijail_marshal_helper(struct marshal_state *state,
673 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400674{
Dylan Reid648b2202015-10-23 00:50:00 -0700675 struct mountpoint *m = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800676 size_t i;
677
Elly Jonese1749eb2011-10-07 13:54:59 -0400678 marshal_append(state, (char *)j, sizeof(*j));
679 if (j->user)
680 marshal_append(state, j->user, strlen(j->user) + 1);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800681 if (j->suppl_gid_list) {
682 marshal_append(state, j->suppl_gid_list,
683 j->suppl_gid_count * sizeof(gid_t));
684 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400685 if (j->chrootdir)
686 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Andrew Brestickereac28942015-11-11 16:04:46 -0800687 if (j->alt_syscall_table) {
688 marshal_append(state, j->alt_syscall_table,
689 strlen(j->alt_syscall_table) + 1);
690 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800691 if (j->flags.seccomp_filter && j->filter_prog) {
692 struct sock_fprog *fp = j->filter_prog;
693 marshal_append(state, (char *)fp->filter,
694 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400695 }
Dylan Reid648b2202015-10-23 00:50:00 -0700696 for (m = j->mounts_head; m; m = m->next) {
697 marshal_append(state, m->src, strlen(m->src) + 1);
698 marshal_append(state, m->dest, strlen(m->dest) + 1);
699 marshal_append(state, m->type, strlen(m->type) + 1);
700 marshal_append(state, (char *)&m->flags, sizeof(m->flags));
Elly Jones51a5b6c2011-10-12 19:09:26 -0400701 }
Dylan Reid605ce7f2016-01-19 19:21:00 -0800702 for (i = 0; i < j->cgroup_count; ++i)
703 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
Will Drewryf89aef52011-09-16 16:48:57 -0500704}
705
Will Drewry6ac91122011-10-21 16:38:58 -0500706size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400707{
708 struct marshal_state state;
709 marshal_state_init(&state, NULL, 0);
710 minijail_marshal_helper(&state, j);
711 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500712}
713
Elly Jonese1749eb2011-10-07 13:54:59 -0400714int minijail_marshal(const struct minijail *j, char *buf, size_t available)
715{
716 struct marshal_state state;
717 marshal_state_init(&state, buf, available);
718 minijail_marshal_helper(&state, j);
719 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500720}
721
/*
 * consumebytes: takes @length bytes off the front of a buffer
 * @length    Number of bytes to take
 * @buf       In: start of the buffer. Out: first byte after the taken span
 * @buflength In: bytes available at @buf. Out: bytes left
 *
 * Returns a pointer to the first taken byte, or NULL (leaving @buf and
 * @buflength untouched) if fewer than @length bytes are available.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *start;

	if (*buflength < length)
		return NULL;

	start = *buf;
	*buflength -= length;
	*buf = start + length;
	return start;
}
739
/*
 * consumestr: consumes a C string from a buffer @buf of length @buflength
 * @buf       Buffer to consume; advanced past the string and its terminator
 * @buflength Length of buffer; reduced by the bytes consumed
 *
 * Returns a pointer to the base of the string, or NULL for errors.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const char *terminator = memchr(*buf, '\0', *buflength);

	if (!terminator)
		/* There's no null-terminator. */
		return NULL;
	/* Consume the string plus its terminating NUL byte. */
	return consumebytes((size_t)(terminator - *buf) + 1, buf, buflength);
}
755
Elly Jonese1749eb2011-10-07 13:54:59 -0400756int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
757{
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800758 size_t i;
759 size_t count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500760 int ret = -EINVAL;
761
Elly Jonese1749eb2011-10-07 13:54:59 -0400762 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500763 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400764 memcpy((void *)j, serialized, sizeof(*j));
765 serialized += sizeof(*j);
766 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500767
Will Drewrybee7ba72011-10-21 20:47:01 -0500768 /* Potentially stale pointers not used as signals. */
Dylan Reid648b2202015-10-23 00:50:00 -0700769 j->mounts_head = NULL;
770 j->mounts_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800771 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500772
Elly Jonese1749eb2011-10-07 13:54:59 -0400773 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400774 char *user = consumestr(&serialized, &length);
775 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500776 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400777 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500778 if (!j->user)
779 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400780 }
Will Drewryf89aef52011-09-16 16:48:57 -0500781
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800782 if (j->suppl_gid_list) { /* stale pointer */
783 if (j->suppl_gid_count > NGROUPS_MAX) {
784 goto bad_gid_list;
785 }
786 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
787 void *gid_list_bytes =
788 consumebytes(gid_list_size, &serialized, &length);
789 if (!gid_list_bytes)
790 goto bad_gid_list;
791
792 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
793 if (!j->suppl_gid_list)
794 goto bad_gid_list;
795
796 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
797 }
798
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400799 if (j->chrootdir) { /* stale pointer */
800 char *chrootdir = consumestr(&serialized, &length);
801 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500802 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400803 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500804 if (!j->chrootdir)
805 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400806 }
807
Andrew Brestickereac28942015-11-11 16:04:46 -0800808 if (j->alt_syscall_table) { /* stale pointer */
809 char *alt_syscall_table = consumestr(&serialized, &length);
810 if (!alt_syscall_table)
811 goto bad_syscall_table;
812 j->alt_syscall_table = strdup(alt_syscall_table);
813 if (!j->alt_syscall_table)
814 goto bad_syscall_table;
815 }
816
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800817 if (j->flags.seccomp_filter && j->filter_len > 0) {
818 size_t ninstrs = j->filter_len;
819 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
820 ninstrs > USHRT_MAX)
821 goto bad_filters;
822
823 size_t program_len = ninstrs * sizeof(struct sock_filter);
824 void *program = consumebytes(program_len, &serialized, &length);
825 if (!program)
826 goto bad_filters;
827
828 j->filter_prog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800829 if (!j->filter_prog)
830 goto bad_filters;
831
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800832 j->filter_prog->len = ninstrs;
833 j->filter_prog->filter = malloc(program_len);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800834 if (!j->filter_prog->filter)
835 goto bad_filter_prog_instrs;
836
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800837 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400838 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400839
Dylan Reid648b2202015-10-23 00:50:00 -0700840 count = j->mounts_count;
841 j->mounts_count = 0;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400842 for (i = 0; i < count; ++i) {
Dylan Reid648b2202015-10-23 00:50:00 -0700843 unsigned long *flags;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400844 const char *dest;
Dylan Reid648b2202015-10-23 00:50:00 -0700845 const char *type;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400846 const char *src = consumestr(&serialized, &length);
847 if (!src)
Dylan Reid648b2202015-10-23 00:50:00 -0700848 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400849 dest = consumestr(&serialized, &length);
850 if (!dest)
Dylan Reid648b2202015-10-23 00:50:00 -0700851 goto bad_mounts;
852 type = consumestr(&serialized, &length);
853 if (!type)
854 goto bad_mounts;
855 flags = consumebytes(sizeof(*flags), &serialized, &length);
856 if (!flags)
857 goto bad_mounts;
858 if (minijail_mount(j, src, dest, type, *flags))
859 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400860 }
861
Dylan Reid605ce7f2016-01-19 19:21:00 -0800862 count = j->cgroup_count;
863 j->cgroup_count = 0;
864 for (i = 0; i < count; ++i) {
865 char *cgroup = consumestr(&serialized, &length);
866 if (!cgroup)
867 goto bad_cgroups;
868 j->cgroups[i] = strdup(cgroup);
869 if (!j->cgroups[i])
870 goto bad_cgroups;
871 ++j->cgroup_count;
872 }
873
Elly Jonese1749eb2011-10-07 13:54:59 -0400874 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500875
Dylan Reid605ce7f2016-01-19 19:21:00 -0800876bad_cgroups:
877 while (j->mounts_head) {
878 struct mountpoint *m = j->mounts_head;
879 j->mounts_head = j->mounts_head->next;
880 free(m->type);
881 free(m->dest);
882 free(m->src);
883 free(m);
884 }
885 for (i = 0; i < j->cgroup_count; ++i)
886 free(j->cgroups[i]);
Dylan Reid648b2202015-10-23 00:50:00 -0700887bad_mounts:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800888 if (j->flags.seccomp_filter && j->filter_len > 0) {
889 free(j->filter_prog->filter);
890 free(j->filter_prog);
891 }
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800892bad_filter_prog_instrs:
893 if (j->filter_prog)
894 free(j->filter_prog);
Will Drewrybee7ba72011-10-21 20:47:01 -0500895bad_filters:
Andrew Brestickereac28942015-11-11 16:04:46 -0800896 if (j->alt_syscall_table)
897 free(j->alt_syscall_table);
898bad_syscall_table:
Will Drewrybee7ba72011-10-21 20:47:01 -0500899 if (j->chrootdir)
900 free(j->chrootdir);
901bad_chrootdir:
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800902 if (j->suppl_gid_list)
903 free(j->suppl_gid_list);
904bad_gid_list:
Will Drewrybee7ba72011-10-21 20:47:01 -0500905 if (j->user)
906 free(j->user);
907clear_pointers:
908 j->user = NULL;
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800909 j->suppl_gid_list = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500910 j->chrootdir = NULL;
Andrew Brestickereac28942015-11-11 16:04:46 -0800911 j->alt_syscall_table = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800912 j->cgroup_count = 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500913out:
914 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500915}
916
Dylan Reidce5b55e2016-01-13 11:04:16 -0800917static void write_ugid_mappings(const struct minijail *j)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800918{
919 int fd, ret, len;
920 size_t sz;
921 char fname[32];
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800922
923 sz = sizeof(fname);
924 if (j->uidmap) {
925 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700926 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800927 die("failed to write file name of uid_map");
928 fd = open(fname, O_WRONLY);
929 if (fd < 0)
930 pdie("failed to open '%s'", fname);
931 len = strlen(j->uidmap);
932 if (write(fd, j->uidmap, len) < len)
933 die("failed to set uid_map");
934 close(fd);
935 }
936 if (j->gidmap) {
937 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700938 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800939 die("failed to write file name of gid_map");
940 fd = open(fname, O_WRONLY);
941 if (fd < 0)
942 pdie("failed to open '%s'", fname);
943 len = strlen(j->gidmap);
944 if (write(fd, j->gidmap, len) < len)
945 die("failed to set gid_map");
946 close(fd);
947 }
Dylan Reidce5b55e2016-01-13 11:04:16 -0800948}
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800949
/*
 * parent_setup_complete: closes both descriptors in @pipe_fds, read end
 * first.
 * @pipe_fds Two-element array of pipe descriptors
 */
static void parent_setup_complete(int *pipe_fds)
{
	int end;

	for (end = 0; end < 2; ++end)
		close(pipe_fds[end]);
}
955
/*
 * wait_for_parent_setup: Called by the child process to wait for any
 * further parent-side setup to complete before continuing.
 * @pipe_fds Two-element array of pipe descriptors; both are closed on return
 *
 * Anything other than a zero-byte read (a read error, or an actual byte
 * arriving) is fatal.
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char sink;
	ssize_t got;

	/* Close our copy of the write end before blocking on the read end. */
	close(pipe_fds[1]);

	/* Wait for parent to complete setup and close the pipe. */
	got = read(pipe_fds[0], &sink, 1);
	if (got != 0)
		die("failed to sync with parent");

	close(pipe_fds[0]);
}
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800971
Dylan Reidce5b55e2016-01-13 11:04:16 -0800972static void enter_user_namespace(const struct minijail *j)
973{
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800974 if (j->uidmap && setresuid(0, 0, 0))
975 pdie("setresuid");
976 if (j->gidmap && setresgid(0, 0, 0))
977 pdie("setresgid");
978}
979
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -0800980/*
981 * mount_one: Applies mounts from @m for @j, recursing as needed.
Dylan Reid648b2202015-10-23 00:50:00 -0700982 * @j Minijail these mounts are for
983 * @m Head of list of mounts
Elly Jones51a5b6c2011-10-12 19:09:26 -0400984 *
985 * Returns 0 for success.
986 */
Dylan Reid648b2202015-10-23 00:50:00 -0700987static int mount_one(const struct minijail *j, struct mountpoint *m)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700988{
Dylan Reid648b2202015-10-23 00:50:00 -0700989 int ret;
990 char *dest;
991 int remount_ro = 0;
992
Elly Jones51a5b6c2011-10-12 19:09:26 -0400993 /* dest has a leading "/" */
Dylan Reid648b2202015-10-23 00:50:00 -0700994 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400995 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -0700996
997 /*
998 * R/O bind mounts have to be remounted since bind and ro can't both be
999 * specified in the original bind mount. Remount R/O after the initial
1000 * mount.
1001 */
1002 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1003 remount_ro = 1;
1004 m->flags &= ~MS_RDONLY;
Elly Jonesa1059632011-12-15 15:17:07 -05001005 }
Dylan Reid648b2202015-10-23 00:50:00 -07001006
1007 ret = mount(m->src, dest, m->type, m->flags, NULL);
1008 if (ret)
1009 pdie("mount: %s -> %s", m->src, dest);
1010
1011 if (remount_ro) {
1012 m->flags |= MS_RDONLY;
1013 ret = mount(m->src, dest, NULL,
1014 m->flags | MS_REMOUNT, NULL);
1015 if (ret)
1016 pdie("bind ro: %s -> %s", m->src, dest);
1017 }
1018
Elly Jones51a5b6c2011-10-12 19:09:26 -04001019 free(dest);
Dylan Reid648b2202015-10-23 00:50:00 -07001020 if (m->next)
1021 return mount_one(j, m->next);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001022 return ret;
1023}
1024
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001025int enter_chroot(const struct minijail *j)
1026{
Elly Jones51a5b6c2011-10-12 19:09:26 -04001027 int ret;
Dylan Reid648b2202015-10-23 00:50:00 -07001028
1029 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
Elly Jones51a5b6c2011-10-12 19:09:26 -04001030 return ret;
1031
1032 if (chroot(j->chrootdir))
1033 return -errno;
1034
1035 if (chdir("/"))
1036 return -errno;
1037
1038 return 0;
1039}
1040
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001041int enter_pivot_root(const struct minijail *j)
1042{
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001043 int ret, oldroot, newroot;
Dylan Reid648b2202015-10-23 00:50:00 -07001044
1045 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001046 return ret;
1047
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001048 /*
1049 * Keep the fd for both old and new root.
1050 * It will be used in fchdir later.
1051 */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001052 oldroot = open("/", O_DIRECTORY | O_RDONLY);
1053 if (oldroot < 0)
1054 pdie("failed to open / for fchdir");
1055 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY);
1056 if (newroot < 0)
1057 pdie("failed to open %s for fchdir", j->chrootdir);
1058
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001059 /*
1060 * To ensure chrootdir is the root of a file system,
1061 * do a self bind mount.
1062 */
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001063 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1064 pdie("failed to bind mount '%s'", j->chrootdir);
1065 if (chdir(j->chrootdir))
1066 return -errno;
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001067 if (syscall(SYS_pivot_root, ".", "."))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001068 pdie("pivot_root");
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001069
1070 /*
1071 * Now the old root is mounted on top of the new root. Use fchdir to
1072 * change to the old root and unmount it.
1073 */
1074 if (fchdir(oldroot))
1075 pdie("failed to fchdir to old /");
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001076 /* The old root might be busy, so use lazy unmount. */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001077 if (umount2(".", MNT_DETACH))
1078 pdie("umount(/)");
1079 /* Change back to the new root. */
1080 if (fchdir(newroot))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001081 return -errno;
1082 if (chroot("/"))
1083 return -errno;
Jorge Lucangeli Obes46a55092015-10-12 15:31:59 -07001084 /* Set correct CWD for getcwd(3). */
1085 if (chdir("/"))
1086 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001087
1088 return 0;
1089}
1090
/*
 * mount_tmp: mounts a tmpfs on /tmp with the options "size=64M,mode=777".
 *
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	static const char tmpfs_options[] = "size=64M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_options);
}
1095
Dylan Reid791f5772015-09-14 20:02:42 -07001096int remount_proc_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001097{
1098 const char *kProcPath = "/proc";
1099 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -05001100 /*
1101 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -04001102 * /proc in our namespace, which means using MS_REMOUNT here would
1103 * mutate our parent's mount as well, even though we're in a VFS
1104 * namespace (!). Instead, remove their mount from our namespace
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001105 * and make our own. However, if we are in a new user namespace, /proc
1106 * is not seen as mounted, so don't return error if umount() fails.
Elly Jonese1749eb2011-10-07 13:54:59 -04001107 */
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001108 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
Elly Jonese1749eb2011-10-07 13:54:59 -04001109 return -errno;
1110 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1111 return -errno;
1112 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -04001113}
1114
/*
 * write_pid_to_path: writes @pid as a decimal number followed by a newline
 * to the file at @path, opened with mode "w".
 * @pid  Process id to record
 * @path File to write
 *
 * Failure to open, write or close the file terminates via pdie().
 */
static void write_pid_to_path(pid_t pid, const char *path)
{
	FILE *out = fopen(path, "w");
	int rc;

	if (out == NULL)
		pdie("failed to open '%s'", path);

	rc = fprintf(out, "%d\n", (int)pid);
	if (rc < 0)
		pdie("fprintf(%s)", path);

	rc = fclose(out);
	if (rc != 0)
		pdie("fclose(%s)", path);
}
1126
1127static void write_pid_file(const struct minijail *j)
1128{
1129 write_pid_to_path(j->initpid, j->pid_file_path);
1130}
1131
1132static void assign_cgroups(const struct minijail *j)
1133{
1134 size_t i;
1135
1136 for (i = 0; i < j->cgroup_count; ++i)
1137 write_pid_to_path(j->initpid, j->cgroups[i]);
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001138}
1139
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001140void drop_ugid(const struct minijail *j)
1141{
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001142 if (j->flags.usergroups && j->flags.suppl_gids) {
1143 die("tried to inherit *and* set supplementary groups;"
1144 " can only do one");
1145 }
1146
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001147 if (j->flags.usergroups) {
1148 if (initgroups(j->user, j->usergid))
1149 pdie("initgroups");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001150 } else if (j->flags.suppl_gids) {
1151 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1152 pdie("setgroups");
1153 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001154 } else {
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001155 /*
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001156 * Only attempt to clear supplementary groups if we are changing
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001157 * users.
1158 */
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001159 if ((j->uid || j->gid) && setgroups(0, NULL))
1160 pdie("setgroups");
1161 }
1162
1163 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1164 pdie("setresgid");
1165
1166 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1167 pdie("setresuid");
1168}
1169
/*
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we suck up the answer via /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Returns the number of the last valid capability. Failure to open or parse
 * the /proc file terminates via pdie().
 */
static unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;
	if (is_android()) {
		/* Probe upwards until the kernel rejects a capability number. */
		for (; prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0;
		     ++last_valid_cap)
			;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* Passing a NULL stream to fscanf() would crash; fail loudly. */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_valid_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}
	return last_valid_cap;
}
1200
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -07001201void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
Elly Jonese1749eb2011-10-07 13:54:59 -04001202{
1203 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -08001204 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -08001205 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -04001206 unsigned int i;
1207 if (!caps)
1208 die("can't get process caps");
1209 if (cap_clear_flag(caps, CAP_INHERITABLE))
1210 die("can't clear inheritable caps");
1211 if (cap_clear_flag(caps, CAP_EFFECTIVE))
1212 die("can't clear effective caps");
1213 if (cap_clear_flag(caps, CAP_PERMITTED))
1214 die("can't clear permitted caps");
Dylan Reidf682d472015-09-17 21:39:07 -07001215 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cook323878a2013-02-05 15:35:24 -08001216 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001217 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -04001218 continue;
Kees Cook323878a2013-02-05 15:35:24 -08001219 flag[0] = i;
1220 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001221 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -08001222 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001223 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -08001224 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001225 die("can't add inheritable cap");
1226 }
1227 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -08001228 die("can't apply initial cleaned capset");
1229
1230 /*
1231 * Instead of dropping bounding set first, do it here in case
1232 * the caller had a more permissive bounding set which could
1233 * have been used above to raise a capability that wasn't already
1234 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1235 */
Dylan Reidf682d472015-09-17 21:39:07 -07001236 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cooke5609ac2013-02-06 14:12:41 -08001237 if (j->caps & (one << i))
Elly Jonese1749eb2011-10-07 13:54:59 -04001238 continue;
1239 if (prctl(PR_CAPBSET_DROP, i))
1240 pdie("prctl(PR_CAPBSET_DROP)");
1241 }
Kees Cook323878a2013-02-05 15:35:24 -08001242
1243 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001244 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -08001245 flag[0] = CAP_SETPCAP;
1246 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1247 die("can't clear effective cap");
1248 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1249 die("can't clear permitted cap");
1250 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1251 die("can't clear inheritable cap");
1252 }
1253
1254 if (cap_set_proc(caps))
1255 die("can't apply final cleaned capset");
1256
1257 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -04001258}
1259
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001260void set_seccomp_filter(const struct minijail *j)
1261{
1262 /*
1263 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1264 * in the kernel source tree for an explanation of the parameters.
1265 */
1266 if (j->flags.no_new_privs) {
1267 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1268 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1269 }
1270
1271 /*
1272 * If we're logging seccomp filter failures,
1273 * install the SIGSYS handler first.
1274 */
1275 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1276 if (install_sigsys_handler())
1277 pdie("install SIGSYS handler");
1278 warn("logging seccomp filter failures");
1279 }
1280
1281 /*
1282 * Install the syscall filter.
1283 */
1284 if (j->flags.seccomp_filter) {
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001285 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1286 j->filter_prog)) {
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -08001287 if ((errno == EINVAL) && can_softfail()) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001288 warn("seccomp not supported");
1289 return;
1290 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001291 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001292 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001293 }
1294}
1295
Will Drewry6ac91122011-10-21 16:38:58 -05001296void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001297{
Dylan Reidf682d472015-09-17 21:39:07 -07001298 /*
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001299 * If we're dropping caps, get the last valid cap from /proc now,
1300 * since /proc can be unmounted before drop_caps() is called.
Dylan Reidf682d472015-09-17 21:39:07 -07001301 */
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001302 unsigned int last_valid_cap = 0;
1303 if (j->flags.caps)
1304 last_valid_cap = get_last_valid_cap();
Dylan Reidf682d472015-09-17 21:39:07 -07001305
Elly Jonese1749eb2011-10-07 13:54:59 -04001306 if (j->flags.pids)
1307 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001308 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -04001309
Elly Jonese1749eb2011-10-07 13:54:59 -04001310 if (j->flags.usergroups && !j->user)
1311 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -04001312
Elly Jonesdd3e8512012-01-23 15:13:38 -05001313 /*
1314 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -04001315 * so we don't even try. If any of our operations fail, we abort() the
1316 * entire process.
1317 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001318 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1319 pdie("setns(CLONE_NEWNS)");
1320
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001321 if (j->flags.vfs) {
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001322 if (unshare(CLONE_NEWNS))
1323 pdie("unshare(vfs)");
1324 /*
1325 * Remount all filesystems as private. If they are shared
1326 * new bind mounts will creep out of our namespace.
1327 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1328 */
1329 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1330 pdie("mount(/, private)");
1331 }
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001332
Dylan Reidf7942472015-11-18 17:55:26 -08001333 if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1334 pdie("unshare(ipc)");
1335 }
1336
Dylan Reid1102f5a2015-09-15 11:52:20 -07001337 if (j->flags.enter_net) {
1338 if (setns(j->netns_fd, CLONE_NEWNET))
1339 pdie("setns(CLONE_NEWNET)");
1340 } else if (j->flags.net && unshare(CLONE_NEWNET)) {
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001341 pdie("unshare(net)");
Dylan Reid1102f5a2015-09-15 11:52:20 -07001342 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001343
Elly Jones51a5b6c2011-10-12 19:09:26 -04001344 if (j->flags.chroot && enter_chroot(j))
1345 pdie("chroot");
1346
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001347 if (j->flags.pivot_root && enter_pivot_root(j))
1348 pdie("pivot_root");
1349
Jorge Lucangeli Obes3901da62015-03-03 13:55:11 -08001350 if (j->flags.mount_tmp && mount_tmp())
Lee Campbell11af0622014-05-22 12:36:04 -07001351 pdie("mount_tmp");
1352
Dylan Reid791f5772015-09-14 20:02:42 -07001353 if (j->flags.remount_proc_ro && remount_proc_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -04001354 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -04001355
Elly Jonese1749eb2011-10-07 13:54:59 -04001356 if (j->flags.caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001357 /*
1358 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -04001359 * capability to change uids, our attempt to use setuid()
1360 * below will fail. Hang on to root caps across setuid(), then
1361 * lock securebits.
1362 */
1363 if (prctl(PR_SET_KEEPCAPS, 1))
1364 pdie("prctl(PR_SET_KEEPCAPS)");
1365 if (prctl
1366 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
1367 pdie("prctl(PR_SET_SECUREBITS)");
1368 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001369
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001370 /*
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001371 * If we're setting no_new_privs, we can drop privileges
1372 * before setting seccomp filter. This way filter policies
1373 * don't need to allow privilege-dropping syscalls.
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001374 */
1375 if (j->flags.no_new_privs) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001376 drop_ugid(j);
1377 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001378 drop_caps(j, last_valid_cap);
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001379
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001380 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04001381 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001382 /*
1383 * If we're not setting no_new_privs,
1384 * we need to set seccomp filter *before* dropping privileges.
1385 * WARNING: this means that filter policies *must* allow
1386 * setgroups()/setresgid()/setresuid() for dropping root and
1387 * capget()/capset()/prctl() for dropping caps.
1388 */
1389 set_seccomp_filter(j);
1390
1391 drop_ugid(j);
1392 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001393 drop_caps(j, last_valid_cap);
Elly Jonese1749eb2011-10-07 13:54:59 -04001394 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001395
Elly Jonesdd3e8512012-01-23 15:13:38 -05001396 /*
Andrew Brestickereac28942015-11-11 16:04:46 -08001397 * Select the specified alternate syscall table. The table must not
1398 * block prctl(2) if we're using seccomp as well.
1399 */
1400 if (j->flags.alt_syscall) {
1401 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1402 pdie("prctl(PR_ALT_SYSCALL)");
1403 }
1404
1405 /*
Elly Jonesdd3e8512012-01-23 15:13:38 -05001406 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -04001407 * privilege-dropping syscalls :)
1408 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001409 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
Jeff Vander Stoep2885bef2016-01-11 15:22:42 -08001410 if ((errno == EINVAL) && can_softfail()) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001411 warn("seccomp not supported");
1412 return;
1413 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001414 pdie("prctl(PR_SET_SECCOMP)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001415 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001416}
1417
/* TODO(wad) will visibility affect this variable? */
static int init_exitstatus = 0;

/*
 * init_term: signal handler that exits immediately with the current value
 * of init_exitstatus.
 * @sig Signal number; ignored
 */
void init_term(int sig)
{
	(void)sig;
	_exit(init_exitstatus);
}
1425
Will Drewry6ac91122011-10-21 16:38:58 -05001426int init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -04001427{
1428 pid_t pid;
1429 int status;
1430 /* so that we exit with the right status */
1431 signal(SIGTERM, init_term);
1432 /* TODO(wad) self jail with seccomp_filters here. */
1433 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001434 /*
1435 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -04001436 * left inside our pid namespace or we get a signal.
1437 */
1438 if (pid == rootpid)
1439 init_exitstatus = status;
1440 }
1441 if (!WIFEXITED(init_exitstatus))
1442 _exit(MINIJAIL_ERR_INIT);
1443 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -04001444}
1445
Will Drewry6ac91122011-10-21 16:38:58 -05001446int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001447{
1448 size_t sz = 0;
1449 size_t bytes = read(fd, &sz, sizeof(sz));
1450 char *buf;
1451 int r;
1452 if (sizeof(sz) != bytes)
1453 return -EINVAL;
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001454 if (sz > USHRT_MAX) /* arbitrary sanity check */
Elly Jonese1749eb2011-10-07 13:54:59 -04001455 return -E2BIG;
1456 buf = malloc(sz);
1457 if (!buf)
1458 return -ENOMEM;
1459 bytes = read(fd, buf, sz);
1460 if (bytes != sz) {
1461 free(buf);
1462 return -EINVAL;
1463 }
1464 r = minijail_unmarshal(j, buf, sz);
1465 free(buf);
1466 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001467}
1468
Will Drewry6ac91122011-10-21 16:38:58 -05001469int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04001470{
1471 char *buf;
1472 size_t sz = minijail_size(j);
1473 ssize_t written;
1474 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -04001475
Elly Jonese1749eb2011-10-07 13:54:59 -04001476 if (!sz)
1477 return -EINVAL;
1478 buf = malloc(sz);
1479 r = minijail_marshal(j, buf, sz);
1480 if (r) {
1481 free(buf);
1482 return r;
1483 }
1484 /* Sends [size][minijail]. */
1485 written = write(fd, &sz, sizeof(sz));
1486 if (written != sizeof(sz)) {
1487 free(buf);
1488 return -EFAULT;
1489 }
1490 written = write(fd, buf, sz);
1491 if (written < 0 || (size_t) written != sz) {
1492 free(buf);
1493 return -EFAULT;
1494 }
1495 free(buf);
1496 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001497}
Elly Jonescd7a9042011-07-22 13:56:51 -04001498
Will Drewry6ac91122011-10-21 16:38:58 -05001499int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -04001500{
Daniel Erat5b7a3182015-08-19 16:06:22 -06001501#if defined(__ANDROID__)
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001502 /* Don't use LDPRELOAD on Brillo. */
1503 return 0;
1504#else
Elly Jonese1749eb2011-10-07 13:54:59 -04001505 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1506 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1507 if (!newenv)
1508 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -04001509
Elly Jonese1749eb2011-10-07 13:54:59 -04001510 /* Only insert a separating space if we have something to separate... */
1511 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1512 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -04001513
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001514 /* setenv() makes a copy of the string we give it. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001515 setenv(kLdPreloadEnvVar, newenv, 1);
1516 free(newenv);
1517 return 0;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001518#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04001519}
1520
Will Drewry6ac91122011-10-21 16:38:58 -05001521int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04001522{
1523 int r = pipe(fds);
1524 char fd_buf[11];
1525 if (r)
1526 return r;
1527 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1528 if (r <= 0)
1529 return -EINVAL;
1530 setenv(kFdEnvVar, fd_buf, 1);
1531 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -05001532}
1533
/*
 * Keeps one end of the pipe |fds| and closes the other.
 *
 * |index| selects the end to keep (0 or 1). Returns the kept descriptor, or
 * -1 without touching the pipe if |index| is out of range.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	other = 1 - index;
	close(fds[other]);
	return fds[index];
}
1542
/*
 * Closes the unused end of the pipe |fds| and duplicates the end selected by
 * |index| (0 or 1) onto |fd|.
 *
 * Returns the result of dup2(2), or -1 without touching the pipe if |index|
 * is out of range.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	int kept;

	if (index != 0 && index != 1)
		return -1;

	kept = fds[index];
	close(fds[1 - index]);
	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(kept, fd);
}
1552
/*
 * Common implementation behind the minijail_run*() entry points.
 *
 * |pchild_pid|, |pstdin_fd|, |pstdout_fd| and |pstderr_fd| are optional
 * outputs and may be NULL. |use_preload| selects whether the LD_PRELOAD
 * mechanism is used to carry the jail across execve(2).
 */
int minijail_run_internal(struct minijail *j,
			  const char *filename,
			  char *const argv[],
			  pid_t *pchild_pid,
			  int *pstdin_fd,
			  int *pstdout_fd,
			  int *pstderr_fd,
			  int use_preload);
1557
Will Drewry6ac91122011-10-21 16:38:58 -05001558int API minijail_run(struct minijail *j, const char *filename,
1559 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04001560{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001561 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1562 true);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001563}
1564
1565int API minijail_run_pid(struct minijail *j, const char *filename,
1566 char *const argv[], pid_t *pchild_pid)
1567{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001568 return minijail_run_internal(j, filename, argv, pchild_pid,
1569 NULL, NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001570}
1571
1572int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001573 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001574{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001575 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1576 NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001577}
1578
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001579int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001580 char *const argv[], pid_t *pchild_pid,
1581 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001582{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001583 return minijail_run_internal(j, filename, argv, pchild_pid,
1584 pstdin_fd, pstdout_fd, pstderr_fd, true);
1585}
1586
1587int API minijail_run_no_preload(struct minijail *j, const char *filename,
1588 char *const argv[])
1589{
1590 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1591 false);
1592}
1593
Samuel Tan63187f42015-10-16 13:01:53 -07001594int API minijail_run_pid_pipes_no_preload(struct minijail *j,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001595 const char *filename,
1596 char *const argv[],
Samuel Tan63187f42015-10-16 13:01:53 -07001597 pid_t *pchild_pid,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001598 int *pstdin_fd, int *pstdout_fd,
1599 int *pstderr_fd) {
Samuel Tan63187f42015-10-16 13:01:53 -07001600 return minijail_run_internal(j, filename, argv, pchild_pid,
1601 pstdin_fd, pstdout_fd, pstderr_fd, false);
1602}
1603
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001604int minijail_run_internal(struct minijail *j, const char *filename,
1605 char *const argv[], pid_t *pchild_pid,
1606 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1607 int use_preload)
1608{
Elly Jonese1749eb2011-10-07 13:54:59 -04001609 char *oldenv, *oldenv_copy = NULL;
1610 pid_t child_pid;
1611 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001612 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001613 int stdout_fds[2];
1614 int stderr_fds[2];
Dylan Reidce5b55e2016-01-13 11:04:16 -08001615 int child_sync_pipe_fds[2];
1616 int sync_child = 0;
Elly Jonese1749eb2011-10-07 13:54:59 -04001617 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001618 /* We need to remember this across the minijail_preexec() call. */
1619 int pid_namespace = j->flags.pids;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001620 int do_init = j->flags.do_init;
Ben Chan541c7e52011-08-26 14:55:53 -07001621
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001622 if (use_preload) {
1623 oldenv = getenv(kLdPreloadEnvVar);
1624 if (oldenv) {
1625 oldenv_copy = strdup(oldenv);
1626 if (!oldenv_copy)
1627 return -ENOMEM;
1628 }
1629
1630 if (setup_preload())
1631 return -EFAULT;
Elly Jonese1749eb2011-10-07 13:54:59 -04001632 }
Will Drewryf89aef52011-09-16 16:48:57 -05001633
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001634 if (!use_preload) {
1635 if (j->flags.caps)
1636 die("Capabilities are not supported without "
1637 "LD_PRELOAD");
1638 }
Will Drewry2f54b6a2011-09-16 13:45:31 -05001639
Elly Jonesdd3e8512012-01-23 15:13:38 -05001640 /*
Jorge Lucangeli Obes3c84df12015-05-14 17:37:58 -07001641 * Make the process group ID of this process equal to its PID, so that
1642 * both the Minijail process and the jailed process can be killed
1643 * together.
1644 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1645 * the process is already a process group leader.
1646 */
1647 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1648 if (errno != EPERM) {
1649 pdie("setpgid(0, 0)");
1650 }
1651 }
1652
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001653 if (use_preload) {
1654 /*
1655 * Before we fork(2) and execve(2) the child process, we need
1656 * to open a pipe(2) to send the minijail configuration over.
1657 */
1658 if (setup_pipe(pipe_fds))
1659 return -EFAULT;
1660 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001661
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001662 /*
1663 * If we want to write to the child process' standard input,
1664 * create the pipe(2) now.
1665 */
1666 if (pstdin_fd) {
1667 if (pipe(stdin_fds))
1668 return -EFAULT;
1669 }
1670
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001671 /*
1672 * If we want to read from the child process' standard output,
1673 * create the pipe(2) now.
1674 */
1675 if (pstdout_fd) {
1676 if (pipe(stdout_fds))
1677 return -EFAULT;
1678 }
1679
1680 /*
1681 * If we want to read from the child process' standard error,
1682 * create the pipe(2) now.
1683 */
1684 if (pstderr_fd) {
1685 if (pipe(stderr_fds))
1686 return -EFAULT;
1687 }
1688
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001689 /*
1690 * If we want to set up a new uid/gid mapping in the user namespace,
1691 * create the pipe(2) to sync between parent and child.
1692 */
Dylan Reid605ce7f2016-01-19 19:21:00 -08001693 if (j->flags.userns || j->cgroup_count) {
Dylan Reidce5b55e2016-01-13 11:04:16 -08001694 sync_child = 1;
1695 if (pipe(child_sync_pipe_fds))
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001696 return -EFAULT;
1697 }
1698
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001699 /*
1700 * Use sys_clone() if and only if we're creating a pid namespace.
Elly Jones761b7412012-06-13 15:49:52 -04001701 *
1702 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1703 *
1704 * In multithreaded programs, there are a bunch of locks inside libc,
1705 * some of which may be held by other threads at the time that we call
1706 * minijail_run_pid(). If we call fork(), glibc does its level best to
1707 * ensure that we hold all of these locks before it calls clone()
1708 * internally and drop them after clone() returns, but when we call
1709 * sys_clone(2) directly, all that gets bypassed and we end up with a
1710 * child address space where some of libc's important locks are held by
1711 * other threads (which did not get cloned, and hence will never release
1712 * those locks). This is okay so long as we call exec() immediately
1713 * after, but a bunch of seemingly-innocent libc functions like setenv()
1714 * take locks.
1715 *
1716 * Hence, only call sys_clone() if we need to, in order to get at pid
1717 * namespacing. If we follow this path, the child's address space might
1718 * have broken locks; you may only call functions that do not acquire
1719 * any locks.
1720 *
1721 * Unfortunately, fork() acquires every lock it can get its hands on, as
1722 * previously detailed, so this function is highly likely to deadlock
1723 * later on (see "deadlock here") if we're multithreaded.
1724 *
1725 * We might hack around this by having the clone()d child (init of the
1726 * pid namespace) return directly, rather than leaving the clone()d
1727 * process hanging around to be init for the new namespace (and having
1728 * its fork()ed child return in turn), but that process would be crippled
1729 * with its libc locks potentially broken. We might try fork()ing in the
1730 * parent before we clone() to ensure that we own all the locks, but
1731 * then we have to have the forked child hanging around consuming
1732 * resources (and possibly having file descriptors / shared memory
1733 * regions / etc attached). We'd need to keep the child around to avoid
1734 * having its children get reparented to init.
1735 *
1736 * TODO(ellyjones): figure out if the "forked child hanging around"
1737 * problem is fixable or not. It would be nice if we worked in this
1738 * case.
1739 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001740 if (pid_namespace) {
1741 int clone_flags = CLONE_NEWPID | SIGCHLD;
1742 if (j->flags.userns)
1743 clone_flags |= CLONE_NEWUSER;
1744 child_pid = syscall(SYS_clone, clone_flags, NULL);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001745 } else {
Elly Jones761b7412012-06-13 15:49:52 -04001746 child_pid = fork();
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001747 }
Elly Jones761b7412012-06-13 15:49:52 -04001748
Elly Jonese1749eb2011-10-07 13:54:59 -04001749 if (child_pid < 0) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001750 if (use_preload) {
1751 free(oldenv_copy);
1752 }
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001753 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04001754 }
Will Drewryf89aef52011-09-16 16:48:57 -05001755
Elly Jonese1749eb2011-10-07 13:54:59 -04001756 if (child_pid) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001757 if (use_preload) {
1758 /* Restore parent's LD_PRELOAD. */
1759 if (oldenv_copy) {
1760 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1761 free(oldenv_copy);
1762 } else {
1763 unsetenv(kLdPreloadEnvVar);
1764 }
1765 unsetenv(kFdEnvVar);
Elly Jonese1749eb2011-10-07 13:54:59 -04001766 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001767
Elly Jonese1749eb2011-10-07 13:54:59 -04001768 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001769
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001770 if (j->flags.pid_file)
1771 write_pid_file(j);
1772
Dylan Reid605ce7f2016-01-19 19:21:00 -08001773 assign_cgroups(j);
1774
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001775 if (j->flags.userns)
Dylan Reidce5b55e2016-01-13 11:04:16 -08001776 write_ugid_mappings(j);
1777
1778 if (sync_child)
1779 parent_setup_complete(child_sync_pipe_fds);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001780
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001781 if (use_preload) {
1782 /* Send marshalled minijail. */
1783 close(pipe_fds[0]); /* read endpoint */
1784 ret = minijail_to_fd(j, pipe_fds[1]);
1785 close(pipe_fds[1]); /* write endpoint */
1786 if (ret) {
1787 kill(j->initpid, SIGKILL);
1788 die("failed to send marshalled minijail");
1789 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001790 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001791
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001792 if (pchild_pid)
1793 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001794
1795 /*
1796 * If we want to write to the child process' standard input,
1797 * set up the write end of the pipe.
1798 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001799 if (pstdin_fd)
1800 *pstdin_fd = setup_pipe_end(stdin_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001801 1 /* write end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001802
1803 /*
1804 * If we want to read from the child process' standard output,
1805 * set up the read end of the pipe.
1806 */
1807 if (pstdout_fd)
1808 *pstdout_fd = setup_pipe_end(stdout_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001809 0 /* read end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001810
1811 /*
1812 * If we want to read from the child process' standard error,
1813 * set up the read end of the pipe.
1814 */
1815 if (pstderr_fd)
1816 *pstderr_fd = setup_pipe_end(stderr_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001817 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001818
Elly Jonese1749eb2011-10-07 13:54:59 -04001819 return 0;
1820 }
1821 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07001822
Peter Qiu2860c462015-12-16 15:13:06 -08001823 if (j->flags.reset_signal_mask) {
1824 sigset_t signal_mask;
1825 if (sigemptyset(&signal_mask) != 0)
1826 pdie("sigemptyset failed");
1827 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
1828 pdie("sigprocmask failed");
1829 }
1830
Dylan Reidce5b55e2016-01-13 11:04:16 -08001831 if (sync_child)
1832 wait_for_parent_setup(child_sync_pipe_fds);
1833
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001834 if (j->flags.userns)
Dylan Reidce5b55e2016-01-13 11:04:16 -08001835 enter_user_namespace(j);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001836
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001837 /*
1838 * If we want to write to the jailed process' standard input,
1839 * set up the read end of the pipe.
1840 */
1841 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001842 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1843 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001844 die("failed to set up stdin pipe");
1845 }
1846
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001847 /*
1848 * If we want to read from the jailed process' standard output,
1849 * set up the write end of the pipe.
1850 */
1851 if (pstdout_fd) {
1852 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1853 STDOUT_FILENO) < 0)
1854 die("failed to set up stdout pipe");
1855 }
1856
1857 /*
1858 * If we want to read from the jailed process' standard error,
1859 * set up the write end of the pipe.
1860 */
1861 if (pstderr_fd) {
1862 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1863 STDERR_FILENO) < 0)
1864 die("failed to set up stderr pipe");
1865 }
1866
Dylan Reid791f5772015-09-14 20:02:42 -07001867 /* If running an init program, let it decide when/how to mount /proc. */
1868 if (pid_namespace && !do_init)
1869 j->flags.remount_proc_ro = 0;
1870
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001871 if (use_preload) {
1872 /* Strip out flags that cannot be inherited across execve(2). */
1873 minijail_preexec(j);
1874 } else {
1875 j->flags.pids = 0;
1876 }
1877 /* Jail this process, then execve() the target. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001878 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001879
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001880 if (pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001881 /*
1882 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001883 * namespace. We don't want all programs we might exec to have
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001884 * to know how to be init. Normally (do_init == 1) we fork off
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001885 * a child to actually run the program. If |do_init == 0|, we
1886 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04001887 *
1888 * If we're multithreaded, we'll probably deadlock here. See
1889 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04001890 */
1891 child_pid = fork();
1892 if (child_pid < 0)
1893 _exit(child_pid);
1894 else if (child_pid > 0)
1895 init(child_pid); /* never returns */
1896 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001897
Elly Jonesdd3e8512012-01-23 15:13:38 -05001898 /*
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001899 * If we aren't pid-namespaced, or the jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04001900 * calling process
1901 * -> execve()-ing process
1902 * If we are:
1903 * calling process
1904 * -> init()-ing process
1905 * -> execve()-ing process
1906 */
1907 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -04001908}
1909
Will Drewry6ac91122011-10-21 16:38:58 -05001910int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001911{
1912 int st;
1913 if (kill(j->initpid, SIGTERM))
1914 return -errno;
1915 if (waitpid(j->initpid, &st, 0) < 0)
1916 return -errno;
1917 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04001918}
1919
Will Drewry6ac91122011-10-21 16:38:58 -05001920int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001921{
1922 int st;
1923 if (waitpid(j->initpid, &st, 0) < 0)
1924 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001925
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001926 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001927 int error_status = st;
1928 if (WIFSIGNALED(st)) {
1929 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07001930 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001931 j->initpid, signum);
1932 /*
1933 * We return MINIJAIL_ERR_JAIL if the process received
1934 * SIGSYS, which happens when a syscall is blocked by
1935 * seccomp filters.
1936 * If not, we do what bash(1) does:
1937 * $? = 128 + signum
1938 */
1939 if (signum == SIGSYS) {
1940 error_status = MINIJAIL_ERR_JAIL;
1941 } else {
1942 error_status = 128 + signum;
1943 }
1944 }
1945 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001946 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001947
1948 int exit_status = WEXITSTATUS(st);
1949 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07001950 info("child process %d exited with status %d",
1951 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001952
1953 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04001954}
1955
Will Drewry6ac91122011-10-21 16:38:58 -05001956void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001957{
Dylan Reid605ce7f2016-01-19 19:21:00 -08001958 size_t i;
1959
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001960 if (j->flags.seccomp_filter && j->filter_prog) {
1961 free(j->filter_prog->filter);
1962 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04001963 }
Dylan Reid648b2202015-10-23 00:50:00 -07001964 while (j->mounts_head) {
1965 struct mountpoint *m = j->mounts_head;
1966 j->mounts_head = j->mounts_head->next;
1967 free(m->type);
1968 free(m->dest);
1969 free(m->src);
1970 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001971 }
Dylan Reid648b2202015-10-23 00:50:00 -07001972 j->mounts_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001973 if (j->user)
1974 free(j->user);
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -08001975 if (j->suppl_gid_list)
1976 free(j->suppl_gid_list);
Will Drewrybee7ba72011-10-21 20:47:01 -05001977 if (j->chrootdir)
1978 free(j->chrootdir);
Andrew Brestickereac28942015-11-11 16:04:46 -08001979 if (j->alt_syscall_table)
1980 free(j->alt_syscall_table);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001981 for (i = 0; i < j->cgroup_count; ++i)
1982 free(j->cgroups[i]);
Elly Jonese1749eb2011-10-07 13:54:59 -04001983 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001984}