blob: e8374145ef7a6cd28791b2898ae836ad075dcf2b [file] [log] [blame]
/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
Arthur Gautier7a569072016-04-23 17:25:20 +00007#define _DEFAULT_SOURCE
Elly Jonescd7a9042011-07-22 13:56:51 -04008#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07009
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080010#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050011#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040012#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070013#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040014#include <grp.h>
15#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050016#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040018#include <pwd.h>
19#include <sched.h>
20#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050021#include <stdarg.h>
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -070022#include <stdbool.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080023#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040024#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27#include <syscall.h>
28#include <sys/capability.h>
29#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050030#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040031#include <sys/prctl.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070032#include <sys/stat.h>
33#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080034#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040035#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040036#include <unistd.h>
37
38#include "libminijail.h"
39#include "libminijail-private.h"
40
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070041#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080042#include "syscall_filter.h"
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -040043#include "syscall_wrapper.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070044#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080045
Lei Zhangeee31552012-10-17 21:27:10 -070046#ifdef HAVE_SECUREBITS_H
Jorge Lucangeli Obesf783b522016-03-14 14:34:10 -070047# include <linux/securebits.h>
Lei Zhangeee31552012-10-17 21:27:10 -070048#else
Jorge Lucangeli Obesf783b522016-03-14 14:34:10 -070049# define SECURE_ALL_BITS 0x55
50# define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
51#endif
52/* For kernels < 4.3. */
53#define OLD_SECURE_ALL_BITS 0x15
54#define OLD_SECURE_ALL_LOCKS (OLD_SECURE_ALL_BITS << 1)
55
/*
 * Assert the value of SECURE_ALL_BITS at compile-time.
 * Brillo devices are currently compiled against 4.4 kernel headers. Kernel 4.3
 * added a new securebit.
 * When a new securebit is added, the new SECURE_ALL_BITS mask will return EPERM
 * when used on older kernels. The compile-time assert will catch this situation
 * at compile time.
 */
64#ifdef __BRILLO__
65_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
Lei Zhangeee31552012-10-17 21:27:10 -070066#endif
67
Jorge Lucangeli Obesf783b522016-03-14 14:34:10 -070068/* Until these are reliably available in linux/prctl.h. */
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080069#ifndef PR_SET_SECCOMP
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -070070# define PR_SET_SECCOMP 22
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080071#endif
72
Andrew Brestickereac28942015-11-11 16:04:46 -080073#ifndef PR_ALT_SYSCALL
74# define PR_ALT_SYSCALL 0x43724f53
75#endif
76
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -040077/* Seccomp filter related flags. */
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080078#ifndef PR_SET_NO_NEW_PRIVS
79# define PR_SET_NO_NEW_PRIVS 38
80#endif
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -040081
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080082#ifndef SECCOMP_MODE_FILTER
83# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
Will Drewry32ac9f52011-08-18 21:36:27 -050084#endif
85
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -040086#ifndef SECCOMP_SET_MODE_STRICT
87# define SECCOMP_SET_MODE_STRICT 0
88#endif
89#ifndef SECCOMP_SET_MODE_FILTER
90# define SECCOMP_SET_MODE_FILTER 1
91#endif
92
93#ifndef SECCOMP_FILTER_FLAG_TSYNC
94# define SECCOMP_FILTER_FLAG_TSYNC 1
95#endif
96/* End seccomp filter related flags. */
97
Dylan Reid4cbc2a52016-06-17 19:06:07 -070098/* New cgroup namespace might not be in linux-headers yet. */
99#ifndef CLONE_NEWCGROUP
100# define CLONE_NEWCGROUP 0x02000000
101#endif
102
Dylan Reid605ce7f2016-01-19 19:21:00 -0800103#define MAX_CGROUPS 10 /* 10 different controllers supported by Linux. */
104
/*
 * One requested mount. Entries are chained through |next| into the singly
 * linked list that starts at minijail.mounts_head.
 */
struct mountpoint {
	char *src;		/* Heap copy of the mount source. */
	char *dest;		/* Heap copy of the mount target (starts with '/'). */
	char *type;		/* Heap copy of the filesystem type string. */
	char *data;		/* Heap copy of the mount data, or NULL if none. */
	int has_data;		/* Nonzero when |data| was supplied. */
	unsigned long flags;	/* Mount flags, e.g. MS_BIND | MS_RDONLY. */
	struct mountpoint *next;	/* Following entry, or NULL at the tail. */
};
114
Will Drewryf89aef52011-09-16 16:48:57 -0500115struct minijail {
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700116 /*
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700117 * WARNING: if you add a flag here you need to make sure it's
118 * accounted for in minijail_pre{enter|exec}() below.
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700119 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400120 struct {
121 int uid:1;
122 int gid:1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800123 int usergroups:1;
124 int suppl_gids:1;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800125 int use_caps:1;
126 int capbset_drop:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400127 int vfs:1;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700128 int enter_vfs:1;
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800129 int skip_remount_private:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400130 int pids:1;
Dylan Reidf7942472015-11-18 17:55:26 -0800131 int ipc:1;
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400132 int net:1;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700133 int enter_net:1;
Dylan Reid4cbc2a52016-06-17 19:06:07 -0700134 int ns_cgroups:1;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800135 int userns:1;
Jorge Lucangeli Obes200299c2016-09-23 15:21:57 -0400136 int disable_setgroups:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400137 int seccomp:1;
Dylan Reid791f5772015-09-14 20:02:42 -0700138 int remount_proc_ro:1;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700139 int no_new_privs:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400140 int seccomp_filter:1;
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400141 int seccomp_filter_tsync:1;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400142 int seccomp_filter_logging:1;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400143 int chroot:1;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800144 int pivot_root:1;
Lee Campbell11af0622014-05-22 12:36:04 -0700145 int mount_tmp:1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800146 int do_init:1;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800147 int pid_file:1;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800148 int cgroups:1;
Andrew Brestickereac28942015-11-11 16:04:46 -0800149 int alt_syscall:1;
Peter Qiu2860c462015-12-16 15:13:06 -0800150 int reset_signal_mask:1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400151 } flags;
152 uid_t uid;
153 gid_t gid;
154 gid_t usergid;
155 char *user;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800156 size_t suppl_gid_count;
157 gid_t *suppl_gid_list;
Elly Jonese1749eb2011-10-07 13:54:59 -0400158 uint64_t caps;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800159 uint64_t cap_bset;
Elly Jonese1749eb2011-10-07 13:54:59 -0400160 pid_t initpid;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700161 int mountns_fd;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700162 int netns_fd;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400163 char *chrootdir;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800164 char *pid_file_path;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800165 char *uidmap;
166 char *gidmap;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800167 size_t filter_len;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800168 struct sock_fprog *filter_prog;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800169 char *alt_syscall_table;
Dylan Reid648b2202015-10-23 00:50:00 -0700170 struct mountpoint *mounts_head;
171 struct mountpoint *mounts_tail;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800172 size_t mounts_count;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800173 char *cgroups[MAX_CGROUPS];
174 size_t cgroup_count;
Will Drewryf89aef52011-09-16 16:48:57 -0500175};
176
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700177/*
178 * Strip out flags meant for the parent.
179 * We keep things that are not inherited across execve(2) (e.g. capabilities),
180 * or are easier to set after execve(2) (e.g. seccomp filters).
181 */
182void minijail_preenter(struct minijail *j)
183{
184 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700185 j->flags.enter_vfs = 0;
Shuhei Takahashi3da40312016-03-07 17:37:49 +0900186 j->flags.skip_remount_private = 0;
Dylan Reid791f5772015-09-14 20:02:42 -0700187 j->flags.remount_proc_ro = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700188 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800189 j->flags.do_init = 0;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800190 j->flags.pid_file = 0;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800191 j->flags.cgroups = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700192}
193
194/*
195 * Strip out flags meant for the child.
196 * We keep things that are inherited across execve(2).
197 */
198void minijail_preexec(struct minijail *j)
199{
200 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700201 int enter_vfs = j->flags.enter_vfs;
Jorge Lucangeli Obes87bf01d2016-03-08 11:20:03 -0800202 int skip_remount_private = j->flags.skip_remount_private;
Dylan Reid791f5772015-09-14 20:02:42 -0700203 int remount_proc_ro = j->flags.remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800204 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700205 if (j->user)
206 free(j->user);
207 j->user = NULL;
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -0800208 if (j->suppl_gid_list)
209 free(j->suppl_gid_list);
210 j->suppl_gid_list = NULL;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700211 memset(&j->flags, 0, sizeof(j->flags));
212 /* Now restore anything we meant to keep. */
213 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700214 j->flags.enter_vfs = enter_vfs;
Jorge Lucangeli Obes87bf01d2016-03-08 11:20:03 -0800215 j->flags.skip_remount_private = skip_remount_private;
Dylan Reid791f5772015-09-14 20:02:42 -0700216 j->flags.remount_proc_ro = remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800217 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700218 /* Note, |pids| will already have been used before this call. */
219}
220
/* Minijail API. */
222
Will Drewry6ac91122011-10-21 16:38:58 -0500223struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400224{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400225 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400226}
227
Will Drewry6ac91122011-10-21 16:38:58 -0500228void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400229{
230 if (uid == 0)
231 die("useless change to uid 0");
232 j->uid = uid;
233 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400234}
235
Will Drewry6ac91122011-10-21 16:38:58 -0500236void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400237{
238 if (gid == 0)
239 die("useless change to gid 0");
240 j->gid = gid;
241 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400242}
243
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800244void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
245 const gid_t *list)
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800246{
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800247 size_t i;
248
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800249 if (j->flags.usergroups)
250 die("cannot inherit *and* set supplementary groups");
251
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800252 if (size == 0) {
253 /* Clear supplementary groups. */
254 j->suppl_gid_list = NULL;
255 j->suppl_gid_count = 0;
256 j->flags.suppl_gids = 1;
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800257 return;
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800258 }
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800259
260 /* Copy the gid_t array. */
261 j->suppl_gid_list = calloc(size, sizeof(gid_t));
262 if (!j->suppl_gid_list) {
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800263 die("failed to allocate internal supplementary group array");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800264 }
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800265 for (i = 0; i < size; i++) {
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800266 j->suppl_gid_list[i] = list[i];
267 }
268 j->suppl_gid_count = size;
269 j->flags.suppl_gids = 1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800270}
271
Will Drewry6ac91122011-10-21 16:38:58 -0500272int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400273{
274 char *buf = NULL;
275 struct passwd pw;
276 struct passwd *ppw = NULL;
277 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
278 if (sz == -1)
279 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400280
Elly Jonesdd3e8512012-01-23 15:13:38 -0500281 /*
282 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400283 * the maximum needed size of the buffer, so we don't have to search.
284 */
285 buf = malloc(sz);
286 if (!buf)
287 return -ENOMEM;
288 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500289 /*
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800290 * We're safe to free the buffer here. The strings inside |pw| point
291 * inside |buf|, but we don't use any of them; this leaves the pointers
Jorge Lucangeli Obes87bf01d2016-03-08 11:20:03 -0800292 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3)
293 * succeeded.
Elly Jonesdd3e8512012-01-23 15:13:38 -0500294 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400295 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700296 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400297 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700298 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400299 minijail_change_uid(j, ppw->pw_uid);
300 j->user = strdup(user);
301 if (!j->user)
302 return -ENOMEM;
303 j->usergid = ppw->pw_gid;
304 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400305}
306
Will Drewry6ac91122011-10-21 16:38:58 -0500307int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400308{
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -0700309 char *buf = NULL;
Yabin Cui1b21c8f2015-07-22 10:34:45 -0700310 struct group gr;
311 struct group *pgr = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400312 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
313 if (sz == -1)
314 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400315
Elly Jonesdd3e8512012-01-23 15:13:38 -0500316 /*
317 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400318 * the maximum needed size of the buffer, so we don't have to search.
319 */
320 buf = malloc(sz);
321 if (!buf)
322 return -ENOMEM;
323 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500324 /*
325 * We're safe to free the buffer here. The strings inside gr point
326 * inside buf, but we don't use any of them; this leaves the pointers
327 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
328 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400329 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700330 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400331 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700332 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400333 minijail_change_gid(j, pgr->gr_gid);
334 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400335}
336
Will Drewry6ac91122011-10-21 16:38:58 -0500337void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400338{
339 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400340}
341
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700342void API minijail_no_new_privs(struct minijail *j)
343{
344 j->flags.no_new_privs = 1;
345}
346
Will Drewry6ac91122011-10-21 16:38:58 -0500347void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400348{
349 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500350}
351
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400352void API minijail_set_seccomp_filter_tsync(struct minijail *j)
353{
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400354 if (j->filter_len > 0 && j->filter_prog != NULL) {
355 die("minijail_set_seccomp_filter_tsync() must be called "
356 "before minijail_parse_seccomp_filters()");
357 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400358 j->flags.seccomp_filter_tsync = 1;
359}
360
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700361void API minijail_log_seccomp_filter_failures(struct minijail *j)
362{
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400363 if (j->filter_len > 0 && j->filter_prog != NULL) {
364 die("minijail_log_seccomp_filter_failures() must be called "
365 "before minijail_parse_seccomp_filters()");
366 }
367 j->flags.seccomp_filter_logging = 1;
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700368}
369
Will Drewry6ac91122011-10-21 16:38:58 -0500370void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400371{
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800372 /*
373 * 'minijail_use_caps' configures a runtime-capabilities-only
374 * environment, including a bounding set matching the thread's runtime
375 * (permitted|inheritable|effective) sets.
376 * Therefore, it will override any existing bounding set configurations
377 * since the latter would allow gaining extra runtime capabilities from
378 * file capabilities.
379 */
380 if (j->flags.capbset_drop) {
381 warn("overriding bounding set configuration");
382 j->cap_bset = 0;
383 j->flags.capbset_drop = 0;
384 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400385 j->caps = capmask;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800386 j->flags.use_caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400387}
388
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800389void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
390{
391 if (j->flags.use_caps) {
392 /*
393 * 'minijail_use_caps' will have already configured a capability
394 * bounding set matching the (permitted|inheritable|effective)
395 * sets. Abort if the user tries to configure a separate
396 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
397 * are mutually exclusive.
398 */
399 die("runtime capabilities already configured, can't drop "
400 "bounding set separately");
401 }
402 j->cap_bset = capmask;
403 j->flags.capbset_drop = 1;
404}
405
406void API minijail_reset_signal_mask(struct minijail *j)
407{
Peter Qiu2860c462015-12-16 15:13:06 -0800408 j->flags.reset_signal_mask = 1;
409}
410
Will Drewry6ac91122011-10-21 16:38:58 -0500411void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400412{
413 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400414}
415
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700416void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
417{
Ricky Zhoubce609d2016-03-02 21:47:56 -0800418 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700419 if (ns_fd < 0) {
420 pdie("failed to open namespace '%s'", ns_path);
421 }
422 j->mountns_fd = ns_fd;
423 j->flags.enter_vfs = 1;
424}
425
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800426void API minijail_skip_remount_private(struct minijail *j)
427{
428 j->flags.skip_remount_private = 1;
429}
430
Will Drewry6ac91122011-10-21 16:38:58 -0500431void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400432{
Elly Jonese58176c2012-01-23 11:46:17 -0500433 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700434 j->flags.remount_proc_ro = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400435 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800436 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400437}
438
Dylan Reidf7942472015-11-18 17:55:26 -0800439void API minijail_namespace_ipc(struct minijail *j)
440{
441 j->flags.ipc = 1;
442}
443
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400444void API minijail_namespace_net(struct minijail *j)
445{
446 j->flags.net = 1;
447}
448
Dylan Reid1102f5a2015-09-15 11:52:20 -0700449void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
450{
Ricky Zhoubce609d2016-03-02 21:47:56 -0800451 int ns_fd = open(ns_path, O_RDONLY | O_CLOEXEC);
Dylan Reid1102f5a2015-09-15 11:52:20 -0700452 if (ns_fd < 0) {
453 pdie("failed to open namespace '%s'", ns_path);
454 }
455 j->netns_fd = ns_fd;
456 j->flags.enter_net = 1;
457}
458
Dylan Reid4cbc2a52016-06-17 19:06:07 -0700459void API minijail_namespace_cgroups(struct minijail *j)
460{
461 j->flags.ns_cgroups = 1;
462}
463
Dylan Reid791f5772015-09-14 20:02:42 -0700464void API minijail_remount_proc_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400465{
466 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700467 j->flags.remount_proc_ro = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400468}
469
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800470void API minijail_namespace_user(struct minijail *j)
471{
472 j->flags.userns = 1;
473}
474
Jorge Lucangeli Obes200299c2016-09-23 15:21:57 -0400475void API minijail_namespace_user_disable_setgroups(struct minijail *j)
476{
477 j->flags.disable_setgroups = 1;
478}
479
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800480int API minijail_uidmap(struct minijail *j, const char *uidmap)
481{
482 j->uidmap = strdup(uidmap);
483 if (!j->uidmap)
484 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800485 char *ch;
486 for (ch = j->uidmap; *ch; ch++) {
487 if (*ch == ',')
488 *ch = '\n';
489 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800490 return 0;
491}
492
493int API minijail_gidmap(struct minijail *j, const char *gidmap)
494{
495 j->gidmap = strdup(gidmap);
496 if (!j->gidmap)
497 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800498 char *ch;
499 for (ch = j->gidmap; *ch; ch++) {
500 if (*ch == ',')
501 *ch = '\n';
502 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800503 return 0;
504}
505
Will Drewry6ac91122011-10-21 16:38:58 -0500506void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400507{
508 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400509}
510
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800511void API minijail_run_as_init(struct minijail *j)
512{
513 /*
514 * Since the jailed program will become 'init' in the new PID namespace,
515 * Minijail does not need to fork an 'init' process.
516 */
517 j->flags.do_init = 0;
518}
519
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700520int API minijail_enter_chroot(struct minijail *j, const char *dir)
521{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400522 if (j->chrootdir)
523 return -EINVAL;
524 j->chrootdir = strdup(dir);
525 if (!j->chrootdir)
526 return -ENOMEM;
527 j->flags.chroot = 1;
528 return 0;
529}
530
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800531int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
532{
533 if (j->chrootdir)
534 return -EINVAL;
535 j->chrootdir = strdup(dir);
536 if (!j->chrootdir)
537 return -ENOMEM;
538 j->flags.pivot_root = 1;
539 return 0;
540}
541
Dylan Reida14e08d2015-10-22 21:05:29 -0700542char API *minijail_get_original_path(struct minijail *j,
543 const char *path_inside_chroot)
544{
Dylan Reid648b2202015-10-23 00:50:00 -0700545 struct mountpoint *b;
Dylan Reida14e08d2015-10-22 21:05:29 -0700546
Dylan Reid648b2202015-10-23 00:50:00 -0700547 b = j->mounts_head;
Dylan Reida14e08d2015-10-22 21:05:29 -0700548 while (b) {
549 /*
550 * If |path_inside_chroot| is the exact destination of a
Dylan Reid648b2202015-10-23 00:50:00 -0700551 * mount, then the original path is exactly the source of
552 * the mount.
Dylan Reida14e08d2015-10-22 21:05:29 -0700553 * for example: "-b /some/path/exe,/chroot/path/exe"
Dylan Reid648b2202015-10-23 00:50:00 -0700554 * mount source = /some/path/exe, mount dest =
555 * /chroot/path/exe Then when getting the original path of
556 * "/chroot/path/exe", the source of that mount,
557 * "/some/path/exe" is what should be returned.
Dylan Reida14e08d2015-10-22 21:05:29 -0700558 */
559 if (!strcmp(b->dest, path_inside_chroot))
560 return strdup(b->src);
561
562 /*
563 * If |path_inside_chroot| is within the destination path of a
Dylan Reid648b2202015-10-23 00:50:00 -0700564 * mount, take the suffix of the chroot path relative to the
565 * mount destination path, and append it to the mount source
566 * path.
Dylan Reida14e08d2015-10-22 21:05:29 -0700567 */
568 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
569 const char *relative_path =
570 path_inside_chroot + strlen(b->dest);
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400571 return path_join(b->src, relative_path);
Dylan Reida14e08d2015-10-22 21:05:29 -0700572 }
573 b = b->next;
574 }
575
576 /* If there is a chroot path, append |path_inside_chroot| to that. */
577 if (j->chrootdir)
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400578 return path_join(j->chrootdir, path_inside_chroot);
Dylan Reida14e08d2015-10-22 21:05:29 -0700579
580 /* No chroot, so the path outside is the same as it is inside. */
581 return strdup(path_inside_chroot);
Dylan Reid08946cc2015-09-16 19:10:57 -0700582}
583
Lee Campbell11af0622014-05-22 12:36:04 -0700584void API minijail_mount_tmp(struct minijail *j)
585{
586 j->flags.mount_tmp = 1;
587}
588
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800589int API minijail_write_pid_file(struct minijail *j, const char *path)
590{
591 j->pid_file_path = strdup(path);
592 if (!j->pid_file_path)
593 return -ENOMEM;
594 j->flags.pid_file = 1;
595 return 0;
596}
597
Dylan Reid605ce7f2016-01-19 19:21:00 -0800598int API minijail_add_to_cgroup(struct minijail *j, const char *path)
599{
600 if (j->cgroup_count >= MAX_CGROUPS)
601 return -ENOMEM;
602 j->cgroups[j->cgroup_count] = strdup(path);
603 if (!j->cgroups[j->cgroup_count])
604 return -ENOMEM;
605 j->cgroup_count++;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800606 j->flags.cgroups = 1;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800607 return 0;
608}
609
Dylan Reid81e23972016-05-18 14:06:35 -0700610int API minijail_mount_with_data(struct minijail *j, const char *src,
611 const char *dest, const char *type,
612 unsigned long flags, const char *data)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700613{
Dylan Reid648b2202015-10-23 00:50:00 -0700614 struct mountpoint *m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400615
616 if (*dest != '/')
617 return -EINVAL;
Dylan Reid648b2202015-10-23 00:50:00 -0700618 m = calloc(1, sizeof(*m));
619 if (!m)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400620 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -0700621 m->dest = strdup(dest);
622 if (!m->dest)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400623 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700624 m->src = strdup(src);
625 if (!m->src)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400626 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700627 m->type = strdup(type);
628 if (!m->type)
629 goto error;
Dylan Reid81e23972016-05-18 14:06:35 -0700630 if (data) {
631 m->data = strdup(data);
632 if (!m->data)
633 goto error;
634 m->has_data = 1;
635 }
Dylan Reid648b2202015-10-23 00:50:00 -0700636 m->flags = flags;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400637
Jorge Lucangeli Obes6c755d22016-01-28 15:24:40 -0800638 info("mount %s -> %s type '%s'", src, dest, type);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400639
Elly Jonesdd3e8512012-01-23 15:13:38 -0500640 /*
Dylan Reid648b2202015-10-23 00:50:00 -0700641 * Force vfs namespacing so the mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400642 * containing vfs namespace.
643 */
644 minijail_namespace_vfs(j);
645
Dylan Reid648b2202015-10-23 00:50:00 -0700646 if (j->mounts_tail)
647 j->mounts_tail->next = m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400648 else
Dylan Reid648b2202015-10-23 00:50:00 -0700649 j->mounts_head = m;
650 j->mounts_tail = m;
651 j->mounts_count++;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400652
653 return 0;
654
655error:
Dylan Reid81e23972016-05-18 14:06:35 -0700656 free(m->type);
Dylan Reid648b2202015-10-23 00:50:00 -0700657 free(m->src);
658 free(m->dest);
659 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400660 return -ENOMEM;
661}
662
Dylan Reid81e23972016-05-18 14:06:35 -0700663int API minijail_mount(struct minijail *j, const char *src, const char *dest,
664 const char *type, unsigned long flags)
665{
666 return minijail_mount_with_data(j, src, dest, type, flags, NULL);
667}
668
Dylan Reid648b2202015-10-23 00:50:00 -0700669int API minijail_bind(struct minijail *j, const char *src, const char *dest,
670 int writeable)
671{
672 unsigned long flags = MS_BIND;
673
674 if (!writeable)
675 flags |= MS_RDONLY;
676
677 return minijail_mount(j, src, dest, "", flags);
678}
679
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400680static void clear_seccomp_options(struct minijail *j)
681{
682 j->flags.seccomp_filter = 0;
683 j->flags.seccomp_filter_tsync = 0;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400684 j->flags.seccomp_filter_logging = 0;
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400685 j->filter_len = 0;
686 j->filter_prog = NULL;
687 j->flags.no_new_privs = 0;
688}
689
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400690static int seccomp_should_parse_filters(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400691{
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400692 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400693 /*
694 * |errno| will be set to EINVAL when seccomp has not been
695 * compiled into the kernel. On certain platforms and kernel
696 * versions this is not a fatal failure. In that case, and only
697 * in that case, disable seccomp and skip loading the filters.
698 */
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400699 if ((errno == EINVAL) && seccomp_can_softfail()) {
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400700 warn("not loading seccomp filters, seccomp filter not "
701 "supported");
702 clear_seccomp_options(j);
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400703 return 0;
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700704 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400705 /*
706 * If |errno| != EINVAL or seccomp_can_softfail() is false,
707 * we can proceed. Worst case scenario minijail_enter() will
708 * abort() if seccomp fails.
709 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700710 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400711 if (j->flags.seccomp_filter_tsync) {
712 /* Are the seccomp(2) syscall and the TSYNC option supported? */
713 if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
714 SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) {
715 int saved_errno = errno;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400716 if (saved_errno == ENOSYS && seccomp_can_softfail()) {
717 warn("seccomp(2) syscall not supported");
718 clear_seccomp_options(j);
719 return 0;
720 } else if (saved_errno == EINVAL &&
721 seccomp_can_softfail()) {
722 warn(
723 "seccomp filter thread sync not supported");
724 clear_seccomp_options(j);
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400725 return 0;
726 }
727 /*
728 * Similar logic here. If seccomp_can_softfail() is
729 * false, or |errno| != ENOSYS, or |errno| != EINVAL,
730 * we can proceed. Worst case scenario minijail_enter()
731 * will abort() if seccomp or TSYNC fail.
732 */
733 }
734 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400735 return 1;
736}
737
738static int parse_seccomp_filters(struct minijail *j, FILE *policy_file)
739{
740 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400741 int use_ret_trap =
742 j->flags.seccomp_filter_tsync || j->flags.seccomp_filter_logging;
743 int allow_logging = j->flags.seccomp_filter_logging;
744
745 if (compile_filter(policy_file, fprog, use_ret_trap, allow_logging)) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400746 free(fprog);
747 return -1;
748 }
749
750 j->filter_len = fprog->len;
751 j->filter_prog = fprog;
752 return 0;
753}
754
755void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
756{
757 if (!seccomp_should_parse_filters(j))
758 return;
759
Elly Jonese1749eb2011-10-07 13:54:59 -0400760 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800761 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700762 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400763 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800764
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400765 if (parse_seccomp_filters(j, file) != 0) {
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700766 die("failed to compile seccomp filter BPF program in '%s'",
767 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800768 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400769 fclose(file);
770}
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800771
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400772void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd)
773{
774 if (!seccomp_should_parse_filters(j))
775 return;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800776
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -0400777 FILE *file = fdopen(fd, "r");
778 if (!file) {
779 pdie("failed to associate stream with fd %d", fd);
780 }
781
782 if (parse_seccomp_filters(j, file) != 0) {
783 die("failed to compile seccomp filter BPF program from fd %d",
784 fd);
785 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400786 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500787}
788
Andrew Brestickereac28942015-11-11 16:04:46 -0800789int API minijail_use_alt_syscall(struct minijail *j, const char *table)
790{
791 j->alt_syscall_table = strdup(table);
792 if (!j->alt_syscall_table)
793 return -ENOMEM;
794 j->flags.alt_syscall = 1;
795 return 0;
796}
797
/*
 * marshal_state: Cursor used while serializing into a caller-supplied
 * buffer.
 */
struct marshal_state {
	/* Bytes that can still be written at |buf|. */
	size_t available;
	/* Sum of all lengths requested so far, written or not. */
	size_t total;
	/* Position in the output buffer where the next write lands. */
	char *buf;
};
803
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800804void marshal_state_init(struct marshal_state *state, char *buf,
805 size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400806{
807 state->available = available;
808 state->buf = buf;
809 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500810}
811
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800812void marshal_append(struct marshal_state *state, void *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400813{
814 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500815
Elly Jonese1749eb2011-10-07 13:54:59 -0400816 /* Up to |available| will be written. */
817 if (copy_len) {
818 memcpy(state->buf, src, copy_len);
819 state->buf += copy_len;
820 state->available -= copy_len;
821 }
822 /* |total| will contain the expected length. */
823 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500824}
825
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400826void marshal_mount(struct marshal_state *state, const struct mountpoint *m)
Dylan Reid81e23972016-05-18 14:06:35 -0700827{
828 marshal_append(state, m->src, strlen(m->src) + 1);
829 marshal_append(state, m->dest, strlen(m->dest) + 1);
830 marshal_append(state, m->type, strlen(m->type) + 1);
831 marshal_append(state, (char *)&m->has_data, sizeof(m->has_data));
832 if (m->has_data)
833 marshal_append(state, m->data, strlen(m->data) + 1);
834 marshal_append(state, (char *)&m->flags, sizeof(m->flags));
835}
836
Will Drewry6ac91122011-10-21 16:38:58 -0500837void minijail_marshal_helper(struct marshal_state *state,
838 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400839{
Dylan Reid648b2202015-10-23 00:50:00 -0700840 struct mountpoint *m = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800841 size_t i;
842
Elly Jonese1749eb2011-10-07 13:54:59 -0400843 marshal_append(state, (char *)j, sizeof(*j));
844 if (j->user)
845 marshal_append(state, j->user, strlen(j->user) + 1);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800846 if (j->suppl_gid_list) {
847 marshal_append(state, j->suppl_gid_list,
848 j->suppl_gid_count * sizeof(gid_t));
849 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400850 if (j->chrootdir)
851 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Andrew Brestickereac28942015-11-11 16:04:46 -0800852 if (j->alt_syscall_table) {
853 marshal_append(state, j->alt_syscall_table,
854 strlen(j->alt_syscall_table) + 1);
855 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800856 if (j->flags.seccomp_filter && j->filter_prog) {
857 struct sock_fprog *fp = j->filter_prog;
858 marshal_append(state, (char *)fp->filter,
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800859 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400860 }
Dylan Reid648b2202015-10-23 00:50:00 -0700861 for (m = j->mounts_head; m; m = m->next) {
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400862 marshal_mount(state, m);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400863 }
Dylan Reid605ce7f2016-01-19 19:21:00 -0800864 for (i = 0; i < j->cgroup_count; ++i)
865 marshal_append(state, j->cgroups[i], strlen(j->cgroups[i]) + 1);
Will Drewryf89aef52011-09-16 16:48:57 -0500866}
867
Will Drewry6ac91122011-10-21 16:38:58 -0500868size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400869{
870 struct marshal_state state;
871 marshal_state_init(&state, NULL, 0);
872 minijail_marshal_helper(&state, j);
873 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500874}
875
Elly Jonese1749eb2011-10-07 13:54:59 -0400876int minijail_marshal(const struct minijail *j, char *buf, size_t available)
877{
878 struct marshal_state state;
879 marshal_state_init(&state, buf, available);
880 minijail_marshal_helper(&state, j);
881 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500882}
883
Elly Jonese1749eb2011-10-07 13:54:59 -0400884int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
885{
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800886 size_t i;
887 size_t count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500888 int ret = -EINVAL;
889
Elly Jonese1749eb2011-10-07 13:54:59 -0400890 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500891 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400892 memcpy((void *)j, serialized, sizeof(*j));
893 serialized += sizeof(*j);
894 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500895
Will Drewrybee7ba72011-10-21 20:47:01 -0500896 /* Potentially stale pointers not used as signals. */
Jorge Lucangeli Obes3b2e6e42016-08-04 12:26:19 -0400897 j->pid_file_path = NULL;
898 j->uidmap = NULL;
899 j->gidmap = NULL;
Dylan Reid648b2202015-10-23 00:50:00 -0700900 j->mounts_head = NULL;
901 j->mounts_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800902 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500903
Elly Jonese1749eb2011-10-07 13:54:59 -0400904 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400905 char *user = consumestr(&serialized, &length);
906 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500907 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400908 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500909 if (!j->user)
910 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400911 }
Will Drewryf89aef52011-09-16 16:48:57 -0500912
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800913 if (j->suppl_gid_list) { /* stale pointer */
914 if (j->suppl_gid_count > NGROUPS_MAX) {
915 goto bad_gid_list;
916 }
917 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
918 void *gid_list_bytes =
919 consumebytes(gid_list_size, &serialized, &length);
920 if (!gid_list_bytes)
921 goto bad_gid_list;
922
923 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
924 if (!j->suppl_gid_list)
925 goto bad_gid_list;
926
927 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
928 }
929
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400930 if (j->chrootdir) { /* stale pointer */
931 char *chrootdir = consumestr(&serialized, &length);
932 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500933 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400934 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500935 if (!j->chrootdir)
936 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400937 }
938
Andrew Brestickereac28942015-11-11 16:04:46 -0800939 if (j->alt_syscall_table) { /* stale pointer */
940 char *alt_syscall_table = consumestr(&serialized, &length);
941 if (!alt_syscall_table)
942 goto bad_syscall_table;
943 j->alt_syscall_table = strdup(alt_syscall_table);
944 if (!j->alt_syscall_table)
945 goto bad_syscall_table;
946 }
947
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800948 if (j->flags.seccomp_filter && j->filter_len > 0) {
949 size_t ninstrs = j->filter_len;
950 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
951 ninstrs > USHRT_MAX)
952 goto bad_filters;
953
954 size_t program_len = ninstrs * sizeof(struct sock_filter);
955 void *program = consumebytes(program_len, &serialized, &length);
956 if (!program)
957 goto bad_filters;
958
959 j->filter_prog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800960 if (!j->filter_prog)
961 goto bad_filters;
962
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800963 j->filter_prog->len = ninstrs;
964 j->filter_prog->filter = malloc(program_len);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -0800965 if (!j->filter_prog->filter)
966 goto bad_filter_prog_instrs;
967
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800968 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400969 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400970
Dylan Reid648b2202015-10-23 00:50:00 -0700971 count = j->mounts_count;
972 j->mounts_count = 0;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400973 for (i = 0; i < count; ++i) {
Dylan Reid648b2202015-10-23 00:50:00 -0700974 unsigned long *flags;
Dylan Reid81e23972016-05-18 14:06:35 -0700975 int *has_data;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400976 const char *dest;
Dylan Reid648b2202015-10-23 00:50:00 -0700977 const char *type;
Dylan Reid81e23972016-05-18 14:06:35 -0700978 const char *data = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400979 const char *src = consumestr(&serialized, &length);
980 if (!src)
Dylan Reid648b2202015-10-23 00:50:00 -0700981 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400982 dest = consumestr(&serialized, &length);
983 if (!dest)
Dylan Reid648b2202015-10-23 00:50:00 -0700984 goto bad_mounts;
985 type = consumestr(&serialized, &length);
986 if (!type)
987 goto bad_mounts;
Dylan Reid81e23972016-05-18 14:06:35 -0700988 has_data = consumebytes(sizeof(*has_data), &serialized,
989 &length);
990 if (!has_data)
991 goto bad_mounts;
992 if (*has_data) {
993 data = consumestr(&serialized, &length);
994 if (!data)
995 goto bad_mounts;
996 }
Dylan Reid648b2202015-10-23 00:50:00 -0700997 flags = consumebytes(sizeof(*flags), &serialized, &length);
998 if (!flags)
999 goto bad_mounts;
Dylan Reid81e23972016-05-18 14:06:35 -07001000 if (minijail_mount_with_data(j, src, dest, type, *flags, data))
Dylan Reid648b2202015-10-23 00:50:00 -07001001 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001002 }
1003
Dylan Reid605ce7f2016-01-19 19:21:00 -08001004 count = j->cgroup_count;
1005 j->cgroup_count = 0;
1006 for (i = 0; i < count; ++i) {
1007 char *cgroup = consumestr(&serialized, &length);
1008 if (!cgroup)
1009 goto bad_cgroups;
1010 j->cgroups[i] = strdup(cgroup);
1011 if (!j->cgroups[i])
1012 goto bad_cgroups;
1013 ++j->cgroup_count;
1014 }
1015
Elly Jonese1749eb2011-10-07 13:54:59 -04001016 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -05001017
Dylan Reid605ce7f2016-01-19 19:21:00 -08001018bad_cgroups:
1019 while (j->mounts_head) {
1020 struct mountpoint *m = j->mounts_head;
1021 j->mounts_head = j->mounts_head->next;
Dylan Reid81e23972016-05-18 14:06:35 -07001022 free(m->data);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001023 free(m->type);
1024 free(m->dest);
1025 free(m->src);
1026 free(m);
1027 }
1028 for (i = 0; i < j->cgroup_count; ++i)
1029 free(j->cgroups[i]);
Dylan Reid648b2202015-10-23 00:50:00 -07001030bad_mounts:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001031 if (j->flags.seccomp_filter && j->filter_len > 0) {
1032 free(j->filter_prog->filter);
1033 free(j->filter_prog);
1034 }
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001035bad_filter_prog_instrs:
1036 if (j->filter_prog)
1037 free(j->filter_prog);
Will Drewrybee7ba72011-10-21 20:47:01 -05001038bad_filters:
Andrew Brestickereac28942015-11-11 16:04:46 -08001039 if (j->alt_syscall_table)
1040 free(j->alt_syscall_table);
1041bad_syscall_table:
Will Drewrybee7ba72011-10-21 20:47:01 -05001042 if (j->chrootdir)
1043 free(j->chrootdir);
1044bad_chrootdir:
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001045 if (j->suppl_gid_list)
1046 free(j->suppl_gid_list);
1047bad_gid_list:
Will Drewrybee7ba72011-10-21 20:47:01 -05001048 if (j->user)
1049 free(j->user);
1050clear_pointers:
1051 j->user = NULL;
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001052 j->suppl_gid_list = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -05001053 j->chrootdir = NULL;
Andrew Brestickereac28942015-11-11 16:04:46 -08001054 j->alt_syscall_table = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -08001055 j->cgroup_count = 0;
Will Drewrybee7ba72011-10-21 20:47:01 -05001056out:
1057 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -05001058}
1059
/*
 * setup_mount_destination: Ensures the mount target exists.
 * Creates it if needed and possible.
 * @source Mount source, used only to decide what kind of target to create
 * @dest   Path of the mount target
 * @uid    Owner to give a newly created target
 * @gid    Group to give a newly created target
 *
 * Returns 0 if @dest already exists or was created and chown()ed, and
 * -errno of the failing call (mkdir, open or chown) otherwise.
 */
static int setup_mount_destination(const char *source, const char *dest,
				   uid_t uid, gid_t gid)
{
	int rc;
	struct stat st_buf;

	rc = stat(dest, &st_buf);
	if (rc == 0) /* destination exists */
		return 0;

	/*
	 * Try to create the destination.
	 * Either make a directory or touch a file depending on the source type.
	 * If the source doesn't exist, assume it is a filesystem type such as
	 * "tmpfs" and create a directory to mount it on.
	 */
	rc = stat(source, &st_buf);
	if (rc || S_ISDIR(st_buf.st_mode) || S_ISBLK(st_buf.st_mode)) {
		if (mkdir(dest, 0700))
			return -errno;
	} else {
		int fd = open(dest, O_RDWR | O_CREAT, 0700);
		if (fd < 0)
			return -errno;
		close(fd);
	}

	/* Report chown() failures as -errno, like every other failure here. */
	if (chown(dest, uid, gid))
		return -errno;
	return 0;
}
1092
1093/*
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001094 * mount_one: Applies mounts from @m for @j, recursing as needed.
Dylan Reid648b2202015-10-23 00:50:00 -07001095 * @j Minijail these mounts are for
1096 * @m Head of list of mounts
Elly Jones51a5b6c2011-10-12 19:09:26 -04001097 *
1098 * Returns 0 for success.
1099 */
Dylan Reid648b2202015-10-23 00:50:00 -07001100static int mount_one(const struct minijail *j, struct mountpoint *m)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001101{
Dylan Reid648b2202015-10-23 00:50:00 -07001102 int ret;
1103 char *dest;
1104 int remount_ro = 0;
1105
Jorge Lucangeli Obes2b12ba42016-01-26 10:37:51 -08001106 /* |dest| has a leading "/". */
Dylan Reid648b2202015-10-23 00:50:00 -07001107 if (asprintf(&dest, "%s%s", j->chrootdir, m->dest) < 0)
Elly Jones51a5b6c2011-10-12 19:09:26 -04001108 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -07001109
Dylan Reideec77962016-06-30 19:35:10 -07001110 if (setup_mount_destination(m->src, dest, j->uid, j->gid))
1111 pdie("creating mount target '%s' failed", dest);
1112
Dylan Reid648b2202015-10-23 00:50:00 -07001113 /*
Jorge Lucangeli Obes2b12ba42016-01-26 10:37:51 -08001114 * R/O bind mounts have to be remounted since 'bind' and 'ro'
1115 * can't both be specified in the original bind mount.
1116 * Remount R/O after the initial mount.
Dylan Reid648b2202015-10-23 00:50:00 -07001117 */
1118 if ((m->flags & MS_BIND) && (m->flags & MS_RDONLY)) {
1119 remount_ro = 1;
1120 m->flags &= ~MS_RDONLY;
Elly Jonesa1059632011-12-15 15:17:07 -05001121 }
Dylan Reid648b2202015-10-23 00:50:00 -07001122
Dylan Reid81e23972016-05-18 14:06:35 -07001123 ret = mount(m->src, dest, m->type, m->flags, m->data);
Dylan Reid648b2202015-10-23 00:50:00 -07001124 if (ret)
1125 pdie("mount: %s -> %s", m->src, dest);
1126
1127 if (remount_ro) {
1128 m->flags |= MS_RDONLY;
1129 ret = mount(m->src, dest, NULL,
Dylan Reid81e23972016-05-18 14:06:35 -07001130 m->flags | MS_REMOUNT, m->data);
Dylan Reid648b2202015-10-23 00:50:00 -07001131 if (ret)
1132 pdie("bind ro: %s -> %s", m->src, dest);
1133 }
1134
Elly Jones51a5b6c2011-10-12 19:09:26 -04001135 free(dest);
Dylan Reid648b2202015-10-23 00:50:00 -07001136 if (m->next)
1137 return mount_one(j, m->next);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001138 return ret;
1139}
1140
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001141static int enter_chroot(const struct minijail *j)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001142{
Elly Jones51a5b6c2011-10-12 19:09:26 -04001143 int ret;
Dylan Reid648b2202015-10-23 00:50:00 -07001144
1145 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
Elly Jones51a5b6c2011-10-12 19:09:26 -04001146 return ret;
1147
1148 if (chroot(j->chrootdir))
1149 return -errno;
1150
1151 if (chdir("/"))
1152 return -errno;
1153
1154 return 0;
1155}
1156
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001157static int enter_pivot_root(const struct minijail *j)
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001158{
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001159 int ret, oldroot, newroot;
Dylan Reid648b2202015-10-23 00:50:00 -07001160
1161 if (j->mounts_head && (ret = mount_one(j, j->mounts_head)))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001162 return ret;
1163
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001164 /*
1165 * Keep the fd for both old and new root.
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001166 * It will be used in fchdir(2) later.
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001167 */
Ricky Zhoubce609d2016-03-02 21:47:56 -08001168 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001169 if (oldroot < 0)
1170 pdie("failed to open / for fchdir");
Ricky Zhoubce609d2016-03-02 21:47:56 -08001171 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001172 if (newroot < 0)
1173 pdie("failed to open %s for fchdir", j->chrootdir);
1174
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001175 /*
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001176 * To ensure j->chrootdir is the root of a filesystem,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001177 * do a self bind mount.
1178 */
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001179 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
1180 pdie("failed to bind mount '%s'", j->chrootdir);
1181 if (chdir(j->chrootdir))
1182 return -errno;
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001183 if (syscall(SYS_pivot_root, ".", "."))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001184 pdie("pivot_root");
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001185
1186 /*
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001187 * Now the old root is mounted on top of the new root. Use fchdir(2) to
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001188 * change to the old root and unmount it.
1189 */
1190 if (fchdir(oldroot))
1191 pdie("failed to fchdir to old /");
Hidehiko Abe097b7192016-03-16 18:00:36 +09001192
1193 /*
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07001194 * If j->flags.skip_remount_private was enabled for minijail_enter(),
1195 * there could be a shared mount point under |oldroot|. In that case,
1196 * mounts under this shared mount point will be unmounted below, and
1197 * this unmounting will propagate to the original mount namespace
1198 * (because the mount point is shared). To prevent this unexpected
1199 * unmounting, remove these mounts from their peer groups by recursively
1200 * remounting them as MS_PRIVATE.
Hidehiko Abe097b7192016-03-16 18:00:36 +09001201 */
1202 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07001203 pdie("failed to mount(/, private) before umount(/)");
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001204 /* The old root might be busy, so use lazy unmount. */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08001205 if (umount2(".", MNT_DETACH))
1206 pdie("umount(/)");
1207 /* Change back to the new root. */
1208 if (fchdir(newroot))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001209 return -errno;
Ricky Zhoubce609d2016-03-02 21:47:56 -08001210 if (close(oldroot))
1211 return -errno;
1212 if (close(newroot))
1213 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001214 if (chroot("/"))
1215 return -errno;
Jorge Lucangeli Obes46a55092015-10-12 15:31:59 -07001216 /* Set correct CWD for getcwd(3). */
1217 if (chdir("/"))
1218 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001219
1220 return 0;
1221}
1222
/*
 * mount_tmp: Mounts a tmpfs on /tmp with options "size=64M,mode=777".
 *
 * Returns the result of mount(2).
 */
static int mount_tmp(void)
{
	const unsigned long no_flags = 0;

	return mount("none", "/tmp", "tmpfs", no_flags, "size=64M,mode=777");
}
1227
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001228static int remount_proc_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001229{
1230 const char *kProcPath = "/proc";
1231 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -05001232 /*
1233 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -04001234 * /proc in our namespace, which means using MS_REMOUNT here would
1235 * mutate our parent's mount as well, even though we're in a VFS
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07001236 * namespace (!). Instead, remove their mount from our namespace lazily
1237 * (MNT_DETACH) and make our own.
Elly Jonese1749eb2011-10-07 13:54:59 -04001238 */
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07001239 if (umount2(kProcPath, MNT_DETACH)) {
1240 /*
1241 * If we are in a new user namespace, umount(2) will fail.
1242 * See http://man7.org/linux/man-pages/man7/user_namespaces.7.html
1243 */
1244 if (j->flags.userns) {
1245 info("umount(/proc, MNT_DETACH) failed, "
1246 "this is expected when using user namespaces");
1247 } else {
1248 return -errno;
1249 }
1250 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001251 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
1252 return -errno;
1253 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -04001254}
1255
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001256static void kill_child_and_die(const struct minijail *j, const char *msg)
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001257{
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001258 kill(j->initpid, SIGKILL);
1259 die("%s", msg);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001260}
1261
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001262static void write_pid_file_or_die(const struct minijail *j)
Dylan Reid605ce7f2016-01-19 19:21:00 -08001263{
Keshav Santhanamdb6dab42016-08-10 16:33:34 -07001264 if (write_pid_to_path(j->initpid, j->pid_file_path))
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001265 kill_child_and_die(j, "failed to write pid file");
Dylan Reid605ce7f2016-01-19 19:21:00 -08001266}
1267
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001268static void add_to_cgroups_or_die(const struct minijail *j)
Dylan Reid605ce7f2016-01-19 19:21:00 -08001269{
1270 size_t i;
1271
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001272 for (i = 0; i < j->cgroup_count; ++i) {
Keshav Santhanamdb6dab42016-08-10 16:33:34 -07001273 if (write_pid_to_path(j->initpid, j->cgroups[i]))
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001274 kill_child_and_die(j, "failed to add to cgroups");
1275 }
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001276}
1277
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001278static void write_ugid_maps_or_die(const struct minijail *j)
1279{
1280 if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0)
1281 kill_child_and_die(j, "failed to write uid_map");
Jorge Lucangeli Obes200299c2016-09-23 15:21:57 -04001282 if (j->gidmap && j->flags.disable_setgroups &&
1283 write_proc_file(j->initpid, "deny", "setgroups") != 0)
1284 kill_child_and_die(j, "failed to disable setgroups(2)");
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001285 if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0)
1286 kill_child_and_die(j, "failed to write gid_map");
1287}
1288
1289static void enter_user_namespace(const struct minijail *j)
1290{
1291 if (j->uidmap && setresuid(0, 0, 0))
1292 pdie("user_namespaces: setresuid(0, 0, 0) failed");
1293 if (j->gidmap && setresgid(0, 0, 0))
1294 pdie("user_namespaces: setresgid(0, 0, 0) failed");
1295}
1296
/*
 * parent_setup_complete: Closes both descriptors stored in @pipe_fds,
 * index 0 first and then index 1.
 */
static void parent_setup_complete(int *pipe_fds)
{
	int end;

	for (end = 0; end < 2; ++end)
		close(pipe_fds[end]);
}
1302
/*
 * wait_for_parent_setup: Called by the child process to block until any
 * further parent-side setup is finished.
 *
 * Closes pipe_fds[1], then reads from pipe_fds[0]. Only a read() result of
 * 0 counts as a successful sync; anything else is fatal (die).
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char byte;
	ssize_t nread;

	close(pipe_fds[1]);

	/* Wait for parent to complete setup and close the pipe. */
	nread = read(pipe_fds[0], &byte, 1);
	if (nread != 0) {
		die("failed to sync with parent");
	}

	close(pipe_fds[0]);
}
1318
1319static void drop_ugid(const struct minijail *j)
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001320{
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001321 if (j->flags.usergroups && j->flags.suppl_gids) {
1322 die("tried to inherit *and* set supplementary groups;"
1323 " can only do one");
1324 }
1325
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001326 if (j->flags.usergroups) {
1327 if (initgroups(j->user, j->usergid))
1328 pdie("initgroups");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001329 } else if (j->flags.suppl_gids) {
1330 if (setgroups(j->suppl_gid_count, j->suppl_gid_list)) {
1331 pdie("setgroups");
1332 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001333 } else {
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001334 /*
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08001335 * Only attempt to clear supplementary groups if we are changing
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001336 * users.
1337 */
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001338 if ((j->uid || j->gid) && setgroups(0, NULL))
1339 pdie("setgroups");
1340 }
1341
1342 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
1343 pdie("setresgid");
1344
1345 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
1346 pdie("setresuid");
1347}
1348
/*
 * get_last_valid_cap: Returns the highest capability number the running
 * kernel supports.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we suck up the answer via /proc. On Android, not all processes
 * are guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Failing to open or parse the /proc file is fatal (pdie).
 */
static unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;

	if (is_android()) {
		while (prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0)
			++last_valid_cap;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");

		/* fscanf()/fclose() must never see a NULL stream. */
		if (!fp) {
			pdie("fopen(%s)", cap_file);
		}
		if (fscanf(fp, "%u", &last_valid_cap) != 1) {
			pdie("fscanf(%s)", cap_file);
		}
		fclose(fp);
	}
	return last_valid_cap;
}
1379
/*
 * drop_capbset: Drops from the capability bounding set every capability
 * numbered 0..@last_valid_cap whose bit is not set in @keep_mask.
 * Capabilities beyond the width of @keep_mask are not touched. A failed
 * drop is fatal (pdie).
 */
static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
{
	const size_t mask_bits = sizeof(keep_mask) * 8;
	unsigned int cap;

	for (cap = 0; cap < mask_bits && cap <= last_valid_cap; ++cap) {
		const uint64_t cap_bit = (uint64_t)1 << cap;

		if ((keep_mask & cap_bit) != 0)
			continue;
		if (prctl(PR_CAPBSET_DROP, cap)) {
			pdie("could not drop capability from bounding set");
		}
	}
}
1391
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001392static void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
Elly Jonese1749eb2011-10-07 13:54:59 -04001393{
Jorge Lucangeli Obes7ea269e2016-02-26 22:07:09 -08001394 if (!j->flags.use_caps)
1395 return;
1396
Elly Jonese1749eb2011-10-07 13:54:59 -04001397 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -08001398 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -08001399 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -04001400 unsigned int i;
1401 if (!caps)
1402 die("can't get process caps");
1403 if (cap_clear_flag(caps, CAP_INHERITABLE))
1404 die("can't clear inheritable caps");
1405 if (cap_clear_flag(caps, CAP_EFFECTIVE))
1406 die("can't clear effective caps");
1407 if (cap_clear_flag(caps, CAP_PERMITTED))
1408 die("can't clear permitted caps");
Dylan Reidf682d472015-09-17 21:39:07 -07001409 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cook323878a2013-02-05 15:35:24 -08001410 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001411 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -04001412 continue;
Kees Cook323878a2013-02-05 15:35:24 -08001413 flag[0] = i;
1414 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001415 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -08001416 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001417 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -08001418 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04001419 die("can't add inheritable cap");
1420 }
1421 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -08001422 die("can't apply initial cleaned capset");
1423
1424 /*
1425 * Instead of dropping bounding set first, do it here in case
1426 * the caller had a more permissive bounding set which could
1427 * have been used above to raise a capability that wasn't already
1428 * present. This requires CAP_SETPCAP, so we raised/kept it above.
1429 */
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08001430 drop_capbset(j->caps, last_valid_cap);
Kees Cook323878a2013-02-05 15:35:24 -08001431
1432 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -08001433 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -08001434 flag[0] = CAP_SETPCAP;
1435 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
1436 die("can't clear effective cap");
1437 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
1438 die("can't clear permitted cap");
1439 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
1440 die("can't clear inheritable cap");
1441 }
1442
1443 if (cap_set_proc(caps))
1444 die("can't apply final cleaned capset");
1445
1446 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -04001447}
1448
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04001449static void set_seccomp_filter(const struct minijail *j)
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001450{
1451 /*
1452 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
1453 * in the kernel source tree for an explanation of the parameters.
1454 */
1455 if (j->flags.no_new_privs) {
1456 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
1457 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1458 }
1459
1460 /*
Jorge Lucangeli Obes2413f372016-04-06 18:43:10 -07001461 * Code running with ASan
1462 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
1463 * will make system calls not included in the syscall filter policy,
1464 * which will likely crash the program. Skip setting seccomp filter in
1465 * that case.
1466 * 'running_with_asan()' has no inputs and is completely defined at
1467 * build time, so this cannot be used by an attacker to skip setting
1468 * seccomp filter.
1469 */
1470 if (j->flags.seccomp_filter && running_with_asan()) {
1471 warn("running with ASan, not setting seccomp filter");
1472 return;
1473 }
1474
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04001475 if (j->flags.seccomp_filter) {
1476 if (j->flags.seccomp_filter_logging) {
1477 /*
1478 * If logging seccomp filter failures,
1479 * install the SIGSYS handler first.
1480 */
1481 if (install_sigsys_handler())
1482 pdie("failed to install SIGSYS handler");
1483 warn("logging seccomp filter failures");
1484 } else if (j->flags.seccomp_filter_tsync) {
1485 /*
1486 * If setting thread sync,
1487 * reset the SIGSYS signal handler so that
1488 * the entire thread group is killed.
1489 */
1490 if (signal(SIGSYS, SIG_DFL) == SIG_ERR)
1491 pdie("failed to reset SIGSYS disposition");
1492 info("reset SIGSYS disposition");
1493 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001494 }
1495
1496 /*
1497 * Install the syscall filter.
1498 */
1499 if (j->flags.seccomp_filter) {
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001500 if (j->flags.seccomp_filter_tsync) {
1501 if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
1502 SECCOMP_FILTER_FLAG_TSYNC,
1503 j->filter_prog)) {
1504 pdie("seccomp(tsync) failed");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001505 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001506 } else {
1507 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
1508 j->filter_prog)) {
1509 pdie("prctl(seccomp_filter) failed");
1510 }
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001511 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001512 }
1513}
1514
Will Drewry6ac91122011-10-21 16:38:58 -05001515void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001516{
Dylan Reidf682d472015-09-17 21:39:07 -07001517 /*
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001518 * If we're dropping caps, get the last valid cap from /proc now,
1519 * since /proc can be unmounted before drop_caps() is called.
Dylan Reidf682d472015-09-17 21:39:07 -07001520 */
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001521 unsigned int last_valid_cap = 0;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08001522 if (j->flags.capbset_drop || j->flags.use_caps)
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08001523 last_valid_cap = get_last_valid_cap();
Dylan Reidf682d472015-09-17 21:39:07 -07001524
Elly Jonese1749eb2011-10-07 13:54:59 -04001525 if (j->flags.pids)
1526 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001527 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -04001528
Elly Jonese1749eb2011-10-07 13:54:59 -04001529 if (j->flags.usergroups && !j->user)
1530 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -04001531
Elly Jonesdd3e8512012-01-23 15:13:38 -05001532 /*
1533 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -04001534 * so we don't even try. If any of our operations fail, we abort() the
1535 * entire process.
1536 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001537 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1538 pdie("setns(CLONE_NEWNS)");
1539
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001540 if (j->flags.vfs) {
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001541 if (unshare(CLONE_NEWNS))
1542 pdie("unshare(vfs)");
1543 /*
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08001544 * Unless asked not to, remount all filesystems as private.
1545 * If they are shared, new bind mounts will creep out of our
1546 * namespace.
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001547 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1548 */
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08001549 if (!j->flags.skip_remount_private) {
1550 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1551 pdie("mount(/, private)");
1552 }
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08001553 }
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001554
Dylan Reidf7942472015-11-18 17:55:26 -08001555 if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
1556 pdie("unshare(ipc)");
1557 }
1558
Dylan Reid1102f5a2015-09-15 11:52:20 -07001559 if (j->flags.enter_net) {
1560 if (setns(j->netns_fd, CLONE_NEWNET))
1561 pdie("setns(CLONE_NEWNET)");
1562 } else if (j->flags.net && unshare(CLONE_NEWNET)) {
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001563 pdie("unshare(net)");
Dylan Reid1102f5a2015-09-15 11:52:20 -07001564 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001565
Dylan Reid4cbc2a52016-06-17 19:06:07 -07001566 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
1567 pdie("unshare(cgroups)");
1568
Elly Jones51a5b6c2011-10-12 19:09:26 -04001569 if (j->flags.chroot && enter_chroot(j))
1570 pdie("chroot");
1571
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001572 if (j->flags.pivot_root && enter_pivot_root(j))
1573 pdie("pivot_root");
1574
Jorge Lucangeli Obes3901da62015-03-03 13:55:11 -08001575 if (j->flags.mount_tmp && mount_tmp())
Lee Campbell11af0622014-05-22 12:36:04 -07001576 pdie("mount_tmp");
1577
Dylan Reid791f5772015-09-14 20:02:42 -07001578 if (j->flags.remount_proc_ro && remount_proc_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -04001579 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -04001580
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08001581 /*
1582 * If we're only dropping capabilities from the bounding set, but not
1583 * from the thread's (permitted|inheritable|effective) sets, do it now.
1584 */
1585 if (j->flags.capbset_drop) {
1586 drop_capbset(j->cap_bset, last_valid_cap);
1587 }
1588
1589 if (j->flags.use_caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001590 /*
1591 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -04001592 * capability to change uids, our attempt to use setuid()
1593 * below will fail. Hang on to root caps across setuid(), then
1594 * lock securebits.
1595 */
1596 if (prctl(PR_SET_KEEPCAPS, 1))
1597 pdie("prctl(PR_SET_KEEPCAPS)");
Jorge Lucangeli Obesf783b522016-03-14 14:34:10 -07001598
1599 /*
1600 * Kernels 4.3+ define a new securebit
1601 * (SECURE_NO_CAP_AMBIENT_RAISE), so using the SECURE_ALL_BITS
1602 * and SECURE_ALL_LOCKS masks from newer kernel headers will
1603 * return EPERM on older kernels. Detect this, and retry with
1604 * the right mask for older (2.6.26-4.2) kernels.
1605 */
1606 int securebits_ret = prctl(PR_SET_SECUREBITS,
1607 SECURE_ALL_BITS | SECURE_ALL_LOCKS);
1608 if (securebits_ret < 0) {
1609 if (errno == EPERM) {
1610 /* Possibly running on kernel < 4.3. */
1611 securebits_ret = prctl(
1612 PR_SET_SECUREBITS,
1613 OLD_SECURE_ALL_BITS | OLD_SECURE_ALL_LOCKS);
1614 }
1615 }
1616 if (securebits_ret < 0)
Elly Jonese1749eb2011-10-07 13:54:59 -04001617 pdie("prctl(PR_SET_SECUREBITS)");
1618 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001619
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001620 if (j->flags.no_new_privs) {
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08001621 /*
1622 * If we're setting no_new_privs, we can drop privileges
1623 * before setting seccomp filter. This way filter policies
1624 * don't need to allow privilege-dropping syscalls.
1625 */
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001626 drop_ugid(j);
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08001627 drop_caps(j, last_valid_cap);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001628 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04001629 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001630 /*
1631 * If we're not setting no_new_privs,
1632 * we need to set seccomp filter *before* dropping privileges.
1633 * WARNING: this means that filter policies *must* allow
1634 * setgroups()/setresgid()/setresuid() for dropping root and
1635 * capget()/capset()/prctl() for dropping caps.
1636 */
1637 set_seccomp_filter(j);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001638 drop_ugid(j);
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08001639 drop_caps(j, last_valid_cap);
Elly Jonese1749eb2011-10-07 13:54:59 -04001640 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001641
Elly Jonesdd3e8512012-01-23 15:13:38 -05001642 /*
Andrew Brestickereac28942015-11-11 16:04:46 -08001643 * Select the specified alternate syscall table. The table must not
1644 * block prctl(2) if we're using seccomp as well.
1645 */
1646 if (j->flags.alt_syscall) {
1647 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
1648 pdie("prctl(PR_ALT_SYSCALL)");
1649 }
1650
1651 /*
Elly Jonesdd3e8512012-01-23 15:13:38 -05001652 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -04001653 * privilege-dropping syscalls :)
1654 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001655 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -04001656 if ((errno == EINVAL) && seccomp_can_softfail()) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001657 warn("seccomp not supported");
1658 return;
1659 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001660 pdie("prctl(PR_SET_SECCOMP)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001661 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001662}
1663
/*
 * Exit status used by init_term(); init() stores the jailed root process'
 * wait status here.
 */
/* TODO(wad): will visibility affect this variable? */
static int init_exitstatus = 0;

/*
 * SIGTERM handler installed by init(): exits immediately with the status
 * recorded so far. The signal number is ignored.
 */
void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
1671
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04001672void init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -04001673{
1674 pid_t pid;
1675 int status;
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04001676 /* So that we exit with the right status. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001677 signal(SIGTERM, init_term);
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04001678 /* TODO(wad): self jail with seccomp filters here. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001679 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001680 /*
1681 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -04001682 * left inside our pid namespace or we get a signal.
1683 */
1684 if (pid == rootpid)
1685 init_exitstatus = status;
1686 }
1687 if (!WIFEXITED(init_exitstatus))
1688 _exit(MINIJAIL_ERR_INIT);
1689 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -04001690}
1691
Will Drewry6ac91122011-10-21 16:38:58 -05001692int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001693{
1694 size_t sz = 0;
1695 size_t bytes = read(fd, &sz, sizeof(sz));
1696 char *buf;
1697 int r;
1698 if (sizeof(sz) != bytes)
1699 return -EINVAL;
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001700 if (sz > USHRT_MAX) /* arbitrary sanity check */
Elly Jonese1749eb2011-10-07 13:54:59 -04001701 return -E2BIG;
1702 buf = malloc(sz);
1703 if (!buf)
1704 return -ENOMEM;
1705 bytes = read(fd, buf, sz);
1706 if (bytes != sz) {
1707 free(buf);
1708 return -EINVAL;
1709 }
1710 r = minijail_unmarshal(j, buf, sz);
1711 free(buf);
1712 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001713}
1714
Will Drewry6ac91122011-10-21 16:38:58 -05001715int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04001716{
1717 char *buf;
1718 size_t sz = minijail_size(j);
1719 ssize_t written;
1720 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -04001721
Elly Jonese1749eb2011-10-07 13:54:59 -04001722 if (!sz)
1723 return -EINVAL;
1724 buf = malloc(sz);
1725 r = minijail_marshal(j, buf, sz);
1726 if (r) {
1727 free(buf);
1728 return r;
1729 }
1730 /* Sends [size][minijail]. */
1731 written = write(fd, &sz, sizeof(sz));
1732 if (written != sizeof(sz)) {
1733 free(buf);
1734 return -EFAULT;
1735 }
1736 written = write(fd, buf, sz);
1737 if (written < 0 || (size_t) written != sz) {
1738 free(buf);
1739 return -EFAULT;
1740 }
1741 free(buf);
1742 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001743}
Elly Jonescd7a9042011-07-22 13:56:51 -04001744
Will Drewry6ac91122011-10-21 16:38:58 -05001745int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -04001746{
Daniel Erat5b7a3182015-08-19 16:06:22 -06001747#if defined(__ANDROID__)
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001748 /* Don't use LDPRELOAD on Brillo. */
1749 return 0;
1750#else
Elly Jonese1749eb2011-10-07 13:54:59 -04001751 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1752 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1753 if (!newenv)
1754 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -04001755
Elly Jonese1749eb2011-10-07 13:54:59 -04001756 /* Only insert a separating space if we have something to separate... */
1757 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1758 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -04001759
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001760 /* setenv() makes a copy of the string we give it. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001761 setenv(kLdPreloadEnvVar, newenv, 1);
1762 free(newenv);
1763 return 0;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001764#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04001765}
1766
Will Drewry6ac91122011-10-21 16:38:58 -05001767int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04001768{
1769 int r = pipe(fds);
1770 char fd_buf[11];
1771 if (r)
1772 return r;
1773 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1774 if (r <= 0)
1775 return -EINVAL;
1776 setenv(kFdEnvVar, fd_buf, 1);
1777 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -05001778}
1779
/*
 * Keeps end |index| (0 or 1) of the pipe |fds| and closes the other end.
 * Returns the descriptor that was kept, or -1 if |index| is not 0 or 1.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other_end;

	if (index != 0 && index != 1)
		return -1;

	other_end = (index == 0) ? 1 : 0;
	close(fds[other_end]);
	return fds[index];
}
1788
/*
 * Closes the end of pipe |fds| that is not |index| (0 or 1), then duplicates
 * end |index| onto descriptor |fd|.
 * Returns the result of dup2(), or -1 if |index| is not 0 or 1.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other_end;

	if (index != 0 && index != 1)
		return -1;

	other_end = (index == 0) ? 1 : 0;
	close(fds[other_end]);

	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
1798
/*
 * Forward declaration: the minijail_run*() entry points below are defined
 * before the function they all delegate to.
 */
int minijail_run_internal(struct minijail *j, const char *filename,
			  char *const argv[], pid_t *pchild_pid,
			  int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
			  int use_preload);
1803
Will Drewry6ac91122011-10-21 16:38:58 -05001804int API minijail_run(struct minijail *j, const char *filename,
1805 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04001806{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001807 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1808 true);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001809}
1810
1811int API minijail_run_pid(struct minijail *j, const char *filename,
1812 char *const argv[], pid_t *pchild_pid)
1813{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001814 return minijail_run_internal(j, filename, argv, pchild_pid,
1815 NULL, NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001816}
1817
1818int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001819 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001820{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001821 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1822 NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001823}
1824
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001825int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001826 char *const argv[], pid_t *pchild_pid,
1827 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001828{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001829 return minijail_run_internal(j, filename, argv, pchild_pid,
1830 pstdin_fd, pstdout_fd, pstderr_fd, true);
1831}
1832
1833int API minijail_run_no_preload(struct minijail *j, const char *filename,
1834 char *const argv[])
1835{
1836 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1837 false);
1838}
1839
Samuel Tan63187f42015-10-16 13:01:53 -07001840int API minijail_run_pid_pipes_no_preload(struct minijail *j,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001841 const char *filename,
1842 char *const argv[],
Samuel Tan63187f42015-10-16 13:01:53 -07001843 pid_t *pchild_pid,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08001844 int *pstdin_fd, int *pstdout_fd,
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08001845 int *pstderr_fd)
1846{
Samuel Tan63187f42015-10-16 13:01:53 -07001847 return minijail_run_internal(j, filename, argv, pchild_pid,
1848 pstdin_fd, pstdout_fd, pstderr_fd, false);
1849}
1850
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001851int minijail_run_internal(struct minijail *j, const char *filename,
1852 char *const argv[], pid_t *pchild_pid,
1853 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1854 int use_preload)
1855{
Elly Jonese1749eb2011-10-07 13:54:59 -04001856 char *oldenv, *oldenv_copy = NULL;
1857 pid_t child_pid;
1858 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001859 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001860 int stdout_fds[2];
1861 int stderr_fds[2];
Dylan Reidce5b55e2016-01-13 11:04:16 -08001862 int child_sync_pipe_fds[2];
1863 int sync_child = 0;
Elly Jonese1749eb2011-10-07 13:54:59 -04001864 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001865 /* We need to remember this across the minijail_preexec() call. */
1866 int pid_namespace = j->flags.pids;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001867 int do_init = j->flags.do_init;
Ben Chan541c7e52011-08-26 14:55:53 -07001868
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001869 if (use_preload) {
1870 oldenv = getenv(kLdPreloadEnvVar);
1871 if (oldenv) {
1872 oldenv_copy = strdup(oldenv);
1873 if (!oldenv_copy)
1874 return -ENOMEM;
1875 }
1876
1877 if (setup_preload())
1878 return -EFAULT;
Elly Jonese1749eb2011-10-07 13:54:59 -04001879 }
Will Drewryf89aef52011-09-16 16:48:57 -05001880
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001881 if (!use_preload) {
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04001882 if (j->flags.use_caps && j->caps != 0)
1883 die("non-empty capabilities are not supported without LD_PRELOAD");
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001884 }
Will Drewry2f54b6a2011-09-16 13:45:31 -05001885
Elly Jonesdd3e8512012-01-23 15:13:38 -05001886 /*
Jorge Lucangeli Obes3c84df12015-05-14 17:37:58 -07001887 * Make the process group ID of this process equal to its PID, so that
1888 * both the Minijail process and the jailed process can be killed
1889 * together.
1890 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1891 * the process is already a process group leader.
1892 */
1893 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1894 if (errno != EPERM) {
1895 pdie("setpgid(0, 0)");
1896 }
1897 }
1898
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001899 if (use_preload) {
1900 /*
1901 * Before we fork(2) and execve(2) the child process, we need
1902 * to open a pipe(2) to send the minijail configuration over.
1903 */
1904 if (setup_pipe(pipe_fds))
1905 return -EFAULT;
1906 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001907
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001908 /*
1909 * If we want to write to the child process' standard input,
1910 * create the pipe(2) now.
1911 */
1912 if (pstdin_fd) {
1913 if (pipe(stdin_fds))
1914 return -EFAULT;
1915 }
1916
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001917 /*
1918 * If we want to read from the child process' standard output,
1919 * create the pipe(2) now.
1920 */
1921 if (pstdout_fd) {
1922 if (pipe(stdout_fds))
1923 return -EFAULT;
1924 }
1925
1926 /*
1927 * If we want to read from the child process' standard error,
1928 * create the pipe(2) now.
1929 */
1930 if (pstderr_fd) {
1931 if (pipe(stderr_fds))
1932 return -EFAULT;
1933 }
1934
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001935 /*
Jorge Lucangeli Obesab6fa6f2016-08-04 15:42:48 -04001936 * If we want to set up a new uid/gid map in the user namespace,
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08001937 * or if we need to add the child process to cgroups, create the pipe(2)
1938 * to sync between parent and child.
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001939 */
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08001940 if (j->flags.userns || j->flags.cgroups) {
Dylan Reidce5b55e2016-01-13 11:04:16 -08001941 sync_child = 1;
1942 if (pipe(child_sync_pipe_fds))
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001943 return -EFAULT;
1944 }
1945
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001946 /*
1947 * Use sys_clone() if and only if we're creating a pid namespace.
Elly Jones761b7412012-06-13 15:49:52 -04001948 *
1949 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1950 *
1951 * In multithreaded programs, there are a bunch of locks inside libc,
1952 * some of which may be held by other threads at the time that we call
1953 * minijail_run_pid(). If we call fork(), glibc does its level best to
1954 * ensure that we hold all of these locks before it calls clone()
1955 * internally and drop them after clone() returns, but when we call
1956 * sys_clone(2) directly, all that gets bypassed and we end up with a
1957 * child address space where some of libc's important locks are held by
1958 * other threads (which did not get cloned, and hence will never release
1959 * those locks). This is okay so long as we call exec() immediately
1960 * after, but a bunch of seemingly-innocent libc functions like setenv()
1961 * take locks.
1962 *
1963 * Hence, only call sys_clone() if we need to, in order to get at pid
1964 * namespacing. If we follow this path, the child's address space might
1965 * have broken locks; you may only call functions that do not acquire
1966 * any locks.
1967 *
1968 * Unfortunately, fork() acquires every lock it can get its hands on, as
1969 * previously detailed, so this function is highly likely to deadlock
1970 * later on (see "deadlock here") if we're multithreaded.
1971 *
1972 * We might hack around this by having the clone()d child (init of the
1973 * pid namespace) return directly, rather than leaving the clone()d
1974 * process hanging around to be init for the new namespace (and having
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08001975 * its fork()ed child return in turn), but that process would be
1976 * crippled with its libc locks potentially broken. We might try
1977 * fork()ing in the parent before we clone() to ensure that we own all
1978 * the locks, but then we have to have the forked child hanging around
1979 * consuming resources (and possibly having file descriptors / shared
1980 * memory regions / etc attached). We'd need to keep the child around to
1981 * avoid having its children get reparented to init.
Elly Jones761b7412012-06-13 15:49:52 -04001982 *
1983 * TODO(ellyjones): figure out if the "forked child hanging around"
1984 * problem is fixable or not. It would be nice if we worked in this
1985 * case.
1986 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001987 if (pid_namespace) {
1988 int clone_flags = CLONE_NEWPID | SIGCHLD;
1989 if (j->flags.userns)
1990 clone_flags |= CLONE_NEWUSER;
1991 child_pid = syscall(SYS_clone, clone_flags, NULL);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001992 } else {
Elly Jones761b7412012-06-13 15:49:52 -04001993 child_pid = fork();
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001994 }
Elly Jones761b7412012-06-13 15:49:52 -04001995
Elly Jonese1749eb2011-10-07 13:54:59 -04001996 if (child_pid < 0) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001997 if (use_preload) {
1998 free(oldenv_copy);
1999 }
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07002000 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04002001 }
Will Drewryf89aef52011-09-16 16:48:57 -05002002
Elly Jonese1749eb2011-10-07 13:54:59 -04002003 if (child_pid) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002004 if (use_preload) {
2005 /* Restore parent's LD_PRELOAD. */
2006 if (oldenv_copy) {
2007 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
2008 free(oldenv_copy);
2009 } else {
2010 unsetenv(kLdPreloadEnvVar);
2011 }
2012 unsetenv(kFdEnvVar);
Elly Jonese1749eb2011-10-07 13:54:59 -04002013 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002014
Elly Jonese1749eb2011-10-07 13:54:59 -04002015 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002016
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08002017 if (j->flags.pid_file)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002018 write_pid_file_or_die(j);
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08002019
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08002020 if (j->flags.cgroups)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002021 add_to_cgroups_or_die(j);
Dylan Reid605ce7f2016-01-19 19:21:00 -08002022
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002023 if (j->flags.userns)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002024 write_ugid_maps_or_die(j);
Dylan Reidce5b55e2016-01-13 11:04:16 -08002025
2026 if (sync_child)
2027 parent_setup_complete(child_sync_pipe_fds);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002028
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002029 if (use_preload) {
2030 /* Send marshalled minijail. */
2031 close(pipe_fds[0]); /* read endpoint */
2032 ret = minijail_to_fd(j, pipe_fds[1]);
2033 close(pipe_fds[1]); /* write endpoint */
2034 if (ret) {
2035 kill(j->initpid, SIGKILL);
2036 die("failed to send marshalled minijail");
2037 }
Elly Jonese1749eb2011-10-07 13:54:59 -04002038 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002039
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07002040 if (pchild_pid)
2041 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002042
2043 /*
2044 * If we want to write to the child process' standard input,
2045 * set up the write end of the pipe.
2046 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002047 if (pstdin_fd)
2048 *pstdin_fd = setup_pipe_end(stdin_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002049 1 /* write end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002050
2051 /*
2052 * If we want to read from the child process' standard output,
2053 * set up the read end of the pipe.
2054 */
2055 if (pstdout_fd)
2056 *pstdout_fd = setup_pipe_end(stdout_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002057 0 /* read end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002058
2059 /*
2060 * If we want to read from the child process' standard error,
2061 * set up the read end of the pipe.
2062 */
2063 if (pstderr_fd)
2064 *pstderr_fd = setup_pipe_end(stderr_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002065 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002066
Elly Jonese1749eb2011-10-07 13:54:59 -04002067 return 0;
2068 }
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002069 /* Child process. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002070 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07002071
Peter Qiu2860c462015-12-16 15:13:06 -08002072 if (j->flags.reset_signal_mask) {
2073 sigset_t signal_mask;
2074 if (sigemptyset(&signal_mask) != 0)
2075 pdie("sigemptyset failed");
2076 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
2077 pdie("sigprocmask failed");
2078 }
2079
Dylan Reidce5b55e2016-01-13 11:04:16 -08002080 if (sync_child)
2081 wait_for_parent_setup(child_sync_pipe_fds);
2082
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002083 if (j->flags.userns)
Dylan Reidce5b55e2016-01-13 11:04:16 -08002084 enter_user_namespace(j);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08002085
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002086 /*
2087 * If we want to write to the jailed process' standard input,
2088 * set up the read end of the pipe.
2089 */
2090 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002091 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
2092 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07002093 die("failed to set up stdin pipe");
2094 }
2095
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08002096 /*
2097 * If we want to read from the jailed process' standard output,
2098 * set up the write end of the pipe.
2099 */
2100 if (pstdout_fd) {
2101 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
2102 STDOUT_FILENO) < 0)
2103 die("failed to set up stdout pipe");
2104 }
2105
2106 /*
2107 * If we want to read from the jailed process' standard error,
2108 * set up the write end of the pipe.
2109 */
2110 if (pstderr_fd) {
2111 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
2112 STDERR_FILENO) < 0)
2113 die("failed to set up stderr pipe");
2114 }
2115
Dylan Reid791f5772015-09-14 20:02:42 -07002116 /* If running an init program, let it decide when/how to mount /proc. */
2117 if (pid_namespace && !do_init)
2118 j->flags.remount_proc_ro = 0;
2119
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002120 if (use_preload) {
2121 /* Strip out flags that cannot be inherited across execve(2). */
2122 minijail_preexec(j);
2123 } else {
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002124 /*
2125 * If not using LD_PRELOAD, do all jailing before execve(2).
2126 * Note that PID namespaces can only be entered on fork(2),
2127 * so that flag is still cleared.
2128 */
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002129 j->flags.pids = 0;
2130 }
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002131 /* Jail this process, then execve(2) the target. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002132 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04002133
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08002134 if (pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05002135 /*
2136 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08002137 * namespace. We don't want all programs we might exec to have
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002138 * to know how to be init. Normally (do_init == 1) we fork off
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08002139 * a child to actually run the program. If |do_init == 0|, we
2140 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04002141 *
2142 * If we're multithreaded, we'll probably deadlock here. See
2143 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04002144 */
2145 child_pid = fork();
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04002146 if (child_pid < 0) {
Elly Jonese1749eb2011-10-07 13:54:59 -04002147 _exit(child_pid);
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04002148 } else if (child_pid > 0) {
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04002149 /*
2150 * Best effort. Don't bother checking the return value.
2151 */
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04002152 prctl(PR_SET_NAME, "minijail-init");
2153 init(child_pid); /* Never returns. */
2154 }
Elly Jonese1749eb2011-10-07 13:54:59 -04002155 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002156
Elly Jonesdd3e8512012-01-23 15:13:38 -05002157 /*
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07002158 * If we aren't pid-namespaced, or the jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04002159 * calling process
2160 * -> execve()-ing process
2161 * If we are:
2162 * calling process
2163 * -> init()-ing process
2164 * -> execve()-ing process
2165 */
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04002166 ret = execve(filename, argv, environ);
2167 if (ret == -1) {
2168 pwarn("execve(%s) failed", filename);
2169 }
2170 _exit(ret);
Elly Jonescd7a9042011-07-22 13:56:51 -04002171}
2172
Will Drewry6ac91122011-10-21 16:38:58 -05002173int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002174{
2175 int st;
2176 if (kill(j->initpid, SIGTERM))
2177 return -errno;
2178 if (waitpid(j->initpid, &st, 0) < 0)
2179 return -errno;
2180 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04002181}
2182
Will Drewry6ac91122011-10-21 16:38:58 -05002183int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002184{
2185 int st;
2186 if (waitpid(j->initpid, &st, 0) < 0)
2187 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08002188
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07002189 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07002190 int error_status = st;
2191 if (WIFSIGNALED(st)) {
2192 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07002193 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07002194 j->initpid, signum);
2195 /*
2196 * We return MINIJAIL_ERR_JAIL if the process received
2197 * SIGSYS, which happens when a syscall is blocked by
2198 * seccomp filters.
2199 * If not, we do what bash(1) does:
2200 * $? = 128 + signum
2201 */
2202 if (signum == SIGSYS) {
2203 error_status = MINIJAIL_ERR_JAIL;
2204 } else {
2205 error_status = 128 + signum;
2206 }
2207 }
2208 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07002209 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08002210
2211 int exit_status = WEXITSTATUS(st);
2212 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07002213 info("child process %d exited with status %d",
2214 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08002215
2216 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04002217}
2218
Will Drewry6ac91122011-10-21 16:38:58 -05002219void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002220{
Dylan Reid605ce7f2016-01-19 19:21:00 -08002221 size_t i;
2222
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08002223 if (j->flags.seccomp_filter && j->filter_prog) {
2224 free(j->filter_prog->filter);
2225 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04002226 }
Dylan Reid648b2202015-10-23 00:50:00 -07002227 while (j->mounts_head) {
2228 struct mountpoint *m = j->mounts_head;
2229 j->mounts_head = j->mounts_head->next;
Dylan Reid81e23972016-05-18 14:06:35 -07002230 free(m->data);
Dylan Reid648b2202015-10-23 00:50:00 -07002231 free(m->type);
2232 free(m->dest);
2233 free(m->src);
2234 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -04002235 }
Dylan Reid648b2202015-10-23 00:50:00 -07002236 j->mounts_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04002237 if (j->user)
2238 free(j->user);
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -08002239 if (j->suppl_gid_list)
2240 free(j->suppl_gid_list);
Will Drewrybee7ba72011-10-21 20:47:01 -05002241 if (j->chrootdir)
2242 free(j->chrootdir);
Jorge Lucangeli Obes3b2e6e42016-08-04 12:26:19 -04002243 if (j->pid_file_path)
2244 free(j->pid_file_path);
2245 if (j->uidmap)
2246 free(j->uidmap);
2247 if (j->gidmap)
2248 free(j->gidmap);
Andrew Brestickereac28942015-11-11 16:04:46 -08002249 if (j->alt_syscall_table)
2250 free(j->alt_syscall_table);
Dylan Reid605ce7f2016-01-19 19:21:00 -08002251 for (i = 0; i < j->cgroup_count; ++i)
2252 free(j->cgroups[i]);
Elly Jonese1749eb2011-10-07 13:54:59 -04002253 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04002254}