blob: e2af56a7f8ec3174081e2046a73814ec4b96f289 [file] [log] [blame]
Jorge Lucangeli Obesd613ab22015-03-03 14:22:50 -08001/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Elly Jonescd7a9042011-07-22 13:56:51 -04002 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07008
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08009#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050010#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040011#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070012#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <grp.h>
14#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050015#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050020#include <stdarg.h>
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -070021#include <stdbool.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080022#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040023#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <syscall.h>
27#include <sys/capability.h>
28#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050029#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040030#include <sys/prctl.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070031#include <sys/stat.h>
32#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080033#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040034#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040035#include <unistd.h>
36
37#include "libminijail.h"
38#include "libminijail-private.h"
39
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070040#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080041#include "syscall_filter.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070042#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080043
Lei Zhangeee31552012-10-17 21:27:10 -070044#ifdef HAVE_SECUREBITS_H
45#include <linux/securebits.h>
46#else
47#define SECURE_ALL_BITS 0x15
48#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
49#endif
50
Will Drewry32ac9f52011-08-18 21:36:27 -050051/* Until these are reliably available in linux/prctl.h */
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080052#ifndef PR_SET_SECCOMP
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -070053# define PR_SET_SECCOMP 22
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080054#endif
55
56/* For seccomp_filter using BPF. */
57#ifndef PR_SET_NO_NEW_PRIVS
58# define PR_SET_NO_NEW_PRIVS 38
59#endif
60#ifndef SECCOMP_MODE_FILTER
61# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
Will Drewry32ac9f52011-08-18 21:36:27 -050062#endif
63
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -070064#ifdef USE_SECCOMP_SOFTFAIL
65# define SECCOMP_SOFTFAIL 1
66#else
67# define SECCOMP_SOFTFAIL 0
68#endif
69
/* One requested bind mount; entries are chained into a singly linked list. */
struct binding {
	char *src;		/* path that gets bind-mounted */
	char *dest;		/* mount point; minijail_bind() requires a leading '/' */
	int writeable;		/* zero: bind_one() remounts the bind read-only */
	struct binding *next;	/* following entry, NULL at the tail */
};
76
/*
 * All configuration collected for one jail.
 * The whole struct is copied byte-for-byte by minijail_marshal_helper() and
 * minijail_unmarshal(), so pointer members are only meaningful in the process
 * that allocated them unless unmarshalling rebuilds them.
 */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 */
	struct {
		/*
		 * Unsigned on purpose: a signed one-bit bit-field can only
		 * hold 0 and -1, so storing 1 in it is implementation-defined.
		 */
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int net:1;
		unsigned int enter_net:1;
		unsigned int userns:1;
		unsigned int seccomp:1;
		unsigned int remount_proc_ro:1;
		unsigned int usergroups:1;
		unsigned int ptrace:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int pivot_root:1;
		unsigned int mount_tmp:1;
		unsigned int do_init:1;
		unsigned int pid_file:1;
	} flags;
	uid_t uid;		/* set by minijail_change_uid() */
	gid_t gid;		/* set by minijail_change_gid() */
	gid_t usergid;		/* pw_gid of |user|, set by minijail_change_user() */
	char *user;		/* heap copy of the user name, or NULL */
	uint64_t caps;		/* mask passed to minijail_use_caps() */
	pid_t initpid;
	int mountns_fd;		/* fd opened by minijail_namespace_enter_vfs() */
	int netns_fd;		/* fd opened by minijail_namespace_enter_net() */
	int filter_len;		/* instruction count of |filter_prog| */
	int binding_count;	/* number of entries in the bindings list */
	char *chrootdir;	/* shared by chroot and pivot_root setup */
	char *pid_file_path;
	char *uidmap;		/* newline-separated, see minijail_uidmap() */
	char *gidmap;		/* newline-separated, see minijail_gidmap() */
	struct sock_fprog *filter_prog;
	struct binding *bindings_head;
	struct binding *bindings_tail;
};
123
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700124/*
125 * Strip out flags meant for the parent.
126 * We keep things that are not inherited across execve(2) (e.g. capabilities),
127 * or are easier to set after execve(2) (e.g. seccomp filters).
128 */
129void minijail_preenter(struct minijail *j)
130{
131 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700132 j->flags.enter_vfs = 0;
Dylan Reid791f5772015-09-14 20:02:42 -0700133 j->flags.remount_proc_ro = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700134 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800135 j->flags.do_init = 0;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800136 j->flags.pid_file = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700137}
138
139/*
140 * Strip out flags meant for the child.
141 * We keep things that are inherited across execve(2).
142 */
143void minijail_preexec(struct minijail *j)
144{
145 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700146 int enter_vfs = j->flags.enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700147 int remount_proc_ro = j->flags.remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800148 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700149 if (j->user)
150 free(j->user);
151 j->user = NULL;
152 memset(&j->flags, 0, sizeof(j->flags));
153 /* Now restore anything we meant to keep. */
154 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700155 j->flags.enter_vfs = enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700156 j->flags.remount_proc_ro = remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800157 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700158 /* Note, |pids| will already have been used before this call. */
159}
160
161/* Minijail API. */
162
Will Drewry6ac91122011-10-21 16:38:58 -0500163struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400164{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400165 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400166}
167
Will Drewry6ac91122011-10-21 16:38:58 -0500168void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400169{
170 if (uid == 0)
171 die("useless change to uid 0");
172 j->uid = uid;
173 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400174}
175
Will Drewry6ac91122011-10-21 16:38:58 -0500176void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400177{
178 if (gid == 0)
179 die("useless change to gid 0");
180 j->gid = gid;
181 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400182}
183
Will Drewry6ac91122011-10-21 16:38:58 -0500184int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400185{
186 char *buf = NULL;
187 struct passwd pw;
188 struct passwd *ppw = NULL;
189 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
190 if (sz == -1)
191 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400192
Elly Jonesdd3e8512012-01-23 15:13:38 -0500193 /*
194 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400195 * the maximum needed size of the buffer, so we don't have to search.
196 */
197 buf = malloc(sz);
198 if (!buf)
199 return -ENOMEM;
200 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500201 /*
202 * We're safe to free the buffer here. The strings inside pw point
203 * inside buf, but we don't use any of them; this leaves the pointers
204 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
205 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400206 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700207 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400208 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700209 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400210 minijail_change_uid(j, ppw->pw_uid);
211 j->user = strdup(user);
212 if (!j->user)
213 return -ENOMEM;
214 j->usergid = ppw->pw_gid;
215 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400216}
217
Will Drewry6ac91122011-10-21 16:38:58 -0500218int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400219{
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -0700220 char *buf = NULL;
Yabin Cui1b21c8f2015-07-22 10:34:45 -0700221 struct group gr;
222 struct group *pgr = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400223 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
224 if (sz == -1)
225 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400226
Elly Jonesdd3e8512012-01-23 15:13:38 -0500227 /*
228 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400229 * the maximum needed size of the buffer, so we don't have to search.
230 */
231 buf = malloc(sz);
232 if (!buf)
233 return -ENOMEM;
234 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500235 /*
236 * We're safe to free the buffer here. The strings inside gr point
237 * inside buf, but we don't use any of them; this leaves the pointers
238 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
239 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400240 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700241 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400242 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700243 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400244 minijail_change_gid(j, pgr->gr_gid);
245 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400246}
247
Will Drewry6ac91122011-10-21 16:38:58 -0500248void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400249{
250 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400251}
252
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700253void API minijail_no_new_privs(struct minijail *j)
254{
255 j->flags.no_new_privs = 1;
256}
257
Will Drewry6ac91122011-10-21 16:38:58 -0500258void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400259{
260 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500261}
262
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700263void API minijail_log_seccomp_filter_failures(struct minijail *j)
264{
265 j->flags.log_seccomp_filter = 1;
266}
267
Will Drewry6ac91122011-10-21 16:38:58 -0500268void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400269{
270 j->caps = capmask;
271 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400272}
273
Will Drewry6ac91122011-10-21 16:38:58 -0500274void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400275{
276 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400277}
278
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700279void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
280{
281 int ns_fd = open(ns_path, O_RDONLY);
282 if (ns_fd < 0) {
283 pdie("failed to open namespace '%s'", ns_path);
284 }
285 j->mountns_fd = ns_fd;
286 j->flags.enter_vfs = 1;
287}
288
Will Drewry6ac91122011-10-21 16:38:58 -0500289void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400290{
Elly Jonese58176c2012-01-23 11:46:17 -0500291 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700292 j->flags.remount_proc_ro = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400293 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800294 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400295}
296
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400297void API minijail_namespace_net(struct minijail *j)
298{
299 j->flags.net = 1;
300}
301
Dylan Reid1102f5a2015-09-15 11:52:20 -0700302void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
303{
304 int ns_fd = open(ns_path, O_RDONLY);
305 if (ns_fd < 0) {
306 pdie("failed to open namespace '%s'", ns_path);
307 }
308 j->netns_fd = ns_fd;
309 j->flags.enter_net = 1;
310}
311
Dylan Reid791f5772015-09-14 20:02:42 -0700312void API minijail_remount_proc_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400313{
314 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700315 j->flags.remount_proc_ro = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400316}
317
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800318void API minijail_namespace_user(struct minijail *j)
319{
320 j->flags.userns = 1;
321}
322
323int API minijail_uidmap(struct minijail *j, const char *uidmap)
324{
325 j->uidmap = strdup(uidmap);
326 if (!j->uidmap)
327 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800328 char *ch;
329 for (ch = j->uidmap; *ch; ch++) {
330 if (*ch == ',')
331 *ch = '\n';
332 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800333 return 0;
334}
335
336int API minijail_gidmap(struct minijail *j, const char *gidmap)
337{
338 j->gidmap = strdup(gidmap);
339 if (!j->gidmap)
340 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800341 char *ch;
342 for (ch = j->gidmap; *ch; ch++) {
343 if (*ch == ',')
344 *ch = '\n';
345 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800346 return 0;
347}
348
Will Drewry6ac91122011-10-21 16:38:58 -0500349void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400350{
351 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400352}
353
Will Drewry6ac91122011-10-21 16:38:58 -0500354void API minijail_disable_ptrace(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400355{
356 j->flags.ptrace = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400357}
358
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800359void API minijail_run_as_init(struct minijail *j)
360{
361 /*
362 * Since the jailed program will become 'init' in the new PID namespace,
363 * Minijail does not need to fork an 'init' process.
364 */
365 j->flags.do_init = 0;
366}
367
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700368int API minijail_enter_chroot(struct minijail *j, const char *dir)
369{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400370 if (j->chrootdir)
371 return -EINVAL;
372 j->chrootdir = strdup(dir);
373 if (!j->chrootdir)
374 return -ENOMEM;
375 j->flags.chroot = 1;
376 return 0;
377}
378
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800379int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
380{
381 if (j->chrootdir)
382 return -EINVAL;
383 j->chrootdir = strdup(dir);
384 if (!j->chrootdir)
385 return -ENOMEM;
386 j->flags.pivot_root = 1;
387 return 0;
388}
389
/*
 * Builds "<external_path>/<path_inside_chroot>" in a newly allocated string.
 * A '/' is always inserted between the two parts, even when
 * @path_inside_chroot already starts with one.
 *
 * Returns the new string, which the caller must free(), or NULL if the
 * allocation fails.
 */
static char *append_external_path(const char *external_path,
				  const char *path_inside_chroot)
{
	char *path;
	size_t pathlen;

	/* One extra char for '/' and one for '\0', hence + 2. */
	pathlen = strlen(path_inside_chroot) + strlen(external_path) + 2;
	path = malloc(pathlen);
	if (!path)
		return NULL;
	snprintf(path, pathlen, "%s/%s", external_path, path_inside_chroot);

	return path;
}
403
404char API *minijail_get_original_path(struct minijail *j,
405 const char *path_inside_chroot)
406{
407 struct binding *b;
408
409 b = j->bindings_head;
410 while (b) {
411 /*
412 * If |path_inside_chroot| is the exact destination of a
413 * bind mount, then the original path is exactly the source of
414 * the bind mount.
415 * for example: "-b /some/path/exe,/chroot/path/exe"
416 * bind source = /some/path/exe, bind dest = /chroot/path/exe
417 * Then when getting the original path of "/chroot/path/exe",
418 * the source of that bind mount, "/some/path/exe" is what
419 * should be returned.
420 */
421 if (!strcmp(b->dest, path_inside_chroot))
422 return strdup(b->src);
423
424 /*
425 * If |path_inside_chroot| is within the destination path of a
426 * bind mount, take the suffix of the chroot path relative to
427 * the bind mount destination path, and append it to the bind
428 * mount source path.
429 */
430 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
431 const char *relative_path =
432 path_inside_chroot + strlen(b->dest);
433 return append_external_path(b->src, relative_path);
434 }
435 b = b->next;
436 }
437
438 /* If there is a chroot path, append |path_inside_chroot| to that. */
439 if (j->chrootdir)
440 return append_external_path(j->chrootdir, path_inside_chroot);
441
442 /* No chroot, so the path outside is the same as it is inside. */
443 return strdup(path_inside_chroot);
Dylan Reid08946cc2015-09-16 19:10:57 -0700444}
445
Lee Campbell11af0622014-05-22 12:36:04 -0700446void API minijail_mount_tmp(struct minijail *j)
447{
448 j->flags.mount_tmp = 1;
449}
450
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800451int API minijail_write_pid_file(struct minijail *j, const char *path)
452{
453 j->pid_file_path = strdup(path);
454 if (!j->pid_file_path)
455 return -ENOMEM;
456 j->flags.pid_file = 1;
457 return 0;
458}
459
Will Drewry6ac91122011-10-21 16:38:58 -0500460int API minijail_bind(struct minijail *j, const char *src, const char *dest,
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700461 int writeable)
462{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400463 struct binding *b;
464
465 if (*dest != '/')
466 return -EINVAL;
467 b = calloc(1, sizeof(*b));
468 if (!b)
469 return -ENOMEM;
470 b->dest = strdup(dest);
471 if (!b->dest)
472 goto error;
473 b->src = strdup(src);
474 if (!b->src)
475 goto error;
476 b->writeable = writeable;
477
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700478 info("bind %s -> %s", src, dest);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400479
Elly Jonesdd3e8512012-01-23 15:13:38 -0500480 /*
481 * Force vfs namespacing so the bind mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400482 * containing vfs namespace.
483 */
484 minijail_namespace_vfs(j);
485
486 if (j->bindings_tail)
487 j->bindings_tail->next = b;
488 else
489 j->bindings_head = b;
490 j->bindings_tail = b;
491 j->binding_count++;
492
493 return 0;
494
495error:
496 free(b->src);
497 free(b->dest);
498 free(b);
499 return -ENOMEM;
500}
501
Will Drewry6ac91122011-10-21 16:38:58 -0500502void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
Elly Jonese1749eb2011-10-07 13:54:59 -0400503{
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700504 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
505 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
506 warn("not loading seccomp filter, seccomp not supported");
507 return;
508 }
509 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400510 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800511 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700512 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400513 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800514
515 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700516 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
517 die("failed to compile seccomp filter BPF program in '%s'",
518 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800519 }
520
521 j->filter_len = fprog->len;
522 j->filter_prog = fprog;
523
Elly Jonese1749eb2011-10-07 13:54:59 -0400524 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500525}
526
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* bytes still free behind |buf| */
	size_t total;		/* bytes requested so far, written or not */
	char *buf;		/* next write position */
};
532
Will Drewry6ac91122011-10-21 16:38:58 -0500533void marshal_state_init(struct marshal_state *state,
534 char *buf, size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400535{
536 state->available = available;
537 state->buf = buf;
538 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500539}
540
Will Drewry6ac91122011-10-21 16:38:58 -0500541void marshal_append(struct marshal_state *state,
542 char *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400543{
544 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500545
Elly Jonese1749eb2011-10-07 13:54:59 -0400546 /* Up to |available| will be written. */
547 if (copy_len) {
548 memcpy(state->buf, src, copy_len);
549 state->buf += copy_len;
550 state->available -= copy_len;
551 }
552 /* |total| will contain the expected length. */
553 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500554}
555
Will Drewry6ac91122011-10-21 16:38:58 -0500556void minijail_marshal_helper(struct marshal_state *state,
557 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400558{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400559 struct binding *b = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400560 marshal_append(state, (char *)j, sizeof(*j));
561 if (j->user)
562 marshal_append(state, j->user, strlen(j->user) + 1);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400563 if (j->chrootdir)
564 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800565 if (j->flags.seccomp_filter && j->filter_prog) {
566 struct sock_fprog *fp = j->filter_prog;
567 marshal_append(state, (char *)fp->filter,
568 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400569 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400570 for (b = j->bindings_head; b; b = b->next) {
571 marshal_append(state, b->src, strlen(b->src) + 1);
572 marshal_append(state, b->dest, strlen(b->dest) + 1);
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700573 marshal_append(state, (char *)&b->writeable,
574 sizeof(b->writeable));
Elly Jones51a5b6c2011-10-12 19:09:26 -0400575 }
Will Drewryf89aef52011-09-16 16:48:57 -0500576}
577
Will Drewry6ac91122011-10-21 16:38:58 -0500578size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400579{
580 struct marshal_state state;
581 marshal_state_init(&state, NULL, 0);
582 minijail_marshal_helper(&state, j);
583 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500584}
585
Elly Jonese1749eb2011-10-07 13:54:59 -0400586int minijail_marshal(const struct minijail *j, char *buf, size_t available)
587{
588 struct marshal_state state;
589 marshal_state_init(&state, buf, available);
590 minijail_marshal_helper(&state, j);
591 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500592}
593
/* consumebytes: takes @length bytes off the front of the buffer @buf
 * @length    Number of bytes to take
 * @buf       Buffer cursor; advanced past the taken bytes on success
 * @buflength Bytes left in @buf; reduced by @length on success
 *
 * Returns a pointer to the first taken byte, or NULL (leaving @buf and
 * @buflength untouched) if fewer than @length bytes are left.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *start;

	if (*buflength < length)
		return NULL;

	start = *buf;
	*buf = start + length;
	*buflength -= length;
	return start;
}
610
/* consumestr: takes a C string off the front of the buffer @buf
 * @buf       Buffer cursor; advanced past the string and its '\0' on success
 * @buflength Bytes left in @buf; reduced accordingly on success
 *
 * Returns a pointer to the start of the string, or NULL if no '\0' occurs
 * within the remaining bytes.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const size_t remaining = *buflength;
	const size_t len = strnlen(*buf, remaining);

	/* strnlen() running into the limit means there's no null-terminator. */
	if (len >= remaining)
		return NULL;

	return consumebytes(len + 1, buf, buflength);
}
625
Elly Jonese1749eb2011-10-07 13:54:59 -0400626int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
627{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400628 int i;
629 int count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500630 int ret = -EINVAL;
631
Elly Jonese1749eb2011-10-07 13:54:59 -0400632 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500633 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400634 memcpy((void *)j, serialized, sizeof(*j));
635 serialized += sizeof(*j);
636 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500637
Will Drewrybee7ba72011-10-21 20:47:01 -0500638 /* Potentially stale pointers not used as signals. */
639 j->bindings_head = NULL;
640 j->bindings_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800641 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500642
Elly Jonese1749eb2011-10-07 13:54:59 -0400643 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400644 char *user = consumestr(&serialized, &length);
645 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500646 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400647 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500648 if (!j->user)
649 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400650 }
Will Drewryf89aef52011-09-16 16:48:57 -0500651
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400652 if (j->chrootdir) { /* stale pointer */
653 char *chrootdir = consumestr(&serialized, &length);
654 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500655 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400656 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500657 if (!j->chrootdir)
658 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400659 }
660
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800661 if (j->flags.seccomp_filter && j->filter_len > 0) {
662 size_t ninstrs = j->filter_len;
663 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
664 ninstrs > USHRT_MAX)
665 goto bad_filters;
666
667 size_t program_len = ninstrs * sizeof(struct sock_filter);
668 void *program = consumebytes(program_len, &serialized, &length);
669 if (!program)
670 goto bad_filters;
671
672 j->filter_prog = malloc(sizeof(struct sock_fprog));
673 j->filter_prog->len = ninstrs;
674 j->filter_prog->filter = malloc(program_len);
675 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400676 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400677
678 count = j->binding_count;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400679 j->binding_count = 0;
680 for (i = 0; i < count; ++i) {
681 int *writeable;
682 const char *dest;
683 const char *src = consumestr(&serialized, &length);
684 if (!src)
Will Drewrybee7ba72011-10-21 20:47:01 -0500685 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400686 dest = consumestr(&serialized, &length);
687 if (!dest)
Will Drewrybee7ba72011-10-21 20:47:01 -0500688 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400689 writeable = consumebytes(sizeof(*writeable), &serialized, &length);
690 if (!writeable)
Will Drewrybee7ba72011-10-21 20:47:01 -0500691 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400692 if (minijail_bind(j, src, dest, *writeable))
Will Drewrybee7ba72011-10-21 20:47:01 -0500693 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400694 }
695
Elly Jonese1749eb2011-10-07 13:54:59 -0400696 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500697
698bad_bindings:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800699 if (j->flags.seccomp_filter && j->filter_len > 0) {
700 free(j->filter_prog->filter);
701 free(j->filter_prog);
702 }
Will Drewrybee7ba72011-10-21 20:47:01 -0500703bad_filters:
704 if (j->chrootdir)
705 free(j->chrootdir);
706bad_chrootdir:
707 if (j->user)
708 free(j->user);
709clear_pointers:
710 j->user = NULL;
711 j->chrootdir = NULL;
712out:
713 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500714}
715
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800716static void write_ugid_mappings(const struct minijail *j, int *pipe_fds)
717{
718 int fd, ret, len;
719 size_t sz;
720 char fname[32];
721 close(pipe_fds[0]);
722
723 sz = sizeof(fname);
724 if (j->uidmap) {
725 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700726 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800727 die("failed to write file name of uid_map");
728 fd = open(fname, O_WRONLY);
729 if (fd < 0)
730 pdie("failed to open '%s'", fname);
731 len = strlen(j->uidmap);
732 if (write(fd, j->uidmap, len) < len)
733 die("failed to set uid_map");
734 close(fd);
735 }
736 if (j->gidmap) {
737 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700738 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800739 die("failed to write file name of gid_map");
740 fd = open(fname, O_WRONLY);
741 if (fd < 0)
742 pdie("failed to open '%s'", fname);
743 len = strlen(j->gidmap);
744 if (write(fd, j->gidmap, len) < len)
745 die("failed to set gid_map");
746 close(fd);
747 }
748
749 close(pipe_fds[1]);
750}
751
752static void enter_user_namespace(const struct minijail *j, int *pipe_fds)
753{
754 char buf;
755
756 close(pipe_fds[1]);
757
758 /* Wait for parent to set up uid/gid mappings. */
759 if (read(pipe_fds[0], &buf, 1) != 0)
760 die("failed to sync with parent");
761 close(pipe_fds[0]);
762
763 if (j->uidmap && setresuid(0, 0, 0))
764 pdie("setresuid");
765 if (j->gidmap && setresgid(0, 0, 0))
766 pdie("setresgid");
767}
768
Elly Jones51a5b6c2011-10-12 19:09:26 -0400769/* bind_one: Applies bindings from @b for @j, recursing as needed.
770 * @j Minijail these bindings are for
771 * @b Head of list of bindings
772 *
773 * Returns 0 for success.
774 */
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700775int bind_one(const struct minijail *j, struct binding *b)
776{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400777 int ret = 0;
778 char *dest = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400779 if (ret)
780 return ret;
781 /* dest has a leading "/" */
782 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
783 return -ENOMEM;
Elly Jonesa1059632011-12-15 15:17:07 -0500784 ret = mount(b->src, dest, NULL, MS_BIND, NULL);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400785 if (ret)
786 pdie("bind: %s -> %s", b->src, dest);
Elly Jonesa1059632011-12-15 15:17:07 -0500787 if (!b->writeable) {
788 ret = mount(b->src, dest, NULL,
Jorge Lucangeli Obes2f61ee42014-06-16 11:08:18 -0700789 MS_BIND | MS_REMOUNT | MS_RDONLY, NULL);
Elly Jonesa1059632011-12-15 15:17:07 -0500790 if (ret)
791 pdie("bind ro: %s -> %s", b->src, dest);
792 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400793 free(dest);
794 if (b->next)
795 return bind_one(j, b->next);
796 return ret;
797}
798
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700799int enter_chroot(const struct minijail *j)
800{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400801 int ret;
802 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
803 return ret;
804
805 if (chroot(j->chrootdir))
806 return -errno;
807
808 if (chdir("/"))
809 return -errno;
810
811 return 0;
812}
813
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800814int enter_pivot_root(const struct minijail *j)
815{
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800816 int ret, oldroot, newroot;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800817 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
818 return ret;
819
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800820 /* Keep the fd for both old and new root. It will be used in fchdir later. */
821 oldroot = open("/", O_DIRECTORY | O_RDONLY);
822 if (oldroot < 0)
823 pdie("failed to open / for fchdir");
824 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY);
825 if (newroot < 0)
826 pdie("failed to open %s for fchdir", j->chrootdir);
827
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800828 /* To ensure chrootdir is the root of a file system, do a self bind mount. */
829 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
830 pdie("failed to bind mount '%s'", j->chrootdir);
831 if (chdir(j->chrootdir))
832 return -errno;
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800833 if (syscall(SYS_pivot_root, ".", "."))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800834 pdie("pivot_root");
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800835
836 /*
837 * Now the old root is mounted on top of the new root. Use fchdir to
838 * change to the old root and unmount it.
839 */
840 if (fchdir(oldroot))
841 pdie("failed to fchdir to old /");
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800842 /* The old root might be busy, so use lazy unmount. */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800843 if (umount2(".", MNT_DETACH))
844 pdie("umount(/)");
845 /* Change back to the new root. */
846 if (fchdir(newroot))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800847 return -errno;
848 if (chroot("/"))
849 return -errno;
Jorge Lucangeli Obes46a55092015-10-12 15:31:59 -0700850 /* Set correct CWD for getcwd(3). */
851 if (chdir("/"))
852 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800853
854 return 0;
855}
856
/*
 * mount_tmp: Mounts a tmpfs on /tmp with the options "size=64M,mode=777".
 *
 * Returns the result of mount(2).
 */
int mount_tmp(void)
{
	const char *tmpfs_opts = "size=64M,mode=777";

	return mount("none", "/tmp", "tmpfs", 0, tmpfs_opts);
}
861
Dylan Reid791f5772015-09-14 20:02:42 -0700862int remount_proc_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400863{
864 const char *kProcPath = "/proc";
865 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -0500866 /*
867 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -0400868 * /proc in our namespace, which means using MS_REMOUNT here would
869 * mutate our parent's mount as well, even though we're in a VFS
870 * namespace (!). Instead, remove their mount from our namespace
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800871 * and make our own. However, if we are in a new user namespace, /proc
872 * is not seen as mounted, so don't return error if umount() fails.
Elly Jonese1749eb2011-10-07 13:54:59 -0400873 */
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -0700874 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
Elly Jonese1749eb2011-10-07 13:54:59 -0400875 return -errno;
876 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
877 return -errno;
878 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400879}
880
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800881static void write_pid_file(const struct minijail *j)
882{
883 FILE *fp = fopen(j->pid_file_path, "w");
884
885 if (!fp)
886 pdie("failed to open '%s'", j->pid_file_path);
887 if (fprintf(fp, "%d\n", (int)j->initpid) < 0)
888 pdie("fprintf(%s)", j->pid_file_path);
889 if (fclose(fp))
890 pdie("fclose(%s)", j->pid_file_path);
891}
892
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700893void drop_ugid(const struct minijail *j)
894{
895 if (j->flags.usergroups) {
896 if (initgroups(j->user, j->usergid))
897 pdie("initgroups");
898 } else {
899 /* Only attempt to clear supplemental groups if we are changing
900 * users. */
901 if ((j->uid || j->gid) && setgroups(0, NULL))
902 pdie("setgroups");
903 }
904
905 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
906 pdie("setresgid");
907
908 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
909 pdie("setresuid");
910}
911
/*
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel). So suck up the answer via /proc.
 *
 * Returns the number read from /proc/sys/kernel/cap_last_cap. Failing to
 * open or parse that file is reported through pdie().
 */
static unsigned int get_last_valid_cap(void)
{
	const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
	FILE *fp = fopen(cap_file, "re");
	unsigned int last_valid_cap;

	/* Never hand a NULL stream to fscanf() when /proc is unavailable. */
	if (!fp)
		pdie("fopen(%s)", cap_file);
	if (fscanf(fp, "%u", &last_valid_cap) != 1)
		pdie("fscanf(%s)", cap_file);
	fclose(fp);

	return last_valid_cap;
}
931
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700932void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
Elly Jonese1749eb2011-10-07 13:54:59 -0400933{
934 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -0800935 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -0800936 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400937 unsigned int i;
938 if (!caps)
939 die("can't get process caps");
940 if (cap_clear_flag(caps, CAP_INHERITABLE))
941 die("can't clear inheritable caps");
942 if (cap_clear_flag(caps, CAP_EFFECTIVE))
943 die("can't clear effective caps");
944 if (cap_clear_flag(caps, CAP_PERMITTED))
945 die("can't clear permitted caps");
Dylan Reidf682d472015-09-17 21:39:07 -0700946 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cook323878a2013-02-05 15:35:24 -0800947 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800948 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -0400949 continue;
Kees Cook323878a2013-02-05 15:35:24 -0800950 flag[0] = i;
951 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400952 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -0800953 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400954 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -0800955 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400956 die("can't add inheritable cap");
957 }
958 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -0800959 die("can't apply initial cleaned capset");
960
961 /*
962 * Instead of dropping bounding set first, do it here in case
963 * the caller had a more permissive bounding set which could
964 * have been used above to raise a capability that wasn't already
965 * present. This requires CAP_SETPCAP, so we raised/kept it above.
966 */
Dylan Reidf682d472015-09-17 21:39:07 -0700967 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cooke5609ac2013-02-06 14:12:41 -0800968 if (j->caps & (one << i))
Elly Jonese1749eb2011-10-07 13:54:59 -0400969 continue;
970 if (prctl(PR_CAPBSET_DROP, i))
971 pdie("prctl(PR_CAPBSET_DROP)");
972 }
Kees Cook323878a2013-02-05 15:35:24 -0800973
974 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800975 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -0800976 flag[0] = CAP_SETPCAP;
977 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
978 die("can't clear effective cap");
979 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
980 die("can't clear permitted cap");
981 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
982 die("can't clear inheritable cap");
983 }
984
985 if (cap_set_proc(caps))
986 die("can't apply final cleaned capset");
987
988 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -0400989}
990
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700991void set_seccomp_filter(const struct minijail *j)
992{
993 /*
994 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
995 * in the kernel source tree for an explanation of the parameters.
996 */
997 if (j->flags.no_new_privs) {
998 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
999 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
1000 }
1001
1002 /*
1003 * If we're logging seccomp filter failures,
1004 * install the SIGSYS handler first.
1005 */
1006 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
1007 if (install_sigsys_handler())
1008 pdie("install SIGSYS handler");
1009 warn("logging seccomp filter failures");
1010 }
1011
1012 /*
1013 * Install the syscall filter.
1014 */
1015 if (j->flags.seccomp_filter) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001016 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) {
1017 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
1018 warn("seccomp not supported");
1019 return;
1020 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001021 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001022 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001023 }
1024}
1025
Will Drewry6ac91122011-10-21 16:38:58 -05001026void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001027{
Dylan Reidf682d472015-09-17 21:39:07 -07001028 /*
1029 * Get the last valid cap from /proc, since /proc can be unmounted
1030 * before drop_caps().
1031 */
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -07001032 unsigned int last_valid_cap = get_last_valid_cap();
Dylan Reidf682d472015-09-17 21:39:07 -07001033
Elly Jonese1749eb2011-10-07 13:54:59 -04001034 if (j->flags.pids)
1035 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001036 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -04001037
Elly Jonese1749eb2011-10-07 13:54:59 -04001038 if (j->flags.usergroups && !j->user)
1039 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -04001040
Elly Jonesdd3e8512012-01-23 15:13:38 -05001041 /*
1042 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -04001043 * so we don't even try. If any of our operations fail, we abort() the
1044 * entire process.
1045 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001046 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1047 pdie("setns(CLONE_NEWNS)");
1048
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001049 if (j->flags.vfs) {
1050 if (unshare(CLONE_NEWNS))
1051 pdie("unshare(vfs)");
1052 /*
1053 * Remount all filesystems as private. If they are shared
1054 * new bind mounts will creep out of our namespace.
1055 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1056 */
1057 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1058 pdie("mount(/, private)");
1059 }
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001060
Dylan Reid1102f5a2015-09-15 11:52:20 -07001061 if (j->flags.enter_net) {
1062 if (setns(j->netns_fd, CLONE_NEWNET))
1063 pdie("setns(CLONE_NEWNET)");
1064 } else if (j->flags.net && unshare(CLONE_NEWNET)) {
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001065 pdie("unshare(net)");
Dylan Reid1102f5a2015-09-15 11:52:20 -07001066 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001067
Elly Jones51a5b6c2011-10-12 19:09:26 -04001068 if (j->flags.chroot && enter_chroot(j))
1069 pdie("chroot");
1070
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001071 if (j->flags.pivot_root && enter_pivot_root(j))
1072 pdie("pivot_root");
1073
Jorge Lucangeli Obes3901da62015-03-03 13:55:11 -08001074 if (j->flags.mount_tmp && mount_tmp())
Lee Campbell11af0622014-05-22 12:36:04 -07001075 pdie("mount_tmp");
1076
Dylan Reid791f5772015-09-14 20:02:42 -07001077 if (j->flags.remount_proc_ro && remount_proc_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -04001078 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -04001079
Elly Jonese1749eb2011-10-07 13:54:59 -04001080 if (j->flags.caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001081 /*
1082 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -04001083 * capability to change uids, our attempt to use setuid()
1084 * below will fail. Hang on to root caps across setuid(), then
1085 * lock securebits.
1086 */
1087 if (prctl(PR_SET_KEEPCAPS, 1))
1088 pdie("prctl(PR_SET_KEEPCAPS)");
1089 if (prctl
1090 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
1091 pdie("prctl(PR_SET_SECUREBITS)");
1092 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001093
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001094 /*
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001095 * If we're setting no_new_privs, we can drop privileges
1096 * before setting seccomp filter. This way filter policies
1097 * don't need to allow privilege-dropping syscalls.
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001098 */
1099 if (j->flags.no_new_privs) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001100 drop_ugid(j);
1101 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001102 drop_caps(j, last_valid_cap);
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001103
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001104 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04001105 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001106 /*
1107 * If we're not setting no_new_privs,
1108 * we need to set seccomp filter *before* dropping privileges.
1109 * WARNING: this means that filter policies *must* allow
1110 * setgroups()/setresgid()/setresuid() for dropping root and
1111 * capget()/capset()/prctl() for dropping caps.
1112 */
1113 set_seccomp_filter(j);
1114
1115 drop_ugid(j);
1116 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001117 drop_caps(j, last_valid_cap);
Elly Jonese1749eb2011-10-07 13:54:59 -04001118 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001119
Elly Jonesdd3e8512012-01-23 15:13:38 -05001120 /*
1121 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -04001122 * privilege-dropping syscalls :)
1123 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001124 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
1125 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
1126 warn("seccomp not supported");
1127 return;
1128 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001129 pdie("prctl(PR_SET_SECCOMP)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001130 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001131}
1132
/* TODO(wad) will visibility affect this variable? */
/* Wait status recorded by init() for the root pid; 0 until then. */
static int init_exitstatus;

/*
 * init_term: Signal handler that terminates the process immediately with
 * the currently recorded init_exitstatus. The signal number is ignored.
 */
void init_term(int sig)
{
	(void)sig;
	_exit(init_exitstatus);
}
1140
Will Drewry6ac91122011-10-21 16:38:58 -05001141int init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -04001142{
1143 pid_t pid;
1144 int status;
1145 /* so that we exit with the right status */
1146 signal(SIGTERM, init_term);
1147 /* TODO(wad) self jail with seccomp_filters here. */
1148 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001149 /*
1150 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -04001151 * left inside our pid namespace or we get a signal.
1152 */
1153 if (pid == rootpid)
1154 init_exitstatus = status;
1155 }
1156 if (!WIFEXITED(init_exitstatus))
1157 _exit(MINIJAIL_ERR_INIT);
1158 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -04001159}
1160
Will Drewry6ac91122011-10-21 16:38:58 -05001161int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001162{
1163 size_t sz = 0;
1164 size_t bytes = read(fd, &sz, sizeof(sz));
1165 char *buf;
1166 int r;
1167 if (sizeof(sz) != bytes)
1168 return -EINVAL;
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001169 if (sz > USHRT_MAX) /* arbitrary sanity check */
Elly Jonese1749eb2011-10-07 13:54:59 -04001170 return -E2BIG;
1171 buf = malloc(sz);
1172 if (!buf)
1173 return -ENOMEM;
1174 bytes = read(fd, buf, sz);
1175 if (bytes != sz) {
1176 free(buf);
1177 return -EINVAL;
1178 }
1179 r = minijail_unmarshal(j, buf, sz);
1180 free(buf);
1181 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001182}
1183
Will Drewry6ac91122011-10-21 16:38:58 -05001184int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04001185{
1186 char *buf;
1187 size_t sz = minijail_size(j);
1188 ssize_t written;
1189 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -04001190
Elly Jonese1749eb2011-10-07 13:54:59 -04001191 if (!sz)
1192 return -EINVAL;
1193 buf = malloc(sz);
1194 r = minijail_marshal(j, buf, sz);
1195 if (r) {
1196 free(buf);
1197 return r;
1198 }
1199 /* Sends [size][minijail]. */
1200 written = write(fd, &sz, sizeof(sz));
1201 if (written != sizeof(sz)) {
1202 free(buf);
1203 return -EFAULT;
1204 }
1205 written = write(fd, buf, sz);
1206 if (written < 0 || (size_t) written != sz) {
1207 free(buf);
1208 return -EFAULT;
1209 }
1210 free(buf);
1211 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001212}
Elly Jonescd7a9042011-07-22 13:56:51 -04001213
Will Drewry6ac91122011-10-21 16:38:58 -05001214int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -04001215{
Daniel Erat5b7a3182015-08-19 16:06:22 -06001216#if defined(__ANDROID__)
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001217 /* Don't use LDPRELOAD on Brillo. */
1218 return 0;
1219#else
Elly Jonese1749eb2011-10-07 13:54:59 -04001220 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1221 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1222 if (!newenv)
1223 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -04001224
Elly Jonese1749eb2011-10-07 13:54:59 -04001225 /* Only insert a separating space if we have something to separate... */
1226 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1227 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -04001228
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001229 /* setenv() makes a copy of the string we give it. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001230 setenv(kLdPreloadEnvVar, newenv, 1);
1231 free(newenv);
1232 return 0;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001233#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04001234}
1235
Will Drewry6ac91122011-10-21 16:38:58 -05001236int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04001237{
1238 int r = pipe(fds);
1239 char fd_buf[11];
1240 if (r)
1241 return r;
1242 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1243 if (r <= 0)
1244 return -EINVAL;
1245 setenv(kFdEnvVar, fd_buf, 1);
1246 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -05001247}
1248
/*
 * setup_pipe_end: Keeps one end of the pipe @fds and closes the other.
 * @fds   Pipe descriptors
 * @index End to keep: 0 or 1
 *
 * Returns fds[@index], or -1 (touching nothing) if @index is neither 0 nor 1.
 */
int setup_pipe_end(int fds[2], size_t index)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	other = (index == 0) ? 1 : 0;
	close(fds[other]);
	return fds[index];
}
1257
/*
 * setup_and_dupe_pipe_end: Closes the unused end of the pipe @fds and
 * duplicates the other end onto @fd.
 * @fds   Pipe descriptors
 * @index End to duplicate: 0 or 1
 * @fd    Descriptor number the chosen end is duplicated onto
 *
 * Returns the result of dup2(2), or -1 (touching nothing) if @index is
 * neither 0 nor 1.
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other;

	if (index != 0 && index != 1)
		return -1;

	other = (index == 0) ? 1 : 0;
	close(fds[other]);
	/* dup2(2) the corresponding end of the pipe into |fd|. */
	return dup2(fds[index], fd);
}
1267
/*
 * Forward declaration: the public minijail_run*() entry points below all
 * delegate to this function, which is defined after them.
 */
int minijail_run_internal(struct minijail *j,
			  const char *filename,
			  char *const argv[],
			  pid_t *pchild_pid,
			  int *pstdin_fd,
			  int *pstdout_fd,
			  int *pstderr_fd,
			  int use_preload);
1272
Will Drewry6ac91122011-10-21 16:38:58 -05001273int API minijail_run(struct minijail *j, const char *filename,
1274 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04001275{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001276 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1277 true);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001278}
1279
1280int API minijail_run_pid(struct minijail *j, const char *filename,
1281 char *const argv[], pid_t *pchild_pid)
1282{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001283 return minijail_run_internal(j, filename, argv, pchild_pid,
1284 NULL, NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001285}
1286
1287int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001288 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001289{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001290 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1291 NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001292}
1293
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001294int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001295 char *const argv[], pid_t *pchild_pid,
1296 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001297{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001298 return minijail_run_internal(j, filename, argv, pchild_pid,
1299 pstdin_fd, pstdout_fd, pstderr_fd, true);
1300}
1301
1302int API minijail_run_no_preload(struct minijail *j, const char *filename,
1303 char *const argv[])
1304{
1305 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1306 false);
1307}
1308
Samuel Tan63187f42015-10-16 13:01:53 -07001309int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1310 const char *filename, char *const argv[],
1311 pid_t *pchild_pid,
1312 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) {
1313 return minijail_run_internal(j, filename, argv, pchild_pid,
1314 pstdin_fd, pstdout_fd, pstderr_fd, false);
1315}
1316
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001317int minijail_run_internal(struct minijail *j, const char *filename,
1318 char *const argv[], pid_t *pchild_pid,
1319 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1320 int use_preload)
1321{
Elly Jonese1749eb2011-10-07 13:54:59 -04001322 char *oldenv, *oldenv_copy = NULL;
1323 pid_t child_pid;
1324 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001325 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001326 int stdout_fds[2];
1327 int stderr_fds[2];
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001328 int userns_pipe_fds[2];
Elly Jonese1749eb2011-10-07 13:54:59 -04001329 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001330 /* We need to remember this across the minijail_preexec() call. */
1331 int pid_namespace = j->flags.pids;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001332 int do_init = j->flags.do_init;
Ben Chan541c7e52011-08-26 14:55:53 -07001333
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001334 if (use_preload) {
1335 oldenv = getenv(kLdPreloadEnvVar);
1336 if (oldenv) {
1337 oldenv_copy = strdup(oldenv);
1338 if (!oldenv_copy)
1339 return -ENOMEM;
1340 }
1341
1342 if (setup_preload())
1343 return -EFAULT;
Elly Jonese1749eb2011-10-07 13:54:59 -04001344 }
Will Drewryf89aef52011-09-16 16:48:57 -05001345
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001346 if (!use_preload) {
1347 if (j->flags.caps)
1348 die("Capabilities are not supported without "
1349 "LD_PRELOAD");
1350 }
Will Drewry2f54b6a2011-09-16 13:45:31 -05001351
Elly Jonesdd3e8512012-01-23 15:13:38 -05001352 /*
Jorge Lucangeli Obes3c84df12015-05-14 17:37:58 -07001353 * Make the process group ID of this process equal to its PID, so that
1354 * both the Minijail process and the jailed process can be killed
1355 * together.
1356 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1357 * the process is already a process group leader.
1358 */
1359 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1360 if (errno != EPERM) {
1361 pdie("setpgid(0, 0)");
1362 }
1363 }
1364
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001365 if (use_preload) {
1366 /*
1367 * Before we fork(2) and execve(2) the child process, we need
1368 * to open a pipe(2) to send the minijail configuration over.
1369 */
1370 if (setup_pipe(pipe_fds))
1371 return -EFAULT;
1372 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001373
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001374 /*
1375 * If we want to write to the child process' standard input,
1376 * create the pipe(2) now.
1377 */
1378 if (pstdin_fd) {
1379 if (pipe(stdin_fds))
1380 return -EFAULT;
1381 }
1382
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001383 /*
1384 * If we want to read from the child process' standard output,
1385 * create the pipe(2) now.
1386 */
1387 if (pstdout_fd) {
1388 if (pipe(stdout_fds))
1389 return -EFAULT;
1390 }
1391
1392 /*
1393 * If we want to read from the child process' standard error,
1394 * create the pipe(2) now.
1395 */
1396 if (pstderr_fd) {
1397 if (pipe(stderr_fds))
1398 return -EFAULT;
1399 }
1400
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001401 /*
1402 * If we want to set up a new uid/gid mapping in the user namespace,
1403 * create the pipe(2) to sync between parent and child.
1404 */
1405 if (j->flags.userns) {
1406 if (pipe(userns_pipe_fds))
1407 return -EFAULT;
1408 }
1409
Elly Jones761b7412012-06-13 15:49:52 -04001410 /* Use sys_clone() if and only if we're creating a pid namespace.
1411 *
1412 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1413 *
1414 * In multithreaded programs, there are a bunch of locks inside libc,
1415 * some of which may be held by other threads at the time that we call
1416 * minijail_run_pid(). If we call fork(), glibc does its level best to
1417 * ensure that we hold all of these locks before it calls clone()
1418 * internally and drop them after clone() returns, but when we call
1419 * sys_clone(2) directly, all that gets bypassed and we end up with a
1420 * child address space where some of libc's important locks are held by
1421 * other threads (which did not get cloned, and hence will never release
1422 * those locks). This is okay so long as we call exec() immediately
1423 * after, but a bunch of seemingly-innocent libc functions like setenv()
1424 * take locks.
1425 *
1426 * Hence, only call sys_clone() if we need to, in order to get at pid
1427 * namespacing. If we follow this path, the child's address space might
1428 * have broken locks; you may only call functions that do not acquire
1429 * any locks.
1430 *
1431 * Unfortunately, fork() acquires every lock it can get its hands on, as
1432 * previously detailed, so this function is highly likely to deadlock
1433 * later on (see "deadlock here") if we're multithreaded.
1434 *
1435 * We might hack around this by having the clone()d child (init of the
1436 * pid namespace) return directly, rather than leaving the clone()d
1437 * process hanging around to be init for the new namespace (and having
1438 * its fork()ed child return in turn), but that process would be crippled
1439 * with its libc locks potentially broken. We might try fork()ing in the
1440 * parent before we clone() to ensure that we own all the locks, but
1441 * then we have to have the forked child hanging around consuming
1442 * resources (and possibly having file descriptors / shared memory
1443 * regions / etc attached). We'd need to keep the child around to avoid
1444 * having its children get reparented to init.
1445 *
1446 * TODO(ellyjones): figure out if the "forked child hanging around"
1447 * problem is fixable or not. It would be nice if we worked in this
1448 * case.
1449 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001450 if (pid_namespace) {
1451 int clone_flags = CLONE_NEWPID | SIGCHLD;
1452 if (j->flags.userns)
1453 clone_flags |= CLONE_NEWUSER;
1454 child_pid = syscall(SYS_clone, clone_flags, NULL);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001455 } else {
Elly Jones761b7412012-06-13 15:49:52 -04001456 child_pid = fork();
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001457 }
Elly Jones761b7412012-06-13 15:49:52 -04001458
Elly Jonese1749eb2011-10-07 13:54:59 -04001459 if (child_pid < 0) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001460 if (use_preload) {
1461 free(oldenv_copy);
1462 }
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001463 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04001464 }
Will Drewryf89aef52011-09-16 16:48:57 -05001465
Elly Jonese1749eb2011-10-07 13:54:59 -04001466 if (child_pid) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001467 if (use_preload) {
1468 /* Restore parent's LD_PRELOAD. */
1469 if (oldenv_copy) {
1470 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1471 free(oldenv_copy);
1472 } else {
1473 unsetenv(kLdPreloadEnvVar);
1474 }
1475 unsetenv(kFdEnvVar);
Elly Jonese1749eb2011-10-07 13:54:59 -04001476 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001477
Elly Jonese1749eb2011-10-07 13:54:59 -04001478 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001479
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001480 if (j->flags.pid_file)
1481 write_pid_file(j);
1482
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001483 if (j->flags.userns)
1484 write_ugid_mappings(j, userns_pipe_fds);
1485
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001486 if (use_preload) {
1487 /* Send marshalled minijail. */
1488 close(pipe_fds[0]); /* read endpoint */
1489 ret = minijail_to_fd(j, pipe_fds[1]);
1490 close(pipe_fds[1]); /* write endpoint */
1491 if (ret) {
1492 kill(j->initpid, SIGKILL);
1493 die("failed to send marshalled minijail");
1494 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001495 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001496
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001497 if (pchild_pid)
1498 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001499
1500 /*
1501 * If we want to write to the child process' standard input,
1502 * set up the write end of the pipe.
1503 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001504 if (pstdin_fd)
1505 *pstdin_fd = setup_pipe_end(stdin_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001506 1 /* write end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001507
1508 /*
1509 * If we want to read from the child process' standard output,
1510 * set up the read end of the pipe.
1511 */
1512 if (pstdout_fd)
1513 *pstdout_fd = setup_pipe_end(stdout_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001514 0 /* read end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001515
1516 /*
1517 * If we want to read from the child process' standard error,
1518 * set up the read end of the pipe.
1519 */
1520 if (pstderr_fd)
1521 *pstderr_fd = setup_pipe_end(stderr_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001522 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001523
Elly Jonese1749eb2011-10-07 13:54:59 -04001524 return 0;
1525 }
1526 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07001527
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001528 if (j->flags.userns)
1529 enter_user_namespace(j, userns_pipe_fds);
1530
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001531 /*
1532 * If we want to write to the jailed process' standard input,
1533 * set up the read end of the pipe.
1534 */
1535 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001536 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1537 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001538 die("failed to set up stdin pipe");
1539 }
1540
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001541 /*
1542 * If we want to read from the jailed process' standard output,
1543 * set up the write end of the pipe.
1544 */
1545 if (pstdout_fd) {
1546 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1547 STDOUT_FILENO) < 0)
1548 die("failed to set up stdout pipe");
1549 }
1550
1551 /*
1552 * If we want to read from the jailed process' standard error,
1553 * set up the write end of the pipe.
1554 */
1555 if (pstderr_fd) {
1556 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1557 STDERR_FILENO) < 0)
1558 die("failed to set up stderr pipe");
1559 }
1560
Dylan Reid791f5772015-09-14 20:02:42 -07001561 /* If running an init program, let it decide when/how to mount /proc. */
1562 if (pid_namespace && !do_init)
1563 j->flags.remount_proc_ro = 0;
1564
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001565 if (use_preload) {
1566 /* Strip out flags that cannot be inherited across execve(2). */
1567 minijail_preexec(j);
1568 } else {
1569 j->flags.pids = 0;
1570 }
1571 /* Jail this process, then execve() the target. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001572 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001573
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001574 if (pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001575 /*
1576 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001577 * namespace. We don't want all programs we might exec to have
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001578 * to know how to be init. Normally (do_init == 1) we fork off
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001579 * a child to actually run the program. If |do_init == 0|, we
1580 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04001581 *
1582 * If we're multithreaded, we'll probably deadlock here. See
1583 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04001584 */
1585 child_pid = fork();
1586 if (child_pid < 0)
1587 _exit(child_pid);
1588 else if (child_pid > 0)
1589 init(child_pid); /* never returns */
1590 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001591
Elly Jonesdd3e8512012-01-23 15:13:38 -05001592 /*
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001593 * If we aren't pid-namespaced, or the jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04001594 * calling process
1595 * -> execve()-ing process
1596 * If we are:
1597 * calling process
1598 * -> init()-ing process
1599 * -> execve()-ing process
1600 */
1601 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -04001602}
1603
Will Drewry6ac91122011-10-21 16:38:58 -05001604int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001605{
1606 int st;
1607 if (kill(j->initpid, SIGTERM))
1608 return -errno;
1609 if (waitpid(j->initpid, &st, 0) < 0)
1610 return -errno;
1611 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04001612}
1613
Will Drewry6ac91122011-10-21 16:38:58 -05001614int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001615{
1616 int st;
1617 if (waitpid(j->initpid, &st, 0) < 0)
1618 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001619
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001620 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001621 int error_status = st;
1622 if (WIFSIGNALED(st)) {
1623 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07001624 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001625 j->initpid, signum);
1626 /*
1627 * We return MINIJAIL_ERR_JAIL if the process received
1628 * SIGSYS, which happens when a syscall is blocked by
1629 * seccomp filters.
1630 * If not, we do what bash(1) does:
1631 * $? = 128 + signum
1632 */
1633 if (signum == SIGSYS) {
1634 error_status = MINIJAIL_ERR_JAIL;
1635 } else {
1636 error_status = 128 + signum;
1637 }
1638 }
1639 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001640 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001641
1642 int exit_status = WEXITSTATUS(st);
1643 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07001644 info("child process %d exited with status %d",
1645 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001646
1647 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04001648}
1649
Will Drewry6ac91122011-10-21 16:38:58 -05001650void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001651{
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001652 if (j->flags.seccomp_filter && j->filter_prog) {
1653 free(j->filter_prog->filter);
1654 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04001655 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001656 while (j->bindings_head) {
1657 struct binding *b = j->bindings_head;
1658 j->bindings_head = j->bindings_head->next;
1659 free(b->dest);
1660 free(b->src);
1661 free(b);
1662 }
1663 j->bindings_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001664 if (j->user)
1665 free(j->user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001666 if (j->chrootdir)
1667 free(j->chrootdir);
Elly Jonese1749eb2011-10-07 13:54:59 -04001668 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001669}