blob: bebdb8cb999063e0efb74666b8753e32e8fa220d [file] [log] [blame]
Jorge Lucangeli Obesd613ab22015-03-03 14:22:50 -08001/* Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
Elly Jonescd7a9042011-07-22 13:56:51 -04002 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07008
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08009#include <asm/unistd.h>
Will Drewry32ac9f52011-08-18 21:36:27 -050010#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040011#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070012#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <grp.h>
14#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050015#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <linux/capability.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040017#include <pwd.h>
18#include <sched.h>
19#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050020#include <stdarg.h>
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -070021#include <stdbool.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080022#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040023#include <stdio.h>
24#include <stdlib.h>
25#include <string.h>
26#include <syscall.h>
27#include <sys/capability.h>
28#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050029#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040030#include <sys/prctl.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070031#include <sys/stat.h>
32#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080033#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040034#include <sys/wait.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040035#include <unistd.h>
36
37#include "libminijail.h"
38#include "libminijail-private.h"
39
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070040#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080041#include "syscall_filter.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070042#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080043
/* Securebits constants: taken from the kernel header when it is available. */
#if defined(HAVE_SECUREBITS_H)
#include <linux/securebits.h>
#else
#define SECURE_ALL_BITS 0x15
#define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1)
#endif

/* Fallback until this is reliably available in linux/prctl.h. */
#if !defined(PR_SET_SECCOMP)
# define PR_SET_SECCOMP 22
#endif

/* Fallbacks needed for seccomp_filter using BPF. */
#if !defined(PR_SET_NO_NEW_PRIVS)
# define PR_SET_NO_NEW_PRIVS 38
#endif
#if !defined(SECCOMP_MODE_FILTER)
# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
#endif

/* SECCOMP_SOFTFAIL is 1 when built with USE_SECCOMP_SOFTFAIL, 0 otherwise. */
#if defined(USE_SECCOMP_SOFTFAIL)
# define SECCOMP_SOFTFAIL 1
#else
# define SECCOMP_SOFTFAIL 0
#endif
69
/* One requested bind mount; entries are chained into a singly linked list. */
struct binding {
	char *src;		/* source path handed to mount(2) */
	char *dest;		/* destination path, starts with '/' */
	int writeable;		/* zero: remounted read-only after binding */
	struct binding *next;	/* following entry, NULL at the tail */
};
76
/* Full description of a jail: requested features plus their parameters. */
struct minijail {
	/*
	 * WARNING: if you add a flag here you need to make sure it's
	 * accounted for in minijail_pre{enter|exec}() below.
	 *
	 * The flags are unsigned: a 1-bit plain |int| bit-field may be signed,
	 * in which case it can only hold 0 and -1 and never compares equal
	 * to 1.
	 */
	struct {
		unsigned int uid:1;
		unsigned int gid:1;
		unsigned int caps:1;
		unsigned int vfs:1;
		unsigned int enter_vfs:1;
		unsigned int pids:1;
		unsigned int net:1;
		unsigned int enter_net:1;
		unsigned int userns:1;
		unsigned int seccomp:1;
		unsigned int remount_proc_ro:1;
		unsigned int usergroups:1;
		unsigned int ptrace:1;
		unsigned int no_new_privs:1;
		unsigned int seccomp_filter:1;
		unsigned int log_seccomp_filter:1;
		unsigned int chroot:1;
		unsigned int pivot_root:1;
		unsigned int mount_tmp:1;
		unsigned int do_init:1;
		unsigned int pid_file:1;
	} flags;
	uid_t uid;
	gid_t gid;
	gid_t usergid;		/* primary gid of |user| */
	char *user;		/* heap copy of the user name, or NULL */
	uint64_t caps;		/* capability mask */
	pid_t initpid;
	int mountns_fd;		/* fd of the mount namespace to enter */
	int netns_fd;		/* fd of the network namespace to enter */
	int filter_len;		/* number of instructions in |filter_prog| */
	int binding_count;	/* number of entries in the bindings list */
	char *chrootdir;	/* shared by chroot and pivot_root */
	char *pid_file_path;
	char *uidmap;		/* newline-separated uid map entries */
	char *gidmap;		/* newline-separated gid map entries */
	struct sock_fprog *filter_prog;
	struct binding *bindings_head;
	struct binding *bindings_tail;
};
123
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700124/*
125 * Strip out flags meant for the parent.
126 * We keep things that are not inherited across execve(2) (e.g. capabilities),
127 * or are easier to set after execve(2) (e.g. seccomp filters).
128 */
129void minijail_preenter(struct minijail *j)
130{
131 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700132 j->flags.enter_vfs = 0;
Dylan Reid791f5772015-09-14 20:02:42 -0700133 j->flags.remount_proc_ro = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700134 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800135 j->flags.do_init = 0;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800136 j->flags.pid_file = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700137}
138
139/*
140 * Strip out flags meant for the child.
141 * We keep things that are inherited across execve(2).
142 */
143void minijail_preexec(struct minijail *j)
144{
145 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700146 int enter_vfs = j->flags.enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700147 int remount_proc_ro = j->flags.remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800148 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700149 if (j->user)
150 free(j->user);
151 j->user = NULL;
152 memset(&j->flags, 0, sizeof(j->flags));
153 /* Now restore anything we meant to keep. */
154 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700155 j->flags.enter_vfs = enter_vfs;
Dylan Reid791f5772015-09-14 20:02:42 -0700156 j->flags.remount_proc_ro = remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800157 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700158 /* Note, |pids| will already have been used before this call. */
159}
160
161/* Minijail API. */
162
Will Drewry6ac91122011-10-21 16:38:58 -0500163struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400164{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400165 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -0400166}
167
Will Drewry6ac91122011-10-21 16:38:58 -0500168void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400169{
170 if (uid == 0)
171 die("useless change to uid 0");
172 j->uid = uid;
173 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400174}
175
Will Drewry6ac91122011-10-21 16:38:58 -0500176void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400177{
178 if (gid == 0)
179 die("useless change to gid 0");
180 j->gid = gid;
181 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400182}
183
Will Drewry6ac91122011-10-21 16:38:58 -0500184int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400185{
186 char *buf = NULL;
187 struct passwd pw;
188 struct passwd *ppw = NULL;
189 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
190 if (sz == -1)
191 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400192
Elly Jonesdd3e8512012-01-23 15:13:38 -0500193 /*
194 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400195 * the maximum needed size of the buffer, so we don't have to search.
196 */
197 buf = malloc(sz);
198 if (!buf)
199 return -ENOMEM;
200 getpwnam_r(user, &pw, buf, sz, &ppw);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500201 /*
202 * We're safe to free the buffer here. The strings inside pw point
203 * inside buf, but we don't use any of them; this leaves the pointers
204 * dangling but it's safe. ppw points at pw if getpwnam_r succeeded.
205 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400206 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700207 /* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400208 if (!ppw)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700209 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400210 minijail_change_uid(j, ppw->pw_uid);
211 j->user = strdup(user);
212 if (!j->user)
213 return -ENOMEM;
214 j->usergid = ppw->pw_gid;
215 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400216}
217
Will Drewry6ac91122011-10-21 16:38:58 -0500218int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400219{
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -0700220 char *buf = NULL;
Yabin Cui1b21c8f2015-07-22 10:34:45 -0700221 struct group gr;
222 struct group *pgr = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400223 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
224 if (sz == -1)
225 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400226
Elly Jonesdd3e8512012-01-23 15:13:38 -0500227 /*
228 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
Elly Jonese1749eb2011-10-07 13:54:59 -0400229 * the maximum needed size of the buffer, so we don't have to search.
230 */
231 buf = malloc(sz);
232 if (!buf)
233 return -ENOMEM;
234 getgrnam_r(group, &gr, buf, sz, &pgr);
Elly Jonesdd3e8512012-01-23 15:13:38 -0500235 /*
236 * We're safe to free the buffer here. The strings inside gr point
237 * inside buf, but we don't use any of them; this leaves the pointers
238 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
239 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400240 free(buf);
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700241 /* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
Elly Jonese1749eb2011-10-07 13:54:59 -0400242 if (!pgr)
Jorge Lucangeli Obes4e480652014-03-26 10:56:42 -0700243 return -1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400244 minijail_change_gid(j, pgr->gr_gid);
245 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400246}
247
Will Drewry6ac91122011-10-21 16:38:58 -0500248void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400249{
250 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400251}
252
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700253void API minijail_no_new_privs(struct minijail *j)
254{
255 j->flags.no_new_privs = 1;
256}
257
Will Drewry6ac91122011-10-21 16:38:58 -0500258void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400259{
260 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500261}
262
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700263void API minijail_log_seccomp_filter_failures(struct minijail *j)
264{
265 j->flags.log_seccomp_filter = 1;
266}
267
Will Drewry6ac91122011-10-21 16:38:58 -0500268void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400269{
270 j->caps = capmask;
271 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400272}
273
Will Drewry6ac91122011-10-21 16:38:58 -0500274void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400275{
276 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400277}
278
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700279void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
280{
281 int ns_fd = open(ns_path, O_RDONLY);
282 if (ns_fd < 0) {
283 pdie("failed to open namespace '%s'", ns_path);
284 }
285 j->mountns_fd = ns_fd;
286 j->flags.enter_vfs = 1;
287}
288
Will Drewry6ac91122011-10-21 16:38:58 -0500289void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400290{
Elly Jonese58176c2012-01-23 11:46:17 -0500291 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700292 j->flags.remount_proc_ro = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400293 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800294 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400295}
296
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400297void API minijail_namespace_net(struct minijail *j)
298{
299 j->flags.net = 1;
300}
301
Dylan Reid1102f5a2015-09-15 11:52:20 -0700302void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
303{
304 int ns_fd = open(ns_path, O_RDONLY);
305 if (ns_fd < 0) {
306 pdie("failed to open namespace '%s'", ns_path);
307 }
308 j->netns_fd = ns_fd;
309 j->flags.enter_net = 1;
310}
311
Dylan Reid791f5772015-09-14 20:02:42 -0700312void API minijail_remount_proc_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400313{
314 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700315 j->flags.remount_proc_ro = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400316}
317
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800318void API minijail_namespace_user(struct minijail *j)
319{
320 j->flags.userns = 1;
321}
322
323int API minijail_uidmap(struct minijail *j, const char *uidmap)
324{
325 j->uidmap = strdup(uidmap);
326 if (!j->uidmap)
327 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800328 char *ch;
329 for (ch = j->uidmap; *ch; ch++) {
330 if (*ch == ',')
331 *ch = '\n';
332 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800333 return 0;
334}
335
336int API minijail_gidmap(struct minijail *j, const char *gidmap)
337{
338 j->gidmap = strdup(gidmap);
339 if (!j->gidmap)
340 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800341 char *ch;
342 for (ch = j->gidmap; *ch; ch++) {
343 if (*ch == ',')
344 *ch = '\n';
345 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800346 return 0;
347}
348
Will Drewry6ac91122011-10-21 16:38:58 -0500349void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400350{
351 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400352}
353
Will Drewry6ac91122011-10-21 16:38:58 -0500354void API minijail_disable_ptrace(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400355{
356 j->flags.ptrace = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400357}
358
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800359void API minijail_run_as_init(struct minijail *j)
360{
361 /*
362 * Since the jailed program will become 'init' in the new PID namespace,
363 * Minijail does not need to fork an 'init' process.
364 */
365 j->flags.do_init = 0;
366}
367
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700368int API minijail_enter_chroot(struct minijail *j, const char *dir)
369{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400370 if (j->chrootdir)
371 return -EINVAL;
372 j->chrootdir = strdup(dir);
373 if (!j->chrootdir)
374 return -ENOMEM;
375 j->flags.chroot = 1;
376 return 0;
377}
378
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800379int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
380{
381 if (j->chrootdir)
382 return -EINVAL;
383 j->chrootdir = strdup(dir);
384 if (!j->chrootdir)
385 return -ENOMEM;
386 j->flags.pivot_root = 1;
387 return 0;
388}
389
Dylan Reid08946cc2015-09-16 19:10:57 -0700390char *minijail_get_original_path(struct minijail *j, const char *chroot_path)
391{
392 char *external_path;
393 size_t pathlen;
394
395 if (!j->chrootdir)
396 return strdup(chroot_path);
397
398 /* One extra char for '/' and one for '\0', hence + 2. */
399 pathlen = strlen(chroot_path) + strlen(j->chrootdir) + 2;
400 external_path = malloc(pathlen);
401 snprintf(external_path, pathlen, "%s/%s", j->chrootdir, chroot_path);
402
403 return external_path;
404}
405
Lee Campbell11af0622014-05-22 12:36:04 -0700406void API minijail_mount_tmp(struct minijail *j)
407{
408 j->flags.mount_tmp = 1;
409}
410
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800411int API minijail_write_pid_file(struct minijail *j, const char *path)
412{
413 j->pid_file_path = strdup(path);
414 if (!j->pid_file_path)
415 return -ENOMEM;
416 j->flags.pid_file = 1;
417 return 0;
418}
419
Will Drewry6ac91122011-10-21 16:38:58 -0500420int API minijail_bind(struct minijail *j, const char *src, const char *dest,
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700421 int writeable)
422{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400423 struct binding *b;
424
425 if (*dest != '/')
426 return -EINVAL;
427 b = calloc(1, sizeof(*b));
428 if (!b)
429 return -ENOMEM;
430 b->dest = strdup(dest);
431 if (!b->dest)
432 goto error;
433 b->src = strdup(src);
434 if (!b->src)
435 goto error;
436 b->writeable = writeable;
437
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700438 info("bind %s -> %s", src, dest);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400439
Elly Jonesdd3e8512012-01-23 15:13:38 -0500440 /*
441 * Force vfs namespacing so the bind mounts don't leak out into the
Elly Jones51a5b6c2011-10-12 19:09:26 -0400442 * containing vfs namespace.
443 */
444 minijail_namespace_vfs(j);
445
446 if (j->bindings_tail)
447 j->bindings_tail->next = b;
448 else
449 j->bindings_head = b;
450 j->bindings_tail = b;
451 j->binding_count++;
452
453 return 0;
454
455error:
456 free(b->src);
457 free(b->dest);
458 free(b);
459 return -ENOMEM;
460}
461
Dylan Reid08946cc2015-09-16 19:10:57 -0700462int API minijail_has_bind_mounts(const struct minijail *j)
463{
464 return j->bindings_head != NULL;
465}
466
Will Drewry6ac91122011-10-21 16:38:58 -0500467void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
Elly Jonese1749eb2011-10-07 13:54:59 -0400468{
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700469 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL)) {
470 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
471 warn("not loading seccomp filter, seccomp not supported");
472 return;
473 }
474 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400475 FILE *file = fopen(path, "r");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800476 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -0700477 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -0400478 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800479
480 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700481 if (compile_filter(file, fprog, j->flags.log_seccomp_filter)) {
482 die("failed to compile seccomp filter BPF program in '%s'",
483 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800484 }
485
486 j->filter_len = fprog->len;
487 j->filter_prog = fprog;
488
Elly Jonese1749eb2011-10-07 13:54:59 -0400489 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500490}
491
/* Cursor used while serializing a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, copied or not */
	char *buf;		/* next write position */
};
497
Will Drewry6ac91122011-10-21 16:38:58 -0500498void marshal_state_init(struct marshal_state *state,
499 char *buf, size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -0400500{
501 state->available = available;
502 state->buf = buf;
503 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500504}
505
Will Drewry6ac91122011-10-21 16:38:58 -0500506void marshal_append(struct marshal_state *state,
507 char *src, size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -0400508{
509 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500510
Elly Jonese1749eb2011-10-07 13:54:59 -0400511 /* Up to |available| will be written. */
512 if (copy_len) {
513 memcpy(state->buf, src, copy_len);
514 state->buf += copy_len;
515 state->available -= copy_len;
516 }
517 /* |total| will contain the expected length. */
518 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500519}
520
Will Drewry6ac91122011-10-21 16:38:58 -0500521void minijail_marshal_helper(struct marshal_state *state,
522 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400523{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400524 struct binding *b = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400525 marshal_append(state, (char *)j, sizeof(*j));
526 if (j->user)
527 marshal_append(state, j->user, strlen(j->user) + 1);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400528 if (j->chrootdir)
529 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800530 if (j->flags.seccomp_filter && j->filter_prog) {
531 struct sock_fprog *fp = j->filter_prog;
532 marshal_append(state, (char *)fp->filter,
533 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -0400534 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400535 for (b = j->bindings_head; b; b = b->next) {
536 marshal_append(state, b->src, strlen(b->src) + 1);
537 marshal_append(state, b->dest, strlen(b->dest) + 1);
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700538 marshal_append(state, (char *)&b->writeable,
539 sizeof(b->writeable));
Elly Jones51a5b6c2011-10-12 19:09:26 -0400540 }
Will Drewryf89aef52011-09-16 16:48:57 -0500541}
542
Will Drewry6ac91122011-10-21 16:38:58 -0500543size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400544{
545 struct marshal_state state;
546 marshal_state_init(&state, NULL, 0);
547 minijail_marshal_helper(&state, j);
548 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500549}
550
Elly Jonese1749eb2011-10-07 13:54:59 -0400551int minijail_marshal(const struct minijail *j, char *buf, size_t available)
552{
553 struct marshal_state state;
554 marshal_state_init(&state, buf, available);
555 minijail_marshal_helper(&state, j);
556 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500557}
558
/* consumebytes: takes @length bytes off the front of a buffer
 * @length    Number of bytes to take
 * @buf       Cursor into the buffer; advanced past the bytes on success
 * @buflength Bytes left at @buf; reduced by @length on success
 *
 * Returns a pointer to the first taken byte, or NULL (leaving @buf and
 * @buflength untouched) when fewer than @length bytes are left.
 */
void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *base;

	if (*buflength < length)
		return NULL;

	base = *buf;
	*buf = base + length;
	*buflength -= length;
	return base;
}
575
/* consumestr: takes one C string off the front of a buffer
 * @buf       Cursor into the buffer; advanced past the terminator on success
 * @buflength Bytes left at @buf; reduced accordingly on success
 *
 * Returns a pointer to the start of the string, or NULL when the remaining
 * bytes hold no null-terminator.
 */
char *consumestr(char **buf, size_t *buflength)
{
	const char *terminator = memchr(*buf, '\0', *buflength);

	if (terminator == NULL)
		/* There's no null-terminator. */
		return NULL;

	/* Take the string together with its terminator. */
	return consumebytes((size_t)(terminator - *buf) + 1, buf, buflength);
}
590
Elly Jonese1749eb2011-10-07 13:54:59 -0400591int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
592{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400593 int i;
594 int count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500595 int ret = -EINVAL;
596
Elly Jonese1749eb2011-10-07 13:54:59 -0400597 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500598 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400599 memcpy((void *)j, serialized, sizeof(*j));
600 serialized += sizeof(*j);
601 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500602
Will Drewrybee7ba72011-10-21 20:47:01 -0500603 /* Potentially stale pointers not used as signals. */
604 j->bindings_head = NULL;
605 j->bindings_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800606 j->filter_prog = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -0500607
Elly Jonese1749eb2011-10-07 13:54:59 -0400608 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400609 char *user = consumestr(&serialized, &length);
610 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500611 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400612 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500613 if (!j->user)
614 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400615 }
Will Drewryf89aef52011-09-16 16:48:57 -0500616
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400617 if (j->chrootdir) { /* stale pointer */
618 char *chrootdir = consumestr(&serialized, &length);
619 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500620 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400621 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500622 if (!j->chrootdir)
623 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400624 }
625
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800626 if (j->flags.seccomp_filter && j->filter_len > 0) {
627 size_t ninstrs = j->filter_len;
628 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
629 ninstrs > USHRT_MAX)
630 goto bad_filters;
631
632 size_t program_len = ninstrs * sizeof(struct sock_filter);
633 void *program = consumebytes(program_len, &serialized, &length);
634 if (!program)
635 goto bad_filters;
636
637 j->filter_prog = malloc(sizeof(struct sock_fprog));
638 j->filter_prog->len = ninstrs;
639 j->filter_prog->filter = malloc(program_len);
640 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -0400641 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400642
643 count = j->binding_count;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400644 j->binding_count = 0;
645 for (i = 0; i < count; ++i) {
646 int *writeable;
647 const char *dest;
648 const char *src = consumestr(&serialized, &length);
649 if (!src)
Will Drewrybee7ba72011-10-21 20:47:01 -0500650 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400651 dest = consumestr(&serialized, &length);
652 if (!dest)
Will Drewrybee7ba72011-10-21 20:47:01 -0500653 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400654 writeable = consumebytes(sizeof(*writeable), &serialized, &length);
655 if (!writeable)
Will Drewrybee7ba72011-10-21 20:47:01 -0500656 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400657 if (minijail_bind(j, src, dest, *writeable))
Will Drewrybee7ba72011-10-21 20:47:01 -0500658 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400659 }
660
Elly Jonese1749eb2011-10-07 13:54:59 -0400661 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500662
663bad_bindings:
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800664 if (j->flags.seccomp_filter && j->filter_len > 0) {
665 free(j->filter_prog->filter);
666 free(j->filter_prog);
667 }
Will Drewrybee7ba72011-10-21 20:47:01 -0500668bad_filters:
669 if (j->chrootdir)
670 free(j->chrootdir);
671bad_chrootdir:
672 if (j->user)
673 free(j->user);
674clear_pointers:
675 j->user = NULL;
676 j->chrootdir = NULL;
677out:
678 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500679}
680
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800681static void write_ugid_mappings(const struct minijail *j, int *pipe_fds)
682{
683 int fd, ret, len;
684 size_t sz;
685 char fname[32];
686 close(pipe_fds[0]);
687
688 sz = sizeof(fname);
689 if (j->uidmap) {
690 ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700691 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800692 die("failed to write file name of uid_map");
693 fd = open(fname, O_WRONLY);
694 if (fd < 0)
695 pdie("failed to open '%s'", fname);
696 len = strlen(j->uidmap);
697 if (write(fd, j->uidmap, len) < len)
698 die("failed to set uid_map");
699 close(fd);
700 }
701 if (j->gidmap) {
702 ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700703 if (ret < 0 || (size_t)ret >= sz)
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800704 die("failed to write file name of gid_map");
705 fd = open(fname, O_WRONLY);
706 if (fd < 0)
707 pdie("failed to open '%s'", fname);
708 len = strlen(j->gidmap);
709 if (write(fd, j->gidmap, len) < len)
710 die("failed to set gid_map");
711 close(fd);
712 }
713
714 close(pipe_fds[1]);
715}
716
717static void enter_user_namespace(const struct minijail *j, int *pipe_fds)
718{
719 char buf;
720
721 close(pipe_fds[1]);
722
723 /* Wait for parent to set up uid/gid mappings. */
724 if (read(pipe_fds[0], &buf, 1) != 0)
725 die("failed to sync with parent");
726 close(pipe_fds[0]);
727
728 if (j->uidmap && setresuid(0, 0, 0))
729 pdie("setresuid");
730 if (j->gidmap && setresgid(0, 0, 0))
731 pdie("setresgid");
732}
733
Elly Jones51a5b6c2011-10-12 19:09:26 -0400734/* bind_one: Applies bindings from @b for @j, recursing as needed.
735 * @j Minijail these bindings are for
736 * @b Head of list of bindings
737 *
738 * Returns 0 for success.
739 */
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700740int bind_one(const struct minijail *j, struct binding *b)
741{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400742 int ret = 0;
743 char *dest = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400744 if (ret)
745 return ret;
746 /* dest has a leading "/" */
747 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
748 return -ENOMEM;
Elly Jonesa1059632011-12-15 15:17:07 -0500749 ret = mount(b->src, dest, NULL, MS_BIND, NULL);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400750 if (ret)
751 pdie("bind: %s -> %s", b->src, dest);
Elly Jonesa1059632011-12-15 15:17:07 -0500752 if (!b->writeable) {
753 ret = mount(b->src, dest, NULL,
Jorge Lucangeli Obes2f61ee42014-06-16 11:08:18 -0700754 MS_BIND | MS_REMOUNT | MS_RDONLY, NULL);
Elly Jonesa1059632011-12-15 15:17:07 -0500755 if (ret)
756 pdie("bind ro: %s -> %s", b->src, dest);
757 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400758 free(dest);
759 if (b->next)
760 return bind_one(j, b->next);
761 return ret;
762}
763
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700764int enter_chroot(const struct minijail *j)
765{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400766 int ret;
767 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
768 return ret;
769
770 if (chroot(j->chrootdir))
771 return -errno;
772
773 if (chdir("/"))
774 return -errno;
775
776 return 0;
777}
778
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800779int enter_pivot_root(const struct minijail *j)
780{
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800781 int ret, oldroot, newroot;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800782 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
783 return ret;
784
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800785 /* Keep the fd for both old and new root. It will be used in fchdir later. */
786 oldroot = open("/", O_DIRECTORY | O_RDONLY);
787 if (oldroot < 0)
788 pdie("failed to open / for fchdir");
789 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY);
790 if (newroot < 0)
791 pdie("failed to open %s for fchdir", j->chrootdir);
792
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800793 /* To ensure chrootdir is the root of a file system, do a self bind mount. */
794 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
795 pdie("failed to bind mount '%s'", j->chrootdir);
796 if (chdir(j->chrootdir))
797 return -errno;
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800798 if (syscall(SYS_pivot_root, ".", "."))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800799 pdie("pivot_root");
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800800
801 /*
802 * Now the old root is mounted on top of the new root. Use fchdir to
803 * change to the old root and unmount it.
804 */
805 if (fchdir(oldroot))
806 pdie("failed to fchdir to old /");
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800807 /* The old root might be busy, so use lazy unmount. */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +0800808 if (umount2(".", MNT_DETACH))
809 pdie("umount(/)");
810 /* Change back to the new root. */
811 if (fchdir(newroot))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800812 return -errno;
813 if (chroot("/"))
814 return -errno;
Jorge Lucangeli Obes46a55092015-10-12 15:31:59 -0700815 /* Set correct CWD for getcwd(3). */
816 if (chdir("/"))
817 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800818
819 return 0;
820}
821
/*
 * mount_tmp: Mounts a fresh 64M tmpfs on /tmp.
 *
 * The directory is world-writable with the sticky bit set (mode 1777), so
 * users cannot remove or rename files they do not own.
 *
 * Returns the result of mount(2): 0 on success, -1 with errno set on error.
 */
int mount_tmp(void)
{
	return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=1777");
}
826
Dylan Reid791f5772015-09-14 20:02:42 -0700827int remount_proc_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400828{
829 const char *kProcPath = "/proc";
830 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -0500831 /*
832 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -0400833 * /proc in our namespace, which means using MS_REMOUNT here would
834 * mutate our parent's mount as well, even though we're in a VFS
835 * namespace (!). Instead, remove their mount from our namespace
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800836 * and make our own. However, if we are in a new user namespace, /proc
837 * is not seen as mounted, so don't return error if umount() fails.
Elly Jonese1749eb2011-10-07 13:54:59 -0400838 */
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -0700839 if (umount2(kProcPath, MNT_DETACH) && !j->flags.userns)
Elly Jonese1749eb2011-10-07 13:54:59 -0400840 return -errno;
841 if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
842 return -errno;
843 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400844}
845
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800846static void write_pid_file(const struct minijail *j)
847{
848 FILE *fp = fopen(j->pid_file_path, "w");
849
850 if (!fp)
851 pdie("failed to open '%s'", j->pid_file_path);
852 if (fprintf(fp, "%d\n", (int)j->initpid) < 0)
853 pdie("fprintf(%s)", j->pid_file_path);
854 if (fclose(fp))
855 pdie("fclose(%s)", j->pid_file_path);
856}
857
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700858void drop_ugid(const struct minijail *j)
859{
860 if (j->flags.usergroups) {
861 if (initgroups(j->user, j->usergid))
862 pdie("initgroups");
863 } else {
864 /* Only attempt to clear supplemental groups if we are changing
865 * users. */
866 if ((j->uid || j->gid) && setgroups(0, NULL))
867 pdie("setgroups");
868 }
869
870 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
871 pdie("setresgid");
872
873 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
874 pdie("setresuid");
875}
876
/*
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel). So suck up the answer via /proc.
 *
 * Returns the number read from /proc/sys/kernel/cap_last_cap. Failure to
 * open or parse the file is reported through pdie().
 */
static unsigned int get_last_valid_cap(void)
{
	const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
	FILE *fp = fopen(cap_file, "re");
	unsigned int last_valid_cap;

	/* /proc may be missing or unreadable; never hand NULL to fscanf(). */
	if (!fp)
		pdie("fopen(%s)", cap_file);

	if (fscanf(fp, "%u", &last_valid_cap) != 1)
		pdie("fscanf(%s)", cap_file);
	fclose(fp);

	return last_valid_cap;
}
896
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700897void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
Elly Jonese1749eb2011-10-07 13:54:59 -0400898{
899 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -0800900 cap_value_t flag[1];
Kees Cooke5609ac2013-02-06 14:12:41 -0800901 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400902 unsigned int i;
903 if (!caps)
904 die("can't get process caps");
905 if (cap_clear_flag(caps, CAP_INHERITABLE))
906 die("can't clear inheritable caps");
907 if (cap_clear_flag(caps, CAP_EFFECTIVE))
908 die("can't clear effective caps");
909 if (cap_clear_flag(caps, CAP_PERMITTED))
910 die("can't clear permitted caps");
Dylan Reidf682d472015-09-17 21:39:07 -0700911 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cook323878a2013-02-05 15:35:24 -0800912 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800913 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -0400914 continue;
Kees Cook323878a2013-02-05 15:35:24 -0800915 flag[0] = i;
916 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400917 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -0800918 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400919 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -0800920 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -0400921 die("can't add inheritable cap");
922 }
923 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -0800924 die("can't apply initial cleaned capset");
925
926 /*
927 * Instead of dropping bounding set first, do it here in case
928 * the caller had a more permissive bounding set which could
929 * have been used above to raise a capability that wasn't already
930 * present. This requires CAP_SETPCAP, so we raised/kept it above.
931 */
Dylan Reidf682d472015-09-17 21:39:07 -0700932 for (i = 0; i < sizeof(j->caps) * 8 && i <= last_valid_cap; ++i) {
Kees Cooke5609ac2013-02-06 14:12:41 -0800933 if (j->caps & (one << i))
Elly Jonese1749eb2011-10-07 13:54:59 -0400934 continue;
935 if (prctl(PR_CAPBSET_DROP, i))
936 pdie("prctl(PR_CAPBSET_DROP)");
937 }
Kees Cook323878a2013-02-05 15:35:24 -0800938
939 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -0800940 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -0800941 flag[0] = CAP_SETPCAP;
942 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
943 die("can't clear effective cap");
944 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
945 die("can't clear permitted cap");
946 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
947 die("can't clear inheritable cap");
948 }
949
950 if (cap_set_proc(caps))
951 die("can't apply final cleaned capset");
952
953 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -0400954}
955
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700956void set_seccomp_filter(const struct minijail *j)
957{
958 /*
959 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
960 * in the kernel source tree for an explanation of the parameters.
961 */
962 if (j->flags.no_new_privs) {
963 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
964 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
965 }
966
967 /*
968 * If we're logging seccomp filter failures,
969 * install the SIGSYS handler first.
970 */
971 if (j->flags.seccomp_filter && j->flags.log_seccomp_filter) {
972 if (install_sigsys_handler())
973 pdie("install SIGSYS handler");
974 warn("logging seccomp filter failures");
975 }
976
977 /*
978 * Install the syscall filter.
979 */
980 if (j->flags.seccomp_filter) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700981 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) {
982 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
983 warn("seccomp not supported");
984 return;
985 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700986 pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -0700987 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -0700988 }
989}
990
Will Drewry6ac91122011-10-21 16:38:58 -0500991void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400992{
Dylan Reidf682d472015-09-17 21:39:07 -0700993 /*
994 * Get the last valid cap from /proc, since /proc can be unmounted
995 * before drop_caps().
996 */
Jorge Lucangeli Obes20342742015-10-27 11:39:59 -0700997 unsigned int last_valid_cap = get_last_valid_cap();
Dylan Reidf682d472015-09-17 21:39:07 -0700998
Elly Jonese1749eb2011-10-07 13:54:59 -0400999 if (j->flags.pids)
1000 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001001 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -04001002
Elly Jonese1749eb2011-10-07 13:54:59 -04001003 if (j->flags.usergroups && !j->user)
1004 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -04001005
Elly Jonesdd3e8512012-01-23 15:13:38 -05001006 /*
1007 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -04001008 * so we don't even try. If any of our operations fail, we abort() the
1009 * entire process.
1010 */
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07001011 if (j->flags.enter_vfs && setns(j->mountns_fd, CLONE_NEWNS))
1012 pdie("setns(CLONE_NEWNS)");
1013
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07001014 if (j->flags.vfs) {
1015 if (unshare(CLONE_NEWNS))
1016 pdie("unshare(vfs)");
1017 /*
1018 * Remount all filesystems as private. If they are shared
1019 * new bind mounts will creep out of our namespace.
1020 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
1021 */
1022 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
1023 pdie("mount(/, private)");
1024 }
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001025
Dylan Reid1102f5a2015-09-15 11:52:20 -07001026 if (j->flags.enter_net) {
1027 if (setns(j->netns_fd, CLONE_NEWNET))
1028 pdie("setns(CLONE_NEWNET)");
1029 } else if (j->flags.net && unshare(CLONE_NEWNET)) {
Elly Fong-Jones6c086302013-03-20 17:15:28 -04001030 pdie("unshare(net)");
Dylan Reid1102f5a2015-09-15 11:52:20 -07001031 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001032
Elly Jones51a5b6c2011-10-12 19:09:26 -04001033 if (j->flags.chroot && enter_chroot(j))
1034 pdie("chroot");
1035
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08001036 if (j->flags.pivot_root && enter_pivot_root(j))
1037 pdie("pivot_root");
1038
Jorge Lucangeli Obes3901da62015-03-03 13:55:11 -08001039 if (j->flags.mount_tmp && mount_tmp())
Lee Campbell11af0622014-05-22 12:36:04 -07001040 pdie("mount_tmp");
1041
Dylan Reid791f5772015-09-14 20:02:42 -07001042 if (j->flags.remount_proc_ro && remount_proc_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -04001043 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -04001044
Elly Jonese1749eb2011-10-07 13:54:59 -04001045 if (j->flags.caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001046 /*
1047 * POSIX capabilities are a bit tricky. If we drop our
Elly Jonese1749eb2011-10-07 13:54:59 -04001048 * capability to change uids, our attempt to use setuid()
1049 * below will fail. Hang on to root caps across setuid(), then
1050 * lock securebits.
1051 */
1052 if (prctl(PR_SET_KEEPCAPS, 1))
1053 pdie("prctl(PR_SET_KEEPCAPS)");
1054 if (prctl
1055 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
1056 pdie("prctl(PR_SET_SECUREBITS)");
1057 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001058
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001059 /*
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001060 * If we're setting no_new_privs, we can drop privileges
1061 * before setting seccomp filter. This way filter policies
1062 * don't need to allow privilege-dropping syscalls.
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001063 */
1064 if (j->flags.no_new_privs) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001065 drop_ugid(j);
1066 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001067 drop_caps(j, last_valid_cap);
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001068
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001069 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04001070 } else {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07001071 /*
1072 * If we're not setting no_new_privs,
1073 * we need to set seccomp filter *before* dropping privileges.
1074 * WARNING: this means that filter policies *must* allow
1075 * setgroups()/setresgid()/setresuid() for dropping root and
1076 * capget()/capset()/prctl() for dropping caps.
1077 */
1078 set_seccomp_filter(j);
1079
1080 drop_ugid(j);
1081 if (j->flags.caps)
Dylan Reidf682d472015-09-17 21:39:07 -07001082 drop_caps(j, last_valid_cap);
Elly Jonese1749eb2011-10-07 13:54:59 -04001083 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001084
Elly Jonesdd3e8512012-01-23 15:13:38 -05001085 /*
1086 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -04001087 * privilege-dropping syscalls :)
1088 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001089 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
1090 if ((errno == ENOSYS) && SECCOMP_SOFTFAIL) {
1091 warn("seccomp not supported");
1092 return;
1093 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001094 pdie("prctl(PR_SET_SECCOMP)");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001095 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001096}
1097
/* TODO(wad) will visibility affect this variable? */
/*
 * Wait status of the root process as recorded by init(). init_term() passes
 * it to _exit() unchanged.
 */
static int init_exitstatus = 0;

/*
 * init_term: SIGTERM handler installed by init(). Exits immediately with
 * the recorded init_exitstatus. The signal number is not used.
 */
void init_term(int sig)
{
	(void)sig;
	_exit(init_exitstatus);
}
1105
Will Drewry6ac91122011-10-21 16:38:58 -05001106int init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -04001107{
1108 pid_t pid;
1109 int status;
1110 /* so that we exit with the right status */
1111 signal(SIGTERM, init_term);
1112 /* TODO(wad) self jail with seccomp_filters here. */
1113 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001114 /*
1115 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -04001116 * left inside our pid namespace or we get a signal.
1117 */
1118 if (pid == rootpid)
1119 init_exitstatus = status;
1120 }
1121 if (!WIFEXITED(init_exitstatus))
1122 _exit(MINIJAIL_ERR_INIT);
1123 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -04001124}
1125
Will Drewry6ac91122011-10-21 16:38:58 -05001126int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001127{
1128 size_t sz = 0;
1129 size_t bytes = read(fd, &sz, sizeof(sz));
1130 char *buf;
1131 int r;
1132 if (sizeof(sz) != bytes)
1133 return -EINVAL;
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001134 if (sz > USHRT_MAX) /* arbitrary sanity check */
Elly Jonese1749eb2011-10-07 13:54:59 -04001135 return -E2BIG;
1136 buf = malloc(sz);
1137 if (!buf)
1138 return -ENOMEM;
1139 bytes = read(fd, buf, sz);
1140 if (bytes != sz) {
1141 free(buf);
1142 return -EINVAL;
1143 }
1144 r = minijail_unmarshal(j, buf, sz);
1145 free(buf);
1146 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001147}
1148
Will Drewry6ac91122011-10-21 16:38:58 -05001149int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04001150{
1151 char *buf;
1152 size_t sz = minijail_size(j);
1153 ssize_t written;
1154 int r;
Elly Jonescd7a9042011-07-22 13:56:51 -04001155
Elly Jonese1749eb2011-10-07 13:54:59 -04001156 if (!sz)
1157 return -EINVAL;
1158 buf = malloc(sz);
1159 r = minijail_marshal(j, buf, sz);
1160 if (r) {
1161 free(buf);
1162 return r;
1163 }
1164 /* Sends [size][minijail]. */
1165 written = write(fd, &sz, sizeof(sz));
1166 if (written != sizeof(sz)) {
1167 free(buf);
1168 return -EFAULT;
1169 }
1170 written = write(fd, buf, sz);
1171 if (written < 0 || (size_t) written != sz) {
1172 free(buf);
1173 return -EFAULT;
1174 }
1175 free(buf);
1176 return 0;
Will Drewry2f54b6a2011-09-16 13:45:31 -05001177}
Elly Jonescd7a9042011-07-22 13:56:51 -04001178
Will Drewry6ac91122011-10-21 16:38:58 -05001179int setup_preload(void)
Elly Jonese1749eb2011-10-07 13:54:59 -04001180{
Daniel Erat5b7a3182015-08-19 16:06:22 -06001181#if defined(__ANDROID__)
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001182 /* Don't use LDPRELOAD on Brillo. */
1183 return 0;
1184#else
Elly Jonese1749eb2011-10-07 13:54:59 -04001185 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
1186 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
1187 if (!newenv)
1188 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -04001189
Elly Jonese1749eb2011-10-07 13:54:59 -04001190 /* Only insert a separating space if we have something to separate... */
1191 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
1192 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -04001193
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001194 /* setenv() makes a copy of the string we give it. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001195 setenv(kLdPreloadEnvVar, newenv, 1);
1196 free(newenv);
1197 return 0;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07001198#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04001199}
1200
Will Drewry6ac91122011-10-21 16:38:58 -05001201int setup_pipe(int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04001202{
1203 int r = pipe(fds);
1204 char fd_buf[11];
1205 if (r)
1206 return r;
1207 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
1208 if (r <= 0)
1209 return -EINVAL;
1210 setenv(kFdEnvVar, fd_buf, 1);
1211 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -05001212}
1213
/*
 * setup_pipe_end: Keeps one end of a pipe and closes the other.
 * @fds   The two descriptors of the pipe
 * @index Which end to keep: 0 or 1
 *
 * Returns the kept descriptor fds[@index], or -1 if @index is neither 0
 * nor 1 (in which case nothing is closed).
 */
int setup_pipe_end(int fds[2], size_t index)
{
	int kept;

	if (index != 0 && index != 1)
		return -1;

	kept = fds[index];
	close(fds[index == 0 ? 1 : 0]);
	return kept;
}
1222
/*
 * setup_and_dupe_pipe_end: Closes one end of a pipe and duplicates the
 * other end onto a given descriptor.
 * @fds   The two descriptors of the pipe
 * @index Which end to duplicate: 0 or 1 (the other one is closed)
 * @fd    Descriptor number that should refer to the kept end
 *
 * Returns the result of dup2(2), or -1 if @index is neither 0 nor 1 (in
 * which case nothing is closed or duplicated).
 */
int setup_and_dupe_pipe_end(int fds[2], size_t index, int fd)
{
	size_t other;

	if (index >= 2)
		return -1;

	other = (index == 0) ? 1 : 0;
	close(fds[other]);

	/* Make |fd| refer to the end of the pipe that was kept. */
	return dup2(fds[index], fd);
}
1232
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001233int minijail_run_internal(struct minijail *j, const char *filename,
1234 char *const argv[], pid_t *pchild_pid,
1235 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1236 int use_preload);
1237
Will Drewry6ac91122011-10-21 16:38:58 -05001238int API minijail_run(struct minijail *j, const char *filename,
1239 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04001240{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001241 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1242 true);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001243}
1244
1245int API minijail_run_pid(struct minijail *j, const char *filename,
1246 char *const argv[], pid_t *pchild_pid)
1247{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001248 return minijail_run_internal(j, filename, argv, pchild_pid,
1249 NULL, NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001250}
1251
1252int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07001253 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001254{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001255 return minijail_run_internal(j, filename, argv, NULL, pstdin_fd,
1256 NULL, NULL, true);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001257}
1258
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001259int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07001260 char *const argv[], pid_t *pchild_pid,
1261 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001262{
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001263 return minijail_run_internal(j, filename, argv, pchild_pid,
1264 pstdin_fd, pstdout_fd, pstderr_fd, true);
1265}
1266
1267int API minijail_run_no_preload(struct minijail *j, const char *filename,
1268 char *const argv[])
1269{
1270 return minijail_run_internal(j, filename, argv, NULL, NULL, NULL, NULL,
1271 false);
1272}
1273
Samuel Tan63187f42015-10-16 13:01:53 -07001274int API minijail_run_pid_pipes_no_preload(struct minijail *j,
1275 const char *filename, char *const argv[],
1276 pid_t *pchild_pid,
1277 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd) {
1278 return minijail_run_internal(j, filename, argv, pchild_pid,
1279 pstdin_fd, pstdout_fd, pstderr_fd, false);
1280}
1281
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001282int minijail_run_internal(struct minijail *j, const char *filename,
1283 char *const argv[], pid_t *pchild_pid,
1284 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd,
1285 int use_preload)
1286{
Elly Jonese1749eb2011-10-07 13:54:59 -04001287 char *oldenv, *oldenv_copy = NULL;
1288 pid_t child_pid;
1289 int pipe_fds[2];
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001290 int stdin_fds[2];
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001291 int stdout_fds[2];
1292 int stderr_fds[2];
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001293 int userns_pipe_fds[2];
Elly Jonese1749eb2011-10-07 13:54:59 -04001294 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04001295 /* We need to remember this across the minijail_preexec() call. */
1296 int pid_namespace = j->flags.pids;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001297 int do_init = j->flags.do_init;
Ben Chan541c7e52011-08-26 14:55:53 -07001298
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001299 if (use_preload) {
1300 oldenv = getenv(kLdPreloadEnvVar);
1301 if (oldenv) {
1302 oldenv_copy = strdup(oldenv);
1303 if (!oldenv_copy)
1304 return -ENOMEM;
1305 }
1306
1307 if (setup_preload())
1308 return -EFAULT;
Elly Jonese1749eb2011-10-07 13:54:59 -04001309 }
Will Drewryf89aef52011-09-16 16:48:57 -05001310
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001311 if (!use_preload) {
1312 if (j->flags.caps)
1313 die("Capabilities are not supported without "
1314 "LD_PRELOAD");
1315 }
Will Drewry2f54b6a2011-09-16 13:45:31 -05001316
Elly Jonesdd3e8512012-01-23 15:13:38 -05001317 /*
Jorge Lucangeli Obes3c84df12015-05-14 17:37:58 -07001318 * Make the process group ID of this process equal to its PID, so that
1319 * both the Minijail process and the jailed process can be killed
1320 * together.
1321 * Don't fail on EPERM, since setpgid(0, 0) can only EPERM when
1322 * the process is already a process group leader.
1323 */
1324 if (setpgid(0 /* use calling PID */, 0 /* make PGID = PID */)) {
1325 if (errno != EPERM) {
1326 pdie("setpgid(0, 0)");
1327 }
1328 }
1329
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001330 if (use_preload) {
1331 /*
1332 * Before we fork(2) and execve(2) the child process, we need
1333 * to open a pipe(2) to send the minijail configuration over.
1334 */
1335 if (setup_pipe(pipe_fds))
1336 return -EFAULT;
1337 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001338
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001339 /*
1340 * If we want to write to the child process' standard input,
1341 * create the pipe(2) now.
1342 */
1343 if (pstdin_fd) {
1344 if (pipe(stdin_fds))
1345 return -EFAULT;
1346 }
1347
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001348 /*
1349 * If we want to read from the child process' standard output,
1350 * create the pipe(2) now.
1351 */
1352 if (pstdout_fd) {
1353 if (pipe(stdout_fds))
1354 return -EFAULT;
1355 }
1356
1357 /*
1358 * If we want to read from the child process' standard error,
1359 * create the pipe(2) now.
1360 */
1361 if (pstderr_fd) {
1362 if (pipe(stderr_fds))
1363 return -EFAULT;
1364 }
1365
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001366 /*
1367 * If we want to set up a new uid/gid mapping in the user namespace,
1368 * create the pipe(2) to sync between parent and child.
1369 */
1370 if (j->flags.userns) {
1371 if (pipe(userns_pipe_fds))
1372 return -EFAULT;
1373 }
1374
Elly Jones761b7412012-06-13 15:49:52 -04001375 /* Use sys_clone() if and only if we're creating a pid namespace.
1376 *
1377 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
1378 *
1379 * In multithreaded programs, there are a bunch of locks inside libc,
1380 * some of which may be held by other threads at the time that we call
1381 * minijail_run_pid(). If we call fork(), glibc does its level best to
1382 * ensure that we hold all of these locks before it calls clone()
1383 * internally and drop them after clone() returns, but when we call
1384 * sys_clone(2) directly, all that gets bypassed and we end up with a
1385 * child address space where some of libc's important locks are held by
1386 * other threads (which did not get cloned, and hence will never release
1387 * those locks). This is okay so long as we call exec() immediately
1388 * after, but a bunch of seemingly-innocent libc functions like setenv()
1389 * take locks.
1390 *
1391 * Hence, only call sys_clone() if we need to, in order to get at pid
1392 * namespacing. If we follow this path, the child's address space might
1393 * have broken locks; you may only call functions that do not acquire
1394 * any locks.
1395 *
1396 * Unfortunately, fork() acquires every lock it can get its hands on, as
1397 * previously detailed, so this function is highly likely to deadlock
1398 * later on (see "deadlock here") if we're multithreaded.
1399 *
1400 * We might hack around this by having the clone()d child (init of the
1401 * pid namespace) return directly, rather than leaving the clone()d
1402 * process hanging around to be init for the new namespace (and having
1403 * its fork()ed child return in turn), but that process would be crippled
1404 * with its libc locks potentially broken. We might try fork()ing in the
1405 * parent before we clone() to ensure that we own all the locks, but
1406 * then we have to have the forked child hanging around consuming
1407 * resources (and possibly having file descriptors / shared memory
1408 * regions / etc attached). We'd need to keep the child around to avoid
1409 * having its children get reparented to init.
1410 *
1411 * TODO(ellyjones): figure out if the "forked child hanging around"
1412 * problem is fixable or not. It would be nice if we worked in this
1413 * case.
1414 */
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001415 if (pid_namespace) {
1416 int clone_flags = CLONE_NEWPID | SIGCHLD;
1417 if (j->flags.userns)
1418 clone_flags |= CLONE_NEWUSER;
1419 child_pid = syscall(SYS_clone, clone_flags, NULL);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001420 } else {
Elly Jones761b7412012-06-13 15:49:52 -04001421 child_pid = fork();
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001422 }
Elly Jones761b7412012-06-13 15:49:52 -04001423
Elly Jonese1749eb2011-10-07 13:54:59 -04001424 if (child_pid < 0) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001425 if (use_preload) {
1426 free(oldenv_copy);
1427 }
Lee Campbell1e4fc6a2014-06-06 17:40:02 -07001428 die("failed to fork child");
Elly Jonese1749eb2011-10-07 13:54:59 -04001429 }
Will Drewryf89aef52011-09-16 16:48:57 -05001430
Elly Jonese1749eb2011-10-07 13:54:59 -04001431 if (child_pid) {
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001432 if (use_preload) {
1433 /* Restore parent's LD_PRELOAD. */
1434 if (oldenv_copy) {
1435 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
1436 free(oldenv_copy);
1437 } else {
1438 unsetenv(kLdPreloadEnvVar);
1439 }
1440 unsetenv(kFdEnvVar);
Elly Jonese1749eb2011-10-07 13:54:59 -04001441 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001442
Elly Jonese1749eb2011-10-07 13:54:59 -04001443 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001444
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08001445 if (j->flags.pid_file)
1446 write_pid_file(j);
1447
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001448 if (j->flags.userns)
1449 write_ugid_mappings(j, userns_pipe_fds);
1450
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001451 if (use_preload) {
1452 /* Send marshalled minijail. */
1453 close(pipe_fds[0]); /* read endpoint */
1454 ret = minijail_to_fd(j, pipe_fds[1]);
1455 close(pipe_fds[1]); /* write endpoint */
1456 if (ret) {
1457 kill(j->initpid, SIGKILL);
1458 die("failed to send marshalled minijail");
1459 }
Elly Jonese1749eb2011-10-07 13:54:59 -04001460 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001461
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07001462 if (pchild_pid)
1463 *pchild_pid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001464
1465 /*
1466 * If we want to write to the child process' standard input,
1467 * set up the write end of the pipe.
1468 */
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001469 if (pstdin_fd)
1470 *pstdin_fd = setup_pipe_end(stdin_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001471 1 /* write end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001472
1473 /*
1474 * If we want to read from the child process' standard output,
1475 * set up the read end of the pipe.
1476 */
1477 if (pstdout_fd)
1478 *pstdout_fd = setup_pipe_end(stdout_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001479 0 /* read end */);
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001480
1481 /*
1482 * If we want to read from the child process' standard error,
1483 * set up the read end of the pipe.
1484 */
1485 if (pstderr_fd)
1486 *pstderr_fd = setup_pipe_end(stderr_fds,
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001487 0 /* read end */);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001488
Elly Jonese1749eb2011-10-07 13:54:59 -04001489 return 0;
1490 }
1491 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -07001492
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08001493 if (j->flags.userns)
1494 enter_user_namespace(j, userns_pipe_fds);
1495
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001496 /*
1497 * If we want to write to the jailed process' standard input,
1498 * set up the read end of the pipe.
1499 */
1500 if (pstdin_fd) {
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001501 if (setup_and_dupe_pipe_end(stdin_fds, 0 /* read end */,
1502 STDIN_FILENO) < 0)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07001503 die("failed to set up stdin pipe");
1504 }
1505
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08001506 /*
1507 * If we want to read from the jailed process' standard output,
1508 * set up the write end of the pipe.
1509 */
1510 if (pstdout_fd) {
1511 if (setup_and_dupe_pipe_end(stdout_fds, 1 /* write end */,
1512 STDOUT_FILENO) < 0)
1513 die("failed to set up stdout pipe");
1514 }
1515
1516 /*
1517 * If we want to read from the jailed process' standard error,
1518 * set up the write end of the pipe.
1519 */
1520 if (pstderr_fd) {
1521 if (setup_and_dupe_pipe_end(stderr_fds, 1 /* write end */,
1522 STDERR_FILENO) < 0)
1523 die("failed to set up stderr pipe");
1524 }
1525
Dylan Reid791f5772015-09-14 20:02:42 -07001526 /* If running an init program, let it decide when/how to mount /proc. */
1527 if (pid_namespace && !do_init)
1528 j->flags.remount_proc_ro = 0;
1529
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001530 if (use_preload) {
1531 /* Strip out flags that cannot be inherited across execve(2). */
1532 minijail_preexec(j);
1533 } else {
1534 j->flags.pids = 0;
1535 }
1536 /* Jail this process, then execve() the target. */
Elly Jonese1749eb2011-10-07 13:54:59 -04001537 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001538
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001539 if (pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05001540 /*
1541 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001542 * namespace. We don't want all programs we might exec to have
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001543 * to know how to be init. Normally (do_init == 1) we fork off
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08001544 * a child to actually run the program. If |do_init == 0|, we
1545 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04001546 *
1547 * If we're multithreaded, we'll probably deadlock here. See
1548 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04001549 */
1550 child_pid = fork();
1551 if (child_pid < 0)
1552 _exit(child_pid);
1553 else if (child_pid > 0)
1554 init(child_pid); /* never returns */
1555 }
Elly Jonescd7a9042011-07-22 13:56:51 -04001556
Elly Jonesdd3e8512012-01-23 15:13:38 -05001557 /*
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07001558 * If we aren't pid-namespaced, or the jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04001559 * calling process
1560 * -> execve()-ing process
1561 * If we are:
1562 * calling process
1563 * -> init()-ing process
1564 * -> execve()-ing process
1565 */
1566 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -04001567}
1568
Will Drewry6ac91122011-10-21 16:38:58 -05001569int API minijail_kill(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001570{
1571 int st;
1572 if (kill(j->initpid, SIGTERM))
1573 return -errno;
1574 if (waitpid(j->initpid, &st, 0) < 0)
1575 return -errno;
1576 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -04001577}
1578
Will Drewry6ac91122011-10-21 16:38:58 -05001579int API minijail_wait(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001580{
1581 int st;
1582 if (waitpid(j->initpid, &st, 0) < 0)
1583 return -errno;
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001584
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001585 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001586 int error_status = st;
1587 if (WIFSIGNALED(st)) {
1588 int signum = WTERMSIG(st);
mukesh agrawalc420a262013-06-11 17:22:42 -07001589 warn("child process %d received signal %d",
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07001590 j->initpid, signum);
1591 /*
1592 * We return MINIJAIL_ERR_JAIL if the process received
1593 * SIGSYS, which happens when a syscall is blocked by
1594 * seccomp filters.
1595 * If not, we do what bash(1) does:
1596 * $? = 128 + signum
1597 */
1598 if (signum == SIGSYS) {
1599 error_status = MINIJAIL_ERR_JAIL;
1600 } else {
1601 error_status = 128 + signum;
1602 }
1603 }
1604 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07001605 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001606
1607 int exit_status = WEXITSTATUS(st);
1608 if (exit_status != 0)
mukesh agrawalc420a262013-06-11 17:22:42 -07001609 info("child process %d exited with status %d",
1610 j->initpid, exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08001611
1612 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04001613}
1614
Will Drewry6ac91122011-10-21 16:38:58 -05001615void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001616{
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001617 if (j->flags.seccomp_filter && j->filter_prog) {
1618 free(j->filter_prog->filter);
1619 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04001620 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001621 while (j->bindings_head) {
1622 struct binding *b = j->bindings_head;
1623 j->bindings_head = j->bindings_head->next;
1624 free(b->dest);
1625 free(b->src);
1626 free(b);
1627 }
1628 j->bindings_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001629 if (j->user)
1630 free(j->user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001631 if (j->chrootdir)
1632 free(j->chrootdir);
Elly Jonese1749eb2011-10-07 13:54:59 -04001633 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001634}