Add an option to skip remounting / as MS_PRIVATE.
am: a521bee6c8
* commit 'a521bee6c8c014aa19cbfea0b365ba984277aa27':
Add an option to skip remounting / as MS_PRIVATE.
diff --git a/libminijail.c b/libminijail.c
index bcbaed5..118e61f 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -96,6 +96,7 @@
int capbset_drop:1;
int vfs:1;
int enter_vfs:1;
+ int skip_remount_private:1;
int pids:1;
int ipc:1;
int net:1;
@@ -398,6 +399,11 @@
j->flags.enter_vfs = 1;
}
+void API minijail_skip_remount_private(struct minijail *j)
+{
+ j->flags.skip_remount_private = 1;
+}
+
void API minijail_namespace_pids(struct minijail *j)
{
j->flags.vfs = 1;
@@ -682,16 +688,15 @@
char *buf;
};
-void marshal_state_init(struct marshal_state *state,
- char *buf, size_t available)
+void marshal_state_init(struct marshal_state *state, char *buf,
+ size_t available)
{
state->available = available;
state->buf = buf;
state->total = 0;
}
-void marshal_append(struct marshal_state *state,
- void *src, size_t length)
+void marshal_append(struct marshal_state *state, void *src, size_t length)
{
size_t copy_len = MIN(state->available, length);
@@ -727,7 +732,7 @@
if (j->flags.seccomp_filter && j->filter_prog) {
struct sock_fprog *fp = j->filter_prog;
marshal_append(state, (char *)fp->filter,
- fp->len * sizeof(struct sock_filter));
+ fp->len * sizeof(struct sock_filter));
}
for (m = j->mounts_head; m; m = m->next) {
marshal_append(state, m->src, strlen(m->src) + 1);
@@ -1372,12 +1377,15 @@
if (unshare(CLONE_NEWNS))
pdie("unshare(vfs)");
/*
- * Remount all filesystems as private. If they are shared
- * new bind mounts will creep out of our namespace.
+ * Unless asked not to, remount all filesystems as private.
+ * If they are shared, new bind mounts will creep out of our
+ * namespace.
* https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
*/
- if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
- pdie("mount(/, private)");
+ if (!j->flags.skip_remount_private) {
+ if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
+ pdie("mount(/, private)");
+ }
}
if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
@@ -1650,7 +1658,8 @@
char *const argv[],
pid_t *pchild_pid,
int *pstdin_fd, int *pstdout_fd,
- int *pstderr_fd) {
+ int *pstderr_fd)
+{
return minijail_run_internal(j, filename, argv, pchild_pid,
pstdin_fd, pstdout_fd, pstderr_fd, false);
}
@@ -1780,13 +1789,13 @@
* We might hack around this by having the clone()d child (init of the
* pid namespace) return directly, rather than leaving the clone()d
* process hanging around to be init for the new namespace (and having
- * its fork()ed child return in turn), but that process would be crippled
- * with its libc locks potentially broken. We might try fork()ing in the
- * parent before we clone() to ensure that we own all the locks, but
- * then we have to have the forked child hanging around consuming
- * resources (and possibly having file descriptors / shared memory
- * regions / etc attached). We'd need to keep the child around to avoid
- * having its children get reparented to init.
+ * its fork()ed child return in turn), but that process would be
+ * crippled with its libc locks potentially broken. We might try
+ * fork()ing in the parent before we clone() to ensure that we own all
+ * the locks, but then we have to have the forked child hanging around
+ * consuming resources (and possibly having file descriptors / shared
+ * memory regions / etc attached). We'd need to keep the child around to
+ * avoid having its children get reparented to init.
*
* TODO(ellyjones): figure out if the "forked child hanging around"
* problem is fixable or not. It would be nice if we worked in this
diff --git a/libminijail.h b/libminijail.h
index 8bd8b39..49f7786 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -58,6 +58,11 @@
void minijail_reset_signal_mask(struct minijail *j);
void minijail_namespace_vfs(struct minijail *j);
void minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path);
+/*
+ * Skips the MS_PRIVATE remount of / in a new mount namespace. *Dangerous*: it
+ * negates most of minijail_namespace_vfs(). You very likely don't need this.
+ */
+void minijail_skip_remount_private(struct minijail *j);
void minijail_namespace_ipc(struct minijail *j);
void minijail_namespace_net(struct minijail *j);
void minijail_namespace_enter_net(struct minijail *j, const char *ns_path);
diff --git a/minijail0.1 b/minijail0.1
index ae53ce0..8d7e188 100644
--- a/minijail0.1
+++ b/minijail0.1
@@ -1,4 +1,4 @@
-.TH MINIJAIL0 "1" "January 2012" "Chromium OS" "User Commands"
+.TH MINIJAIL0 "1" "March 2016" "Chromium OS" "User Commands"
.SH NAME
minijail0 \- sandbox a process
.SH SYNOPSIS
@@ -9,11 +9,11 @@
Runs PROGRAM inside a sandbox.
.TP
\fB-a <table>\fR
-Run using the alternate syscall table named <table>. Only available on kernels
+Run using the alternate syscall table named \fItable\fR. Only available on kernels
and architectures that support the PR_ALT_SYSCALL option of prctl(2).
.TP
\fB-b <src>,<dest>[,<writeable>]
-Bind-mount <src> into the chroot directory at <dest>, optionally writeable.
+Bind-mount \fIsrc\fR into the chroot directory at \fIdest\fR, optionally writeable.
.TP
\fB-c <caps>\fR
Restrict capabilities to \fIcaps\fR. When used in conjunction with \fB-u\fR and
@@ -24,21 +24,16 @@
\fBcapabilities\fR(7).
.TP
\fB-C <dir>\fR
-Change root (using chroot(2)) to <dir>.
+Change root (using chroot(2)) to \fIdir\fR.
.TP
\fB-e[file]\fR
-Enter a new network namespace, or if \fIfile\fR is specified, Enter an existing
+Enter a new network namespace, or if \fIfile\fR is specified, enter an existing
network namespace specified by \fIfile\fR which is typically of the form
/proc/<pid>/ns/net.
.TP
\fB-f <file>\fR
Write the pid of the jailed process to \fIfile\fR.
.TP
-\fB-t\fR
-Mounts a tmpfs filesystem on /tmp. /tmp must exist in the chroot.
-This must be used with -C. The default filesystem has a max size of 128M
-and has standard /tmp permissions (777).
-.TP
\fB-G\fR
Inherit all the supplementary groups of the user specified with \fB-u\fR. It
is an error to use this option without having specified a \fBuser name\fR to
@@ -56,17 +51,25 @@
(Other direct numbers may be specified if minijail0 is not in sync with the
host kernel or something like 32/64-bit compatibility issues exist.)
.TP
+\fB-k <src>,<dest>,<type>[,<flags>]\fR
+Mount \fIsrc\fR, a \fItype\fR filesystem, into the chroot directory at \fIdest\fR, with optional \fIflags\fR.
+.TP
+\fB-K\fR
+Don't mark all existing mounts as MS_PRIVATE. Requires \fB-v\fR.
+This option is \fBdangerous\fR as it negates most of the functionality of \fB-v\fR.
+You very likely don't need this.
+.TP
\fB-l\fR
Run inside a new IPC namespace. This option makes the program's System V IPC
namespace independent.
.TP
\fB-m "<uid> <loweruid> <count>[,<uid> <loweruid> <count>]"\fR
-Set the uid mapping of a user namespace (implies \fB-pU\fR). Same arguments as
-\fBnewuidmap(1)\fR. Multiple mappings should be separated by ','.
+Set the uid mapping of a user namespace (implies \fB-pU\fR). Same arguments as
+\fBnewuidmap(1)\fR. Multiple mappings should be separated by ','.
.TP
\fB-M "<uid> <loweruid> <count>[,<uid> <loweruid> <count>]"\fR
-Set the gid mapping of a user namespace (implies \fB-pU\fR). Same arguments as
-\fBnewgidmap(1)\fR. Multiple mappings should be separated by ','.
+Set the gid mapping of a user namespace (implies \fB-pU\fR). Same arguments as
+\fBnewgidmap(1)\fR. Multiple mappings should be separated by ','.
.TP
\fB-p\fR
Run inside a new PID namespace. This option will make it impossible for the
@@ -89,9 +92,14 @@
.TP
\fB-S <arch-specific seccomp_filter policy file>\fR
Enable seccomp(2) in mode 13 which restricts the child process to a set of
-system calls defined in the policy file. Note that system calls often change
+system calls defined in the policy file. Note that system calls often change
names based on the architecture or mode. (uname -m is your friend.)
.TP
+\fB-t\fR
+Mounts a tmpfs filesystem on /tmp. /tmp must exist in the chroot.
+This must be used with \fB-C\fR. The default filesystem has a max size of 128M
+and has standard /tmp permissions (777).
+.TP
\fB-T <type>\fR
Assume program's ELF linkage type is \fItype\fR,
which should be either 'static' or 'dynamic'.
@@ -112,15 +120,16 @@
the process to which they will actually apply - specifically capability use
(since capabilities are not inherited to an exec'd process unless the exec'd
process has POSIX file capabilities), seccomp (since we can't exec() once we're
-seccomp'd), and ptrace-disable (which is always cleared on exec().
+seccomp'd), and ptrace-disable (which is always cleared on exec()).
To this end, \fBlibminijailpreload\fR is forcibly loaded into all
dynamically-linked target programs if any of these restrictions are in effect;
we pass the specific restrictions in an environment variable which the preloaded
library looks for. The forcibly-loaded library then applies the restrictions
to the newly-loaded program.
+
.SH AUTHOR
-Written by Elly Jones (ellyjones@chromium.org)
+The Chromium OS Authors <chromiumos-dev@chromium.org>
.SH COPYRIGHT
Copyright \(co 2011 The Chromium OS Authors
License BSD-like.
diff --git a/minijail0.c b/minijail0.c
index 3d648e3..f3caeac 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -101,9 +101,9 @@
" [-M \"<gid> <lowergid> <count>[,<uid> <loweruid> <count>]\"]\n"
" <program> [args...]\n"
" -a <table>: Use alternate syscall table <table>.\n"
- " -b: Binds <src> to <dest> in chroot.\n"
+ " -b: Bind <src> to <dest> in chroot.\n"
" Multiple instances allowed.\n"
- " -k: Mount <src> to <dest> in chroot.\n"
+ " -k: Mount <src> at <dest> in chroot.\n"
" Multiple instances allowed, flags are passed to mount(2).\n"
" -c <caps>: Restrict caps to <caps>.\n"
" -C <dir>: chroot(2) to <dir>.\n"
@@ -117,6 +117,7 @@
" -i: Exit immediately after fork (do not act as init).\n"
" Not compatible with -p.\n"
" -I: Run <program> as init (pid 1) inside a new pid namespace (implies -p).\n"
+ " -K: Don't mark all existing mounts as MS_PRIVATE.\n"
" -l: Enter new IPC namespace.\n"
" -L: Report blocked syscalls to syslog when using seccomp filter.\n"
" Forces the following syscalls to be allowed:\n"
@@ -166,12 +167,13 @@
int use_seccomp_filter = 0;
int binding = 0;
int pivot_root = 0, chroot = 0;
+ int mount_ns = 0, skip_remount = 0;
const size_t path_max = 4096;
const char *filter_path;
if (argc > 1 && argv[1][0] != '-')
return 1;
while ((opt = getopt(argc, argv,
- "u:g:sS:c:C:P:b:V:f:m:M:k:a:e::T:vrGhHinplLtIU"))
+ "u:g:sS:c:C:P:b:V:f:m:M:k:a:e::T:vrGhHinplLtIUK"))
!= -1) {
switch (opt) {
case 'u':
@@ -228,6 +230,10 @@
case 'k':
add_mount(j, optarg);
break;
+ case 'K':
+ minijail_skip_remount_private(j);
+ skip_remount = 1;
+ break;
case 'P':
if (chroot) {
fprintf(stderr,
@@ -254,6 +260,7 @@
break;
case 'v':
minijail_namespace_vfs(j);
+ mount_ns = 1;
break;
case 'V':
minijail_namespace_enter_vfs(j, optarg);
@@ -337,6 +344,16 @@
}
/*
+ * Remounting / as MS_PRIVATE only happens when entering a new mount
+ * namespace, so skipping it only applies in that case.
+ */
+ if (skip_remount && !mount_ns) {
+ fprintf(stderr, "Can't skip marking mounts as MS_PRIVATE"
+ " without mount namespaces.\n");
+ exit(1);
+ }
+
+ /*
* We parse seccomp filters here to make sure we've collected all
* cmdline options.
*/