minijail: add user namespace support
Since most of minijail's operations can be performed once we have
|euid = 0| in the new user namespace, we enter a new user namespace and
become root immediately after fork()/clone().
User namespace support is incompatible with -b when <writable> is set
to 0, since we are not able to remount bind mounts as read-only in a
user namespace.
BUG=chromium:517387
TEST=security_Minijail0 pass
TEST=`minijail0 -m "0 1000 1" -M "0 1000 1" -- /usr/bin/touch t`
TEST=file `t` has owner:group root:root in minijail
TEST=and chronos:chronos outside minijail
Change-Id: I48f888097be5211715c5a839eca6f8e43b9903dd
Reviewed-on: https://chromium-review.googlesource.com/291200
Reviewed-by: Jorge Lucangeli Obes <jorgelo@chromium.org>
Tested-by: Yu-hsi Chiang <yuhsi@google.com>
Commit-Queue: Nicolas Boichat <drinkcat@chromium.org>
Trybot-Ready: Nicolas Boichat <drinkcat@chromium.org>
diff --git a/libminijail.c b/libminijail.c
index 2e625d6..b170db0 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -86,6 +86,7 @@
int enter_vfs:1;
int pids:1;
int net:1;
+ int userns:1;
int seccomp:1;
int readonly:1;
int usergroups:1;
@@ -107,6 +108,8 @@
int filter_len;
int binding_count;
char *chrootdir;
+ char *uidmap;
+ char *gidmap;
struct sock_fprog *filter_prog;
struct binding *bindings_head;
struct binding *bindings_tail;
@@ -135,6 +138,7 @@
int vfs = j->flags.vfs;
int enter_vfs = j->flags.enter_vfs;
int readonly = j->flags.readonly;
+ int userns = j->flags.userns;
if (j->user)
free(j->user);
j->user = NULL;
@@ -143,6 +147,7 @@
j->flags.vfs = vfs;
j->flags.enter_vfs = enter_vfs;
j->flags.readonly = readonly;
+ j->flags.userns = userns;
/* Note, |pids| will already have been used before this call. */
}
@@ -293,6 +298,27 @@
j->flags.readonly = 1;
}
+void API minijail_namespace_user(struct minijail *j)
+{
+ j->flags.userns = 1;
+}
+
+int API minijail_uidmap(struct minijail *j, const char *uidmap)
+{
+ j->uidmap = strdup(uidmap);
+ if (!j->uidmap)
+ return -ENOMEM;
+ return 0;
+}
+
+int API minijail_gidmap(struct minijail *j, const char *gidmap)
+{
+ j->gidmap = strdup(gidmap);
+ if (!j->gidmap)
+ return -ENOMEM;
+ return 0;
+}
+
void API minijail_inherit_usergroups(struct minijail *j)
{
j->flags.usergroups = 1;
@@ -584,6 +610,59 @@
return ret;
}
+static void write_ugid_mappings(const struct minijail *j, int *pipe_fds)
+{
+ int fd, ret, len;
+ size_t sz;
+ char fname[32];
+ close(pipe_fds[0]);
+
+ sz = sizeof(fname);
+ if (j->uidmap) {
+ ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
+ if (ret < 0 || ret >= sz)
+ die("failed to write file name of uid_map");
+ fd = open(fname, O_WRONLY);
+ if (fd < 0)
+ pdie("failed to open '%s'", fname);
+ len = strlen(j->uidmap);
+ if (write(fd, j->uidmap, len) < len)
+ die("failed to set uid_map");
+ close(fd);
+ }
+ if (j->gidmap) {
+ ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
+ if (ret < 0 || ret >= sz)
+ die("failed to write file name of gid_map");
+ fd = open(fname, O_WRONLY);
+ if (fd < 0)
+ pdie("failed to open '%s'", fname);
+ len = strlen(j->gidmap);
+ if (write(fd, j->gidmap, len) < len)
+ die("failed to set gid_map");
+ close(fd);
+ }
+
+ close(pipe_fds[1]);
+}
+
+static void enter_user_namespace(const struct minijail *j, int *pipe_fds)
+{
+ char buf;
+
+ close(pipe_fds[1]);
+
+ /* Wait for parent to set up uid/gid mappings. */
+ if (read(pipe_fds[0], &buf, 1) != 0)
+ die("failed to sync with parent");
+ close(pipe_fds[0]);
+
+ if (j->uidmap && setresuid(0, 0, 0))
+ pdie("setresuid");
+ if (j->gidmap && setresgid(0, 0, 0))
+ pdie("setresgid");
+}
+
/* bind_one: Applies bindings from @b for @j, recursing as needed.
* @j Minijail these bindings are for
* @b Head of list of bindings
@@ -634,7 +713,7 @@
return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777");
}
-int remount_readonly(void)
+int remount_readonly(const struct minijail *j)
{
const char *kProcPath = "/proc";
const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
@@ -643,9 +722,10 @@
* /proc in our namespace, which means using MS_REMOUNT here would
* mutate our parent's mount as well, even though we're in a VFS
* namespace (!). Instead, remove their mount from our namespace
- * and make our own.
+ * and make our own. However, if we are in a new user namespace, /proc
+ * is not seen as mounted, so don't return error if umount() fails.
*/
- if (umount(kProcPath))
+ if (umount(kProcPath) && !j->flags.userns)
return -errno;
if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
return -errno;
@@ -816,7 +896,7 @@
if (j->flags.mount_tmp && mount_tmp())
pdie("mount_tmp");
- if (j->flags.readonly && remount_readonly())
+ if (j->flags.readonly && remount_readonly(j))
pdie("remount");
if (j->flags.caps) {
@@ -1046,6 +1126,7 @@
int stdin_fds[2];
int stdout_fds[2];
int stderr_fds[2];
+ int userns_pipe_fds[2];
int ret;
/* We need to remember this across the minijail_preexec() call. */
int pid_namespace = j->flags.pids;
@@ -1108,6 +1189,15 @@
return -EFAULT;
}
+ /*
+ * If we want to set up a new uid/gid mapping in the user namespace,
+ * create the pipe(2) to sync between parent and child.
+ */
+ if (j->flags.userns) {
+ if (pipe(userns_pipe_fds))
+ return -EFAULT;
+ }
+
/* Use sys_clone() if and only if we're creating a pid namespace.
*
* tl;dr: WARNING: do not mix pid namespaces and multithreading.
@@ -1148,8 +1238,12 @@
* problem is fixable or not. It would be nice if we worked in this
* case.
*/
- if (pid_namespace)
- child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
+ if (pid_namespace) {
+ int clone_flags = CLONE_NEWPID | SIGCHLD;
+ if (j->flags.userns)
+ clone_flags |= CLONE_NEWUSER;
+ child_pid = syscall(SYS_clone, clone_flags, NULL);
+ }
else
child_pid = fork();
@@ -1170,6 +1264,9 @@
j->initpid = child_pid;
+ if (j->flags.userns)
+ write_ugid_mappings(j, userns_pipe_fds);
+
/* Send marshalled minijail. */
close(pipe_fds[0]); /* read endpoint */
ret = minijail_to_fd(j, pipe_fds[1]);
@@ -1210,6 +1307,10 @@
}
free(oldenv_copy);
+
+ if (j->flags.userns)
+ enter_user_namespace(j, userns_pipe_fds);
+
/*
* If we want to write to the jailed process' standard input,
* set up the read end of the pipe.
@@ -1279,14 +1380,28 @@
char *const argv[])
{
pid_t child_pid;
+ int userns_pipe_fds[2];
int pid_namespace = j->flags.pids;
int do_init = j->flags.do_init;
if (j->flags.caps)
die("caps not supported with static targets");
- if (pid_namespace)
- child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
+ /*
+ * If we want to set up a new uid/gid mapping in the user namespace,
+ * create the pipe(2) to sync between parent and child.
+ */
+ if (j->flags.userns) {
+ if (pipe(userns_pipe_fds))
+ return -EFAULT;
+ }
+
+ if (pid_namespace) {
+ int clone_flags = CLONE_NEWPID | SIGCHLD;
+ if (j->flags.userns)
+ clone_flags |= CLONE_NEWUSER;
+ child_pid = syscall(SYS_clone, clone_flags, NULL);
+ }
else
child_pid = fork();
@@ -1295,9 +1410,16 @@
}
if (child_pid > 0 ) {
j->initpid = child_pid;
+
+ if (j->flags.userns)
+ write_ugid_mappings(j, userns_pipe_fds);
+
return 0;
}
+ if (j->flags.userns)
+ enter_user_namespace(j, userns_pipe_fds);
+
/*
* We can now drop this child into the sandbox
* then execve the target.
diff --git a/libminijail.h b/libminijail.h
index 324731f..33fbb16 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -55,6 +55,9 @@
* WARNING: this is NOT THREAD SAFE. See the block comment in </libminijail.c>.
*/
void minijail_namespace_pids(struct minijail *j);
+void minijail_namespace_user(struct minijail *j);
+int minijail_uidmap(struct minijail *j, const char *uidmap);
+int minijail_gidmap(struct minijail *j, const char *gidmap);
void minijail_remount_readonly(struct minijail *j);
void minijail_run_as_init(struct minijail *j);
void minijail_inherit_usergroups(struct minijail *j);
diff --git a/minijail0.c b/minijail0.c
index bb7d631..3f362f5 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -76,8 +76,9 @@
{
size_t i;
- printf("Usage: %s [-Ghinprsvt] [-b <src>,<dest>[,<writeable>]] "
+ printf("Usage: %s [-GhiInprsvtU] [-b <src>,<dest>[,<writeable>]] "
"[-c <caps>] [-C <dir>] [-g <group>] [-S <file>] [-u <user>] "
+ "[-m <uid> <loweruid> <count>] [-M <gid> <lowergid> <count>] "
"<program> [args...]\n"
" -b: binds <src> to <dest> in chroot. Multiple "
"instances allowed\n"
@@ -98,6 +99,12 @@
printf("%s ", log_syscalls[i]);
printf("\n"
+ " -m: set the uid mapping of a user namespace (implies -pU).\n"
+ " Same arguments as newuidmap(1)\n"
+ " Not compatible with -b without writable\n"
+ " -M: set the gid mapping of a user namespace (implies -pU).\n"
+ " Same arguments as newgidmap(1)\n"
+ " Not compatible with -b without writable\n"
" -n: set no_new_privs\n"
" -p: enter new pid namespace (implies -vr)\n"
" -r: remount /proc read-only (implies -v)\n"
@@ -107,6 +114,7 @@
" Requires -n when not running as root\n"
" -t: mount tmpfs at /tmp inside chroot\n"
" -u <user>: change uid to <user>\n"
+ " -U enter new user namespace (implies -p)\n"
" -v: enter new mount namespace\n"
" -V <file>: enter specified mount namespace\n");
}
@@ -130,7 +138,7 @@
const char *filter_path;
if (argc > 1 && argv[1][0] != '-')
return 1;
- while ((opt = getopt(argc, argv, "u:g:sS:c:C:b:V:vrGhHinpLetI")) != -1) {
+ while ((opt = getopt(argc, argv, "u:g:sS:c:C:b:V:m:M:vrGhHinpLetIU")) != -1) {
switch (opt) {
case 'u':
set_user(j, optarg);
@@ -205,6 +213,26 @@
minijail_namespace_pids(j);
minijail_run_as_init(j);
break;
+ case 'U':
+ minijail_namespace_user(j);
+ minijail_namespace_pids(j);
+ break;
+ case 'm':
+ minijail_namespace_user(j);
+ minijail_namespace_pids(j);
+ if (0 != minijail_uidmap(j, optarg)) {
+ fprintf(stderr, "Could not set uidmap\n");
+ exit(1);
+ }
+ break;
+ case 'M':
+ minijail_namespace_user(j);
+ minijail_namespace_pids(j);
+ if (0 != minijail_gidmap(j, optarg)) {
+ fprintf(stderr, "Could not set gidmap\n");
+ exit(1);
+ }
+ break;
default:
usage(argv[0]);
exit(1);