minijail: add user namespace support

Since most of the operations can be done once we have |euid = 0| in the
new user namespace, we enter a new user namespace and become root
immediately after fork()/clone().
User namespaces are incompatible with -b when <writable> is set to 0,
since we are not able to remount bind mounts as read-only in a user
namespace.

BUG=chromium:517387
TEST=security_Minijail0 passes
TEST=`minijail0 -m "0 1000 1" -M "0 1000 1" -- /usr/bin/touch t`
TEST=file `t` has owner:group root:root in minijail
TEST=and chronos:chronos outside minijail

Change-Id: I48f888097be5211715c5a839eca6f8e43b9903dd
Reviewed-on: https://chromium-review.googlesource.com/291200
Reviewed-by: Jorge Lucangeli Obes <jorgelo@chromium.org>
Tested-by: Yu-hsi Chiang <yuhsi@google.com>
Commit-Queue: Nicolas Boichat <drinkcat@chromium.org>
Trybot-Ready: Nicolas Boichat <drinkcat@chromium.org>
diff --git a/libminijail.c b/libminijail.c
index 2e625d6..b170db0 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -86,6 +86,7 @@
 		int enter_vfs:1;
 		int pids:1;
 		int net:1;
+		int userns:1;
 		int seccomp:1;
 		int readonly:1;
 		int usergroups:1;
@@ -107,6 +108,8 @@
 	int filter_len;
 	int binding_count;
 	char *chrootdir;
+	char *uidmap;
+	char *gidmap;
 	struct sock_fprog *filter_prog;
 	struct binding *bindings_head;
 	struct binding *bindings_tail;
@@ -135,6 +138,7 @@
 	int vfs = j->flags.vfs;
 	int enter_vfs = j->flags.enter_vfs;
 	int readonly = j->flags.readonly;
+	int userns = j->flags.userns;
 	if (j->user)
 		free(j->user);
 	j->user = NULL;
@@ -143,6 +147,7 @@
 	j->flags.vfs = vfs;
 	j->flags.enter_vfs = enter_vfs;
 	j->flags.readonly = readonly;
+	j->flags.userns = userns;
 	/* Note, |pids| will already have been used before this call. */
 }
 
@@ -293,6 +298,27 @@
 	j->flags.readonly = 1;
 }
 
+void API minijail_namespace_user(struct minijail *j)
+{
+	j->flags.userns = 1;
+}
+
+int API minijail_uidmap(struct minijail *j, const char *uidmap)
+{
+	j->uidmap = strdup(uidmap);
+	if (!j->uidmap)
+		return -ENOMEM;
+	return 0;
+}
+
+int API minijail_gidmap(struct minijail *j, const char *gidmap)
+{
+	j->gidmap = strdup(gidmap);
+	if (!j->gidmap)
+		return -ENOMEM;
+	return 0;
+}
+
 void API minijail_inherit_usergroups(struct minijail *j)
 {
 	j->flags.usergroups = 1;
@@ -584,6 +610,59 @@
 	return ret;
 }
 
+static void write_ugid_mappings(const struct minijail *j, int *pipe_fds)
+{
+	int fd, ret, len;
+	size_t sz;
+	char fname[32];
+	close(pipe_fds[0]);
+
+	sz = sizeof(fname);
+	if (j->uidmap) {
+		ret = snprintf(fname, sz, "/proc/%d/uid_map", j->initpid);
+		if (ret < 0 || ret >= sz)
+			die("failed to write file name of uid_map");
+		fd = open(fname, O_WRONLY);
+		if (fd < 0)
+			pdie("failed to open '%s'", fname);
+		len = strlen(j->uidmap);
+		if (write(fd, j->uidmap, len) < len)
+			die("failed to set uid_map");
+		close(fd);
+	}
+	if (j->gidmap) {
+		ret = snprintf(fname, sz, "/proc/%d/gid_map", j->initpid);
+		if (ret < 0 || ret >= sz)
+			die("failed to write file name of gid_map");
+		fd = open(fname, O_WRONLY);
+		if (fd < 0)
+			pdie("failed to open '%s'", fname);
+		len = strlen(j->gidmap);
+		if (write(fd, j->gidmap, len) < len)
+			die("failed to set gid_map");
+		close(fd);
+	}
+
+	close(pipe_fds[1]);
+}
+
+static void enter_user_namespace(const struct minijail *j, int *pipe_fds)
+{
+	char buf;
+
+	close(pipe_fds[1]);
+
+	/* Wait for parent to set up uid/gid mappings. */
+	if (read(pipe_fds[0], &buf, 1) != 0)
+		die("failed to sync with parent");
+	close(pipe_fds[0]);
+
+	if (j->uidmap && setresuid(0, 0, 0))
+		pdie("setresuid");
+	if (j->gidmap && setresgid(0, 0, 0))
+		pdie("setresgid");
+}
+
 /* bind_one: Applies bindings from @b for @j, recursing as needed.
  * @j Minijail these bindings are for
  * @b Head of list of bindings
@@ -634,7 +713,7 @@
 	return mount("none", "/tmp", "tmpfs", 0, "size=64M,mode=777");
 }
 
-int remount_readonly(void)
+int remount_readonly(const struct minijail *j)
 {
 	const char *kProcPath = "/proc";
 	const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
@@ -643,9 +722,10 @@
 	 * /proc in our namespace, which means using MS_REMOUNT here would
 	 * mutate our parent's mount as well, even though we're in a VFS
 	 * namespace (!). Instead, remove their mount from our namespace
-	 * and make our own.
+	 * and make our own. However, if we are in a new user namespace, /proc
+	 * is not seen as mounted, so don't return error if umount() fails.
 	 */
-	if (umount(kProcPath))
+	if (umount(kProcPath) && !j->flags.userns)
 		return -errno;
 	if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
 		return -errno;
@@ -816,7 +896,7 @@
 	if (j->flags.mount_tmp && mount_tmp())
 		pdie("mount_tmp");
 
-	if (j->flags.readonly && remount_readonly())
+	if (j->flags.readonly && remount_readonly(j))
 		pdie("remount");
 
 	if (j->flags.caps) {
@@ -1046,6 +1126,7 @@
 	int stdin_fds[2];
 	int stdout_fds[2];
 	int stderr_fds[2];
+	int userns_pipe_fds[2];
 	int ret;
 	/* We need to remember this across the minijail_preexec() call. */
 	int pid_namespace = j->flags.pids;
@@ -1108,6 +1189,15 @@
 			return -EFAULT;
 	}
 
+	/*
+	 * If we want to set up a new uid/gid mapping in the user namespace,
+	 * create the pipe(2) to sync between parent and child.
+	 */
+	if (j->flags.userns) {
+		if (pipe(userns_pipe_fds))
+			return -EFAULT;
+	}
+
 	/* Use sys_clone() if and only if we're creating a pid namespace.
 	 *
 	 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
@@ -1148,8 +1238,12 @@
 	 * problem is fixable or not. It would be nice if we worked in this
 	 * case.
 	 */
-	if (pid_namespace)
-		child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
+	if (pid_namespace) {
+		int clone_flags = CLONE_NEWPID | SIGCHLD;
+		if (j->flags.userns)
+			clone_flags |= CLONE_NEWUSER;
+		child_pid = syscall(SYS_clone, clone_flags, NULL);
+	}
 	else
 		child_pid = fork();
 
@@ -1170,6 +1264,9 @@
 
 		j->initpid = child_pid;
 
+		if (j->flags.userns)
+			write_ugid_mappings(j, userns_pipe_fds);
+
 		/* Send marshalled minijail. */
 		close(pipe_fds[0]);	/* read endpoint */
 		ret = minijail_to_fd(j, pipe_fds[1]);
@@ -1210,6 +1307,10 @@
 	}
 	free(oldenv_copy);
 
+
+	if (j->flags.userns)
+		enter_user_namespace(j, userns_pipe_fds);
+
 	/*
 	 * If we want to write to the jailed process' standard input,
 	 * set up the read end of the pipe.
@@ -1279,14 +1380,28 @@
 			    char *const argv[])
 {
 	pid_t child_pid;
+	int userns_pipe_fds[2];
 	int pid_namespace = j->flags.pids;
 	int do_init = j->flags.do_init;
 
 	if (j->flags.caps)
 		die("caps not supported with static targets");
 
-	if (pid_namespace)
-		child_pid = syscall(SYS_clone, CLONE_NEWPID | SIGCHLD, NULL);
+	/*
+	 * If we want to set up a new uid/gid mapping in the user namespace,
+	 * create the pipe(2) to sync between parent and child.
+	 */
+	if (j->flags.userns) {
+		if (pipe(userns_pipe_fds))
+			return -EFAULT;
+	}
+
+	if (pid_namespace) {
+		int clone_flags = CLONE_NEWPID | SIGCHLD;
+		if (j->flags.userns)
+			clone_flags |= CLONE_NEWUSER;
+		child_pid = syscall(SYS_clone, clone_flags, NULL);
+	}
 	else
 		child_pid = fork();
 
@@ -1295,9 +1410,16 @@
 	}
 	if (child_pid > 0 ) {
 		j->initpid = child_pid;
+
+		if (j->flags.userns)
+			write_ugid_mappings(j, userns_pipe_fds);
+
 		return 0;
 	}
 
+	if (j->flags.userns)
+		enter_user_namespace(j, userns_pipe_fds);
+
 	/*
 	 * We can now drop this child into the sandbox
 	 * then execve the target.
diff --git a/libminijail.h b/libminijail.h
index 324731f..33fbb16 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -55,6 +55,9 @@
  * WARNING: this is NOT THREAD SAFE. See the block comment in </libminijail.c>.
  */
 void minijail_namespace_pids(struct minijail *j);
+void minijail_namespace_user(struct minijail *j);
+int minijail_uidmap(struct minijail *j, const char *uidmap);
+int minijail_gidmap(struct minijail *j, const char *gidmap);
 void minijail_remount_readonly(struct minijail *j);
 void minijail_run_as_init(struct minijail *j);
 void minijail_inherit_usergroups(struct minijail *j);
diff --git a/minijail0.c b/minijail0.c
index bb7d631..3f362f5 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -76,8 +76,9 @@
 {
 	size_t i;
 
-	printf("Usage: %s [-Ghinprsvt] [-b <src>,<dest>[,<writeable>]] "
+	printf("Usage: %s [-GhiInprsvtU] [-b <src>,<dest>[,<writeable>]] "
 	       "[-c <caps>] [-C <dir>] [-g <group>] [-S <file>] [-u <user>] "
+	       "[-m <uid> <loweruid> <count>] [-M <gid> <lowergid> <count>] "
 	       "<program> [args...]\n"
 	       "  -b:         binds <src> to <dest> in chroot. Multiple "
 	       "instances allowed\n"
@@ -98,6 +99,12 @@
 		printf("%s ", log_syscalls[i]);
 
 	printf("\n"
+	       "  -m:         set the uid mapping of a user namespace (implies -pU).\n"
+	       "              Same arguments as newuidmap(1)\n"
+	       "              Not compatible with -b without writable\n"
+	       "  -M:         set the gid mapping of a user namespace (implies -pU).\n"
+	       "              Same arguments as newgidmap(1)\n"
+	       "              Not compatible with -b without writable\n"
 	       "  -n:         set no_new_privs\n"
 	       "  -p:         enter new pid namespace (implies -vr)\n"
 	       "  -r:         remount /proc read-only (implies -v)\n"
@@ -107,6 +114,7 @@
 	       "              Requires -n when not running as root\n"
 	       "  -t:         mount tmpfs at /tmp inside chroot\n"
 	       "  -u <user>:  change uid to <user>\n"
+	       "  -U          enter new user namespace (implies -p)\n"
 	       "  -v:         enter new mount namespace\n"
 	       "  -V <file>:  enter specified mount namespace\n");
 }
@@ -130,7 +138,7 @@
 	const char *filter_path;
 	if (argc > 1 && argv[1][0] != '-')
 		return 1;
-	while ((opt = getopt(argc, argv, "u:g:sS:c:C:b:V:vrGhHinpLetI")) != -1) {
+	while ((opt = getopt(argc, argv, "u:g:sS:c:C:b:V:m:M:vrGhHinpLetIU")) != -1) {
 		switch (opt) {
 		case 'u':
 			set_user(j, optarg);
@@ -205,6 +213,26 @@
 			minijail_namespace_pids(j);
 			minijail_run_as_init(j);
 			break;
+		case 'U':
+			minijail_namespace_user(j);
+			minijail_namespace_pids(j);
+			break;
+		case 'm':
+			minijail_namespace_user(j);
+			minijail_namespace_pids(j);
+			if (0 != minijail_uidmap(j, optarg)) {
+				fprintf(stderr, "Could not set uidmap\n");
+				exit(1);
+			}
+			break;
+		case 'M':
+			minijail_namespace_user(j);
+			minijail_namespace_pids(j);
+			if (0 != minijail_gidmap(j, optarg)) {
+				fprintf(stderr, "Could not set gidmap\n");
+				exit(1);
+			}
+			break;
 		default:
 			usage(argv[0]);
 			exit(1);