Improve the way uid/gid changes in unprivileged userns

This change uses whatever was passed into the -u/-g flags as the user to change
in the user namespace. This is used to fix an issue where calling open(2) on a
file on the tmpfs created by minijail would return EOVERFLOW[1]. An easy way to
reproduce is running this on a 4.8 kernel (or Ubuntu Xenial, which has this
change backported):

  $ ./minijail0 -T static -Ut -- /bin/bash -c 'touch /tmp/foo'

This change allows a non-zero uid/gid to be mapped to the current user when
entering a namespace, to avoid the above issue.

1: More information about the bug here:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1659087

Bug: None
Test: make tests
Test: ./minijail0 -T static -Ut -u 1000 -g 1000 -M -m -- \
      /bin/bash -c 'touch /tmp/foo'
Change-Id: I393daaf8c2b2355e33c75a908345bb03f1980271
diff --git a/libminijail.c b/libminijail.c
index 0eca595..9915afe 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -13,7 +13,6 @@
 #include <fcntl.h>
 #include <grp.h>
 #include <linux/capability.h>
-#include <pwd.h>
 #include <sched.h>
 #include <signal.h>
 #include <stdbool.h>
@@ -294,66 +293,26 @@
 
 int API minijail_change_user(struct minijail *j, const char *user)
 {
-	char *buf = NULL;
-	struct passwd pw;
-	struct passwd *ppw = NULL;
-	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
-	if (sz == -1)
-		sz = 65536;	/* your guess is as good as mine... */
-
-	/*
-	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
-	 * the maximum needed size of the buffer, so we don't have to search.
-	 */
-	buf = malloc(sz);
-	if (!buf)
-		return -ENOMEM;
-	getpwnam_r(user, &pw, buf, sz, &ppw);
-	/*
-	 * We're safe to free the buffer here. The strings inside |pw| point
-	 * inside |buf|, but we don't use any of them; this leaves the pointers
-	 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3)
-	 * succeeded.
-	 */
-	free(buf);
-	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
-	if (!ppw)
-		return -1;
-	minijail_change_uid(j, ppw->pw_uid);
+	uid_t uid;
+	gid_t gid;
+	int rc = lookup_user(user, &uid, &gid);
+	if (rc)
+		return rc;
+	minijail_change_uid(j, uid);
 	j->user = strdup(user);
 	if (!j->user)
 		return -ENOMEM;
-	j->usergid = ppw->pw_gid;
+	j->usergid = gid;
 	return 0;
 }
 
 int API minijail_change_group(struct minijail *j, const char *group)
 {
-	char *buf = NULL;
-	struct group gr;
-	struct group *pgr = NULL;
-	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
-	if (sz == -1)
-		sz = 65536;	/* and mine is as good as yours, really */
-
-	/*
-	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
-	 * the maximum needed size of the buffer, so we don't have to search.
-	 */
-	buf = malloc(sz);
-	if (!buf)
-		return -ENOMEM;
-	getgrnam_r(group, &gr, buf, sz, &pgr);
-	/*
-	 * We're safe to free the buffer here. The strings inside gr point
-	 * inside buf, but we don't use any of them; this leaves the pointers
-	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
-	 */
-	free(buf);
-	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
-	if (!pgr)
-		return -1;
-	minijail_change_gid(j, pgr->gr_gid);
+	gid_t gid;
+	int rc = lookup_group(group, &gid);
+	if (rc)
+		return rc;
+	minijail_change_gid(j, gid);
 	return 0;
 }
 
@@ -1458,10 +1417,16 @@
 
 static void enter_user_namespace(const struct minijail *j)
 {
-	if (j->uidmap && setresuid(0, 0, 0))
-		pdie("user_namespaces: setresuid(0, 0, 0) failed");
-	if (j->gidmap && setresgid(0, 0, 0))
-		pdie("user_namespaces: setresgid(0, 0, 0) failed");
+	int uid = j->flags.uid ? j->uid : 0;
+	int gid = j->flags.gid ? j->gid : 0;
+	if (j->gidmap && setresgid(gid, gid, gid)) {
+		pdie("user_namespaces: setresgid(%d, %d, %d) failed", gid, gid,
+		     gid);
+	}
+	if (j->uidmap && setresuid(uid, uid, uid)) {
+		pdie("user_namespaces: setresuid(%d, %d, %d) failed", uid, uid,
+		     uid);
+	}
 }
 
 static void parent_setup_complete(int *pipe_fds)
@@ -1500,10 +1465,12 @@
 	} else if (j->flags.set_suppl_gids) {
 		if (setgroups(j->suppl_gid_count, j->suppl_gid_list))
 			pdie("setgroups(suppl_gids) failed");
-	} else if (!j->flags.keep_suppl_gids) {
+	} else if (!j->flags.keep_suppl_gids && !j->flags.disable_setgroups) {
 		/*
 		 * Only attempt to clear supplementary groups if we are changing
-		 * users or groups.
+		 * users or groups, and if the caller did not request to disable
+		 * setgroups (used when entering a user namespace as a
+		 * non-privileged user).
 		 */
 		if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL))
 			pdie("setgroups(0, NULL) failed");
diff --git a/libminijail_unittest.cc b/libminijail_unittest.cc
index 77775b5..93e3790 100644
--- a/libminijail_unittest.cc
+++ b/libminijail_unittest.cc
@@ -358,6 +358,52 @@
   minijail_destroy(j);
 }
 
+TEST(Test,
+#if defined(__ANDROID__)
+// TODO(lhchavez): Android unit tests don't currently support entering user
+// namespaces as unprivileged users.
+DISABLED_test_tmpfs_userns
+#else
+test_tmpfs_userns
+#endif
+) {
+  int mj_run_ret;
+  int status;
+  char *argv[4];
+  char uidmap[128], gidmap[128];
+  constexpr uid_t kTargetUid = 1000;  // Any non-zero value will do.
+  constexpr gid_t kTargetGid = 1000;
+
+  struct minijail *j = minijail_new();
+
+  minijail_namespace_pids(j);
+  minijail_namespace_vfs(j);
+  minijail_mount_tmp(j);
+  minijail_run_as_init(j);
+
+  // Perform userns mapping.
+  minijail_namespace_user(j);
+  snprintf(uidmap, sizeof(uidmap), "%d %d 1", kTargetUid, getuid());
+  snprintf(gidmap, sizeof(gidmap), "%d %d 1", kTargetGid, getgid());
+  minijail_change_uid(j, kTargetUid);
+  minijail_change_gid(j, kTargetGid);
+  minijail_uidmap(j, uidmap);
+  minijail_gidmap(j, gidmap);
+  minijail_namespace_user_disable_setgroups(j);
+
+  argv[0] = (char*)kShellPath;
+  argv[1] = "-c";
+  argv[2] = "exec touch /tmp/foo";
+  argv[3] = NULL;
+  mj_run_ret = minijail_run_no_preload(j, argv[0], argv);
+  EXPECT_EQ(mj_run_ret, 0);
+
+  status = minijail_wait(j);
+  EXPECT_EQ(status, 0);
+
+  minijail_destroy(j);
+}
+
 TEST(Test, parse_size) {
   size_t size;
 
diff --git a/minijail0.c b/minijail0.c
index d47aad5..088bc27 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -8,40 +8,56 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/capability.h>
+#include <sys/types.h>
 #include <unistd.h>
 
 #include "libminijail.h"
 #include "libsyscalls.h"
 
 #include "elfparse.h"
+#include "system.h"
 #include "util.h"
 
 #define IDMAP_LEN 32U
 
-static void set_user(struct minijail *j, const char *arg)
+static void set_user(struct minijail *j, const char *arg, uid_t *out_uid,
+		     gid_t *out_gid)
 {
 	char *end = NULL;
 	int uid = strtod(arg, &end);
 	if (!*end && *arg) {
+		*out_uid = uid;
 		minijail_change_uid(j, uid);
 		return;
 	}
 
+	if (lookup_user(arg, out_uid, out_gid)) {
+		fprintf(stderr, "Bad user: '%s'\n", arg);
+		exit(1);
+	}
+
 	if (minijail_change_user(j, arg)) {
 		fprintf(stderr, "Bad user: '%s'\n", arg);
 		exit(1);
 	}
 }
 
-static void set_group(struct minijail *j, const char *arg)
+static void set_group(struct minijail *j, const char *arg, gid_t *out_gid)
 {
 	char *end = NULL;
 	int gid = strtod(arg, &end);
 	if (!*end && *arg) {
+		*out_gid = gid;
 		minijail_change_gid(j, gid);
 		return;
 	}
 
+	if (lookup_group(arg, out_gid)) {
+		fprintf(stderr, "Bad group: '%s'\n", arg);
+		exit(1);
+	}
+
 	if (minijail_change_group(j, arg)) {
 		fprintf(stderr, "Bad group: '%s'\n", arg);
 		exit(1);
@@ -97,8 +113,8 @@
 		exit(1);
 	}
 	if (minijail_rlimit(j, atoi(type), atoi(cur), atoi(max))) {
-		fprintf(stderr, "minijail_rlimit '%s,%s,%s' failed.\n",
-			type, cur, max);
+		fprintf(stderr, "minijail_rlimit '%s,%s,%s' failed.\n", type,
+			cur, max);
 		exit(1);
 	}
 }
@@ -135,6 +151,84 @@
 	return idmap;
 }
 
+static int has_cap_setgid()
+{
+	cap_t caps;
+	cap_flag_value_t cap_value;
+
+	if (!CAP_IS_SUPPORTED(CAP_SETGID))
+		return 0;
+
+	caps = cap_get_proc();
+	if (!caps) {
+		fprintf(stderr, "Could not get process' capabilities: %m\n");
+		exit(1);
+	}
+
+	if (cap_get_flag(caps, CAP_SETGID, CAP_EFFECTIVE, &cap_value)) {
+		fprintf(stderr, "Could not get the value of CAP_SETGID: %m\n");
+		exit(1);
+	}
+
+	if (cap_free(caps)) {
+		fprintf(stderr, "Could not free capabilities: %m\n");
+		exit(1);
+	}
+
+	return cap_value == CAP_SET;
+}
+
+static void set_ugid_mapping(struct minijail *j, int set_uidmap, uid_t uid,
+			     char *uidmap, int set_gidmap, gid_t gid,
+			     char *gidmap)
+{
+	if (set_uidmap) {
+		minijail_namespace_user(j);
+		minijail_namespace_pids(j);
+
+		if (!uidmap) {
+			/*
+			 * If no map is passed, map the current uid to the
+			 * chosen uid in the target namespace (or root, if none
+			 * was chosen).
+			 */
+			uidmap = build_idmap(uid, getuid());
+		}
+		if (0 != minijail_uidmap(j, uidmap)) {
+			fprintf(stderr, "Could not set uid map.\n");
+			exit(1);
+		}
+		free(uidmap);
+	}
+	if (set_gidmap) {
+		minijail_namespace_user(j);
+		minijail_namespace_pids(j);
+
+		if (!gidmap) {
+			/*
+			 * If no map is passed, map the current gid to the
+			 * chosen gid in the target namespace.
+			 */
+			gidmap = build_idmap(gid, getgid());
+		}
+		if (!has_cap_setgid()) {
+			/*
+			 * This means that we are not running as root,
+			 * so we also have to disable setgroups(2) to
+			 * be able to set the gid map.
+			 * See
+			 * http://man7.org/linux/man-pages/man7/user_namespaces.7.html
+			 */
+			minijail_namespace_user_disable_setgroups(j);
+		}
+		if (0 != minijail_gidmap(j, gidmap)) {
+			fprintf(stderr, "Could not set gid map.\n");
+			exit(1);
+		}
+		free(gidmap);
+	}
+}
+
 static void usage(const char *progn)
 {
 	size_t i;
@@ -242,7 +336,10 @@
 	int caps = 0, ambient_caps = 0;
 	int seccomp = -1;
 	const size_t path_max = 4096;
-	char *map;
+	uid_t uid = 0;
+	gid_t gid = 0;
+	char *uidmap = NULL, *gidmap = NULL;
+	int set_uidmap = 0, set_gidmap = 0;
 	size_t size;
 	const char *filter_path = NULL;
 	int log_to_stderr = 0;
@@ -263,17 +360,18 @@
 				  &longoption_index)) != -1) {
 		switch (opt) {
 		case 'u':
-			set_user(j, optarg);
+			set_user(j, optarg, &uid, &gid);
 			break;
 		case 'g':
-			set_group(j, optarg);
+			set_group(j, optarg, &gid);
 			break;
 		case 'n':
 			minijail_no_new_privs(j);
 			break;
 		case 's':
 			if (seccomp != -1 && seccomp != 1) {
-				fprintf(stderr, "Do not use -s & -S together.\n");
+				fprintf(stderr,
+					"Do not use -s & -S together.\n");
 				exit(1);
 			}
 			seccomp = 1;
@@ -281,7 +379,8 @@
 			break;
 		case 'S':
 			if (seccomp != -1 && seccomp != 2) {
-				fprintf(stderr, "Do not use -s & -S together.\n");
+				fprintf(stderr,
+					"Do not use -s & -S together.\n");
 				exit(1);
 			}
 			seccomp = 2;
@@ -419,49 +518,22 @@
 			minijail_namespace_pids(j);
 			break;
 		case 'm':
-			minijail_namespace_user(j);
-			minijail_namespace_pids(j);
-
-			if (optarg) {
-				map = strdup(optarg);
-			} else {
-				/*
-				 * If no map is passed, map the current uid to
-				 * root.
-				 */
-				map = build_idmap(0, getuid());
+			set_uidmap = 1;
+			if (uidmap) {
+				free(uidmap);
+				uidmap = NULL;
 			}
-			if (0 != minijail_uidmap(j, map)) {
-				fprintf(stderr, "Could not set uid map.\n");
-				exit(1);
-			}
-			free(map);
+			if (optarg)
+				uidmap = strdup(optarg);
 			break;
 		case 'M':
-			minijail_namespace_user(j);
-			minijail_namespace_pids(j);
-
-			if (optarg) {
-				map = strdup(optarg);
-			} else {
-				/*
-				 * If no map is passed, map the current gid to
-				 * root.
-				 * This means that we're likely *not* running as
-				 * root, so we also have to disable
-				 * setgroups(2) to be able to set the gid map.
-				 * See
-				 * http://man7.org/linux/man-pages/man7/user_namespaces.7.html
-				 */
-				minijail_namespace_user_disable_setgroups(j);
-
-				map = build_idmap(0, getgid());
+			set_gidmap = 1;
+			if (gidmap) {
+				free(gidmap);
+				gidmap = NULL;
 			}
-			if (0 != minijail_gidmap(j, map)) {
-				fprintf(stderr, "Could not set gid map.\n");
-				exit(1);
-			}
-			free(map);
+			if (optarg)
+				gidmap = strdup(optarg);
 			break;
 		case 'a':
 			if (0 != minijail_use_alt_syscall(j, optarg)) {
@@ -532,6 +604,12 @@
 		}
 	}
 
+	/* Set up uid/gid mapping. */
+	if (set_uidmap || set_gidmap) {
+		set_ugid_mapping(j, set_uidmap, uid, uidmap, set_gidmap, gid,
+				 gidmap);
+	}
+
 	/* Can only set ambient caps when using regular caps. */
 	if (ambient_caps && !caps) {
 		fprintf(stderr, "Can't set ambient capabilities (--ambient) "
diff --git a/system.c b/system.c
index 7d72eaa..11903a4 100644
--- a/system.c
+++ b/system.c
@@ -17,7 +17,9 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <grp.h>
 #include <net/if.h>
+#include <pwd.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
@@ -287,3 +289,73 @@
 	}
 	return chown(dest, uid, gid);
 }
+
+/*
+ * lookup_user: Gets the uid/gid for the given username.
+ */
+int lookup_user(const char *user, uid_t *uid, gid_t *gid)
+{
+	char *buf = NULL;
+	struct passwd pw;
+	struct passwd *ppw = NULL;
+	ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
+	if (sz == -1)
+		sz = 65536; /* your guess is as good as mine... */
+
+	/*
+	 * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
+	 * the maximum needed size of the buffer, so we don't have to search.
+	 */
+	buf = malloc(sz);
+	if (!buf)
+		return -ENOMEM;
+	getpwnam_r(user, &pw, buf, sz, &ppw);
+	/*
+	 * We're safe to free the buffer here. The strings inside |pw| point
+	 * inside |buf|, but we don't use any of them; this leaves the pointers
+	 * dangling but it's safe. |ppw| points at |pw| if getpwnam_r(3)
+	 * succeeded.
+	 */
+	free(buf);
+	/* getpwnam_r(3) does *not* set errno when |ppw| is NULL. */
+	if (!ppw)
+		return -1;
+
+	*uid = ppw->pw_uid;
+	*gid = ppw->pw_gid;
+	return 0;
+}
+
+/*
+ * lookup_group: Gets the gid for the given group name.
+ */
+int lookup_group(const char *group, gid_t *gid)
+{
+	char *buf = NULL;
+	struct group gr;
+	struct group *pgr = NULL;
+	ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
+	if (sz == -1)
+		sz = 65536; /* and mine is as good as yours, really */
+
+	/*
+	 * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
+	 * the maximum needed size of the buffer, so we don't have to search.
+	 */
+	buf = malloc(sz);
+	if (!buf)
+		return -ENOMEM;
+	getgrnam_r(group, &gr, buf, sz, &pgr);
+	/*
+	 * We're safe to free the buffer here. The strings inside gr point
+	 * inside buf, but we don't use any of them; this leaves the pointers
+	 * dangling but it's safe. pgr points at gr if getgrnam_r succeeded.
+	 */
+	free(buf);
+	/* getgrnam_r(3) does *not* set errno when |pgr| is NULL. */
+	if (!pgr)
+		return -1;
+
+	*gid = pgr->gr_gid;
+	return 0;
+}
diff --git a/system.h b/system.h
index 93537bf..2bdebe5 100644
--- a/system.h
+++ b/system.h
@@ -65,6 +65,9 @@
 int setup_mount_destination(const char *source, const char *dest, uid_t uid,
 			    uid_t gid, bool bind);
 
+int lookup_user(const char *user, uid_t *uid, gid_t *gid);
+int lookup_group(const char *group, gid_t *gid);
+
 #ifdef __cplusplus
 }; /* extern "C" */
 #endif