RFC: minijail: add libminijail.

Drewry requested an implementation of minijail that:

1) Is linkable against C programs
2) Does not depend on libbase
3) Supplies the necessary LD_PRELOAD hacks to use his syscall-filtering
   framework without the apply-after-exec hack and to use ptrace-disable.

Thoughts?

BUG=chromium-os:17937
TEST=Adhoc (extremely ;)). Proper test suite to be written; crosbug.com/18834

Change-Id: I8b34557a9a231dad75827c1a3d11f235f712648d
Signed-off-by: Elly Jones <ellyjones@chromium.org>
Reviewed-on: http://gerrit.chromium.org/gerrit/4585
Reviewed-by: Will Drewry <wad@chromium.org>
diff --git a/libminijail.c b/libminijail.c
new file mode 100644
index 0000000..3c5ec58
--- /dev/null
+++ b/libminijail.c
@@ -0,0 +1,387 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file. */
+
+#define _BSD_SOURCE
+#define _GNU_SOURCE
+#include <errno.h>
+#include <grp.h>
+#include <inttypes.h>
+#include <linux/capability.h>
+#include <linux/securebits.h>
+#include <pwd.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/capability.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <syslog.h>
+#include <unistd.h>
+
+#include "libminijail.h"
+#include "libminijail-private.h"
+
+/* Describes the restrictions to apply to a jailed process. |flags| records
+ * which restrictions were requested; the remaining fields hold their
+ * parameters. */
+struct minijail {
+  /* The flags are unsigned: a signed 1-bit bitfield can only hold 0 and -1,
+   * so storing 1 into it is implementation-defined. */
+  struct {
+    unsigned int uid : 1;        /* change to |uid| */
+    unsigned int gid : 1;        /* change to |gid| */
+    unsigned int caps : 1;       /* restrict capabilities to |caps| */
+    unsigned int vfs : 1;        /* unshare the mount namespace */
+    unsigned int pids : 1;       /* run in a new pid namespace */
+    unsigned int seccomp : 1;    /* enable seccomp */
+    unsigned int readonly : 1;   /* remount /proc read-only */
+    unsigned int usergroups : 1; /* initgroups() for |user| */
+    unsigned int ptrace : 1;     /* passed to the preload as "ptrace" */
+  } flags;
+  uid_t uid;
+  gid_t gid;
+  gid_t usergid;    /* primary gid of |user|, from the passwd entry */
+  const char *user; /* caller's string as given; not copied */
+  uint64_t caps;    /* bit i set means capability number i is kept */
+  pid_t initpid;    /* pid of the child created by minijail_run() */
+};
+
+/* Logs "libminijail: <failed> failed: <errno description>" to syslog at
+ * LOG_ERR and then aborts the process. Meant for failed calls that set errno.
+ * Never returns. */
+static void pdie(const char *failed) {
+  const char *reason = strerror(errno);
+  syslog(LOG_ERR, "libminijail: %s failed: %s", failed, reason);
+  abort();
+}
+
+/* Logs "libminijail: <failed>" to syslog at LOG_ERR and then aborts the
+ * process. Unlike pdie(), errno is not consulted. Never returns. */
+static void die(const char *failed) {
+  syslog(LOG_ERR, "libminijail: %s", failed);
+  abort();
+}
+
+/* Allocates a jail with every flag and parameter zeroed, i.e. one that
+ * requests no restrictions yet. Returns NULL if the allocation fails. The
+ * jail is released with minijail_destroy(). */
+struct minijail *minijail_new(void) {
+  struct minijail *jail = malloc(sizeof(*jail));
+  if (!jail)
+    return NULL;
+  memset(jail, 0, sizeof(*jail));
+  return jail;
+}
+
+/* Requests that the jailed process run with the given uid.
+ * Asking for uid 0 is treated as a caller bug and aborts the process. */
+void minijail_change_uid(struct minijail *j, uid_t uid) {
+  if (!uid)
+    die("useless change to uid 0");
+  j->flags.uid = 1;
+  j->uid = uid;
+}
+
+/* Requests that the jailed process run with the given gid.
+ * Asking for gid 0 is treated as a caller bug and aborts the process. */
+void minijail_change_gid(struct minijail *j, gid_t gid) {
+  if (!gid)
+    die("useless change to gid 0");
+  j->flags.gid = 1;
+  j->gid = gid;
+}
+
+/* Requests that the jailed process run as the named user.
+ *
+ * Looks |user| up in the password database, requests its uid through
+ * minijail_change_uid() (so a user whose uid is 0 aborts the process) and
+ * records the user's primary gid for the initgroups() call made when user
+ * groups are inherited.
+ *
+ * The jail stores the |user| pointer itself, not a copy, so the string must
+ * remain valid for as long as the jail is used.
+ *
+ * Returns 0 on success or a positive errno value on failure: ENOENT when no
+ * such user exists, otherwise the error getpwnam() reported. */
+int minijail_change_user(struct minijail *j, const char *user) {
+  /* In principle this should use getpwnam_r(), but:
+   * 1) getpwnam_r() isn't actually reentrant anyway, since it uses a
+   *    statically-allocated file descriptor internally
+   * 2) fgetpwnam() (by analogy with fgetpwent) would solve (1) except that it
+   *    doesn't exist
+   * 3) sysconf() (see getpwnam_r(3)) is allowed to return a size that is not
+   *    large enough, which means having to loop on growing the buffer we pass
+   *    in
+   */
+  struct passwd *pw;
+  /* getpwnam() returns NULL both on error and when the user simply does not
+   * exist, and may leave errno untouched in the latter case. Clear errno
+   * first so a missing user is never reported as success (0). */
+  errno = 0;
+  pw = getpwnam(user);
+  if (!pw)
+    return errno ? errno : ENOENT;
+  minijail_change_uid(j, pw->pw_uid);
+  j->user = user;
+  j->usergid = pw->pw_gid;
+  return 0;
+}
+
+/* Requests that the jailed process run as the named group.
+ *
+ * Looks |group| up in the group database and requests its gid through
+ * minijail_change_gid() (so a group whose gid is 0 aborts the process).
+ *
+ * Returns 0 on success or a positive errno value on failure: ENOENT when no
+ * such group exists, otherwise the error getgrnam() reported. */
+int minijail_change_group(struct minijail *j, const char *group) {
+  /* In principle this should use getgrnam_r(), but:
+   * 1) getgrnam_r() isn't actually reentrant anyway, since it uses a
+   *    statically-allocated file descriptor internally
+   * 2) fgetgrnam() (by analogy with fgetgrent) would solve (1) except that it
+   *    doesn't exist
+   * 3) sysconf() (see getgrnam_r(3)) is allowed to return a size that is not
+   *    large enough, which means having to loop on growing the buffer we pass
+   *    in
+   */
+  struct group *gr;
+  /* getgrnam() returns NULL both on error and when the group simply does not
+   * exist, and may leave errno untouched in the latter case. Clear errno
+   * first so a missing group is never reported as success (0). */
+  errno = 0;
+  gr = getgrnam(group);
+  if (!gr)
+    return errno ? errno : ENOENT;
+  minijail_change_gid(j, gr->gr_gid);
+  return 0;
+}
+
+/* Requests seccomp for the jailed process. minijail_enter() applies it with
+ * prctl(PR_SET_SECCOMP, 1); minijail_run() instead hands it to the preloaded
+ * library as a post-exec command. */
+void minijail_use_seccomp(struct minijail *j) {
+  j->flags.seccomp = 1;
+}
+
+/* Requests that the jailed process keep only the capabilities in |capmask|.
+ * Bit i of |capmask| corresponds to capability number i. */
+void minijail_use_caps(struct minijail *j, uint64_t capmask) {
+  j->caps = capmask;
+  j->flags.caps = 1;
+}
+
+/* Requests a private mount namespace for the jailed process
+ * (minijail_enter() calls unshare(CLONE_NEWNS)). */
+void minijail_namespace_vfs(struct minijail *j) {
+  j->flags.vfs = 1;
+}
+
+/* Requests a new pid namespace for the jailed process. Only minijail_run()
+ * can honor this (it clones with CLONE_NEWPID); minijail_enter() aborts if
+ * the flag is set. */
+void minijail_namespace_pids(struct minijail *j) {
+  j->flags.pids = 1;
+}
+
+/* Requests that /proc be remounted read-only inside the jail. This also
+ * turns on the private mount namespace, which the remount is performed in. */
+void minijail_remount_readonly(struct minijail *j) {
+  j->flags.vfs = 1;
+  j->flags.readonly = 1;
+}
+
+/* Requests that the jailed process take on the supplementary groups of the
+ * user set with minijail_change_user() (applied via initgroups()).
+ * minijail_enter() aborts if this is set but no user name was given. */
+void minijail_inherit_usergroups(struct minijail *j) {
+  j->flags.usergroups = 1;
+}
+
+/* Requests that ptrace be disabled for the jailed process. The flag is only
+ * acted on by minijail_run(), which passes it to the preloaded library as a
+ * post-exec command; minijail_enter() does not consult it. */
+void minijail_disable_ptrace(struct minijail *j) {
+  j->flags.ptrace = 1;
+}
+
+/* Replaces the /proc mount visible to this process with a freshly mounted
+ * read-only one (also nodev, noexec and nosuid).
+ * Returns 0 on success, or the errno left by the failing umount()/mount(). */
+static int remount_readonly(void) {
+  static const char proc_path[] = "/proc";
+  const unsigned long mount_flags =
+      MS_RDONLY | MS_NODEV | MS_NOEXEC | MS_NOSUID;
+  /* Right now, we're holding a reference to our parent's old mount of /proc in
+   * our namespace, which means using MS_REMOUNT here would mutate our parent's
+   * mount as well, even though we're in a VFS namespace (!). Instead, remove
+   * their mount from our namespace and make our own. */
+  if (umount(proc_path) != 0)
+    return errno;
+  if (mount("", proc_path, "proc", mount_flags, "") != 0)
+    return errno;
+  return 0;
+}
+
+/* Reduces the calling process's capabilities to those requested in j->caps.
+ *
+ * The effective, permitted and inheritable sets are rebuilt to hold only the
+ * requested capabilities, then every capability that was not requested is
+ * dropped from the bounding set. CAP_SETPCAP is held while that happens,
+ * because PR_CAPBSET_DROP requires it, and is given up at the end unless the
+ * caller asked to keep it. Any failure aborts the process. */
+static void drop_caps(const struct minijail *j) {
+  /* Capability bits are tested with a 64-bit one: shifting a plain int by 31
+   * or more is undefined, which would break every capability from 31 up. */
+  const uint64_t one = 1;
+  cap_t caps = cap_get_proc();
+  cap_value_t flag[1];
+  unsigned int i;
+  if (!caps)
+    die("can't get process caps");
+  if (cap_clear_flag(caps, CAP_INHERITABLE))
+    die("can't clear inheritable caps");
+  if (cap_clear_flag(caps, CAP_EFFECTIVE))
+    die("can't clear effective caps");
+  if (cap_clear_flag(caps, CAP_PERMITTED))
+    die("can't clear permitted caps");
+  for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
+    /* Keep CAP_SETPCAP for now even if it was not requested; it is needed to
+     * shrink the bounding set below. */
+    if (i != CAP_SETPCAP && !(j->caps & (one << i)))
+      continue;
+    flag[0] = i;
+    if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
+      die("can't add effective cap");
+    if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
+      die("can't add permitted cap");
+    if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
+      die("can't add inheritable cap");
+  }
+  if (cap_set_proc(caps))
+    die("can't apply cleaned capset");
+  for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
+    if (j->caps & (one << i))
+      continue;
+    if (prctl(PR_CAPBSET_DROP, i))
+      pdie("prctl(PR_CAPBSET_DROP)");
+  }
+  /* The bounding set is final, so CAP_SETPCAP is no longer needed. Unless it
+   * was explicitly requested, remove it so the jailed process does not keep a
+   * capability it never asked for. */
+  if (!(j->caps & (one << CAP_SETPCAP))) {
+    flag[0] = CAP_SETPCAP;
+    if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
+      die("can't clear effective cap");
+    if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
+      die("can't clear permitted cap");
+    if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
+      die("can't clear inheritable cap");
+    if (cap_set_proc(caps))
+      die("can't apply final cleaned capset");
+  }
+  cap_free(caps);
+}
+
+/* Applies the jail's restrictions to the calling process, in this order: new
+ * mount namespace, read-only /proc, securebits, supplementary groups, gid,
+ * uid, capabilities, seccomp. Any failure aborts the process.
+ *
+ * A jail that requests a pid namespace cannot be entered this way, and the
+ * ptrace flag is not consulted here. */
+void minijail_enter(const struct minijail *j) {
+  if (j->flags.pids)
+    die("tried to enter a pid-namespaced jail; try minijail_run()?");
+
+  /* initgroups() below needs the user name recorded by
+   * minijail_change_user(). */
+  if (j->flags.usergroups && !j->user)
+    die("usergroup inheritance without username");
+
+  /* We can't recover from failures if we've dropped privileges partially,
+   * so we don't even try. If any of our operations fail, we abort() the
+   * entire process. */
+  if (j->flags.vfs && unshare(CLONE_NEWNS))
+    pdie("unshare");
+
+  if (j->flags.readonly && remount_readonly())
+    pdie("remount");
+
+  if (j->flags.caps) {
+    /* POSIX capabilities are a bit tricky. If we drop our capability to change
+     * uids, our attempt to use setuid() below will fail. Hang on to root caps
+     * across setuid(), then lock securebits. */
+    if (prctl(PR_SET_KEEPCAPS, 1))
+      pdie("prctl(PR_SET_KEEPCAPS)");
+    if (prctl(PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
+      pdie("prctl(PR_SET_SECUREBITS)");
+  }
+
+  /* Either take on the user's supplementary groups or clear the list. */
+  if (j->flags.usergroups && initgroups(j->user, j->usergid))
+    pdie("initgroups");
+  else if (!j->flags.usergroups && setgroups(0, NULL))
+    pdie("setgroups");
+
+  /* The gid is changed before the uid. */
+  if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
+    pdie("setresgid");
+
+  if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
+    pdie("setresuid");
+
+  if (j->flags.caps)
+    drop_caps(j);
+
+  /* seccomp has to come last since it cuts off all the other
+   * privilege-dropping syscalls :) */
+  if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
+    pdie("prctl(PR_SET_SECCOMP)");
+}
+
+/* Raw wait() status of the jail's root process, stored by init() once that
+ * process has been reaped (0 until then). It is read from a signal handler,
+ * hence volatile sig_atomic_t. */
+static volatile sig_atomic_t init_exitstatus = 0;
+
+/* SIGTERM handler for the in-namespace init process: exits at once, reporting
+ * the root process's exit code the same way init() does. */
+static void init_term(int __attribute__((unused)) sig) {
+  const int status = init_exitstatus;
+  /* |status| is a raw wait status, not an exit code. Passing it straight to
+   * _exit() would keep only its low 8 bits, so a root process that exited
+   * with code 1 (status 0x0100) would be reported as success. */
+  if (!WIFEXITED(status))
+    _exit(MINIJAIL_ERR_INIT);
+  _exit(WEXITSTATUS(status));
+}
+
+/* Acts as init inside a pid namespace: reaps children until none are left,
+ * remembering the wait status of |rootpid|, then exits with that process's
+ * exit code, or with MINIJAIL_ERR_INIT if it did not exit normally.
+ * Never returns. */
+static int init(pid_t rootpid) {
+  int wstatus;
+  /* so that we exit with the right status */
+  signal(SIGTERM, init_term);
+  for (;;) {
+    /* This loop will only end when either there are no processes left inside
+     * our pid namespace or we get a signal. */
+    pid_t reaped = wait(&wstatus);
+    if (reaped <= 0)
+      break;
+    if (reaped == rootpid)
+      init_exitstatus = wstatus;
+  }
+  _exit(WIFEXITED(init_exitstatus) ? WEXITSTATUS(init_exitstatus)
+                                   : MINIJAIL_ERR_INIT);
+}
+
+/** @brief Move any commands that need to be done post-exec into an environment
+ *         variable
+ *  @param j Jail to move commands from.
+ *  @return 0 on success (including when there was nothing to move), -E2BIG if
+ *          the serialized commands do not fit in the buffer, or another
+ *          negative errno value if memory or the environment could not be
+ *          updated.
+ *
+ *  Serializes post-exec() commands into a string, removes them from the jail,
+ *  and adds them to the environment; they will be deserialized later (see
+ *  __minijail_preloaded) and executed inside the execve()'d process.
+ *
+ *  The commands are removed from the jail only once both environment
+ *  variables have been set, so a failed call leaves the jail unchanged. Note
+ *  that the environment modified is that of the calling process.
+ */
+static int move_commands_to_env(struct minijail *j) {
+  const char *ptrace = j->flags.ptrace ? "ptrace " : "";
+  const char *seccomp = j->flags.seccomp ? "seccomp " : "";
+  /* The uid change is deferred to after exec only when capabilities are
+   * deferred too. */
+  const int move_uid = j->flags.uid && j->flags.caps;
+  const char *oldenv;
+  char setuid[64] = "";
+  char caps[32] = "";
+  char envbuf[256];
+  char *newenv;
+  size_t newenv_size;
+  int r;
+
+  if (j->flags.caps)
+    snprintf(caps, sizeof(caps), "caps=%" PRIx64 " ", j->caps);
+
+  /* uid_t is unsigned; the cast keeps the existing "%d" wire format while
+   * making the argument type match the conversion. */
+  if (move_uid)
+    snprintf(setuid, sizeof(setuid), "uid=%d ", (int)j->uid);
+
+  r = snprintf(envbuf, sizeof(envbuf), "%s%s%s%s", setuid, ptrace, seccomp,
+               caps);
+  /* snprintf() returns the length it wanted to write, so truncation is any
+   * value >= the buffer size, not just equality. */
+  if (r < 0 || (size_t)r >= sizeof(envbuf))
+    return -E2BIG;
+  if (!r) {
+    /* No commands generated, so no preload needed :) */
+    return 0;
+  }
+
+  oldenv = getenv("LD_PRELOAD");
+  if (!oldenv)
+    oldenv = "";
+  /* Room for the old value, a separating space, the preload path and NUL. */
+  newenv_size = strlen(oldenv) + 2 + strlen(PRELOADPATH);
+  newenv = malloc(newenv_size);
+  if (!newenv)
+    return -ENOMEM;
+
+  /* Only insert a separating space if we have something to separate... */
+  snprintf(newenv, newenv_size, "%s%s%s", oldenv, *oldenv ? " " : "",
+           PRELOADPATH);
+
+  /* setenv() makes a copy of the string we give it. If it fails, the commands
+   * would never reach the exec'd process, so report the error rather than
+   * silently running without them. */
+  if (setenv("LD_PRELOAD", newenv, 1) || setenv(kCommandEnvVar, envbuf, 1)) {
+    r = errno ? -errno : -ENOMEM;
+    free(newenv);
+    return r;
+  }
+  free(newenv);
+
+  /* The commands now live in the environment; drop them from the jail so
+   * minijail_enter() does not apply them before exec. */
+  if (move_uid)
+    j->flags.uid = 0;
+  j->flags.caps = 0;
+  j->flags.ptrace = 0;
+  j->flags.seccomp = 0;
+  return 0;
+}
+
+/* Runs |filename| with arguments |argv| inside the jail, in a child process.
+ *
+ * Commands that must be applied after exec are first moved into the
+ * environment of the calling process (see move_commands_to_env()); the child
+ * then enters the jail and execs the program with the current environment.
+ * With a pid namespace requested, the direct child stays behind as that
+ * namespace's init and a grandchild execs the program.
+ *
+ * In the caller, returns 0 on success with the child's pid stored in
+ * j->initpid, or a negative errno value on failure. The child never returns
+ * from this function. */
+int minijail_run(struct minijail *j, const char *filename, char *const argv[]) {
+  unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
+  pid_t r;
+  r = move_commands_to_env(j);
+  if (r)
+    return r;
+
+  r = syscall(SYS_clone, pidns | SIGCHLD, NULL);
+  if (r > 0) {
+    j->initpid = r;
+    return 0;
+  }
+  /* syscall() signals failure with -1 and the cause in errno. Return -errno,
+   * like the failures above, instead of a bare -1 that callers cannot tell
+   * apart from -EPERM. */
+  if (r < 0)
+    return -errno;
+
+  /* Child from here on. minijail_enter() refuses pid-namespaced jails, and
+   * the namespace has already been created by clone() above. */
+  j->flags.pids = 0;
+
+  /* Jail this process and its descendants... */
+  minijail_enter(j);
+
+  if (pidns) {
+    /* pid namespace: this process will become init inside the new namespace, so
+     * fork off a child to actually run the program (we don't want all programs
+     * we might exec to have to know how to be init). */
+    r = fork();
+    if (r < 0)
+      _exit(r);
+    else if (r > 0)
+      init(r);  /* never returns */
+  }
+
+  /* If we aren't pid-namespaced:
+   *   calling process
+   *   -> execve()-ing process
+   * If we are:
+   *   calling process
+   *   -> init()-ing process
+   *      -> execve()-ing process
+   */
+  _exit(execve(filename, argv, environ));
+}
+
+/* Sends SIGTERM to the process started by minijail_run() and waits for it.
+ * Returns the raw wait status reported by waitpid() on success, or the errno
+ * value left by kill()/waitpid() if either call fails. */
+int minijail_kill(struct minijail *j) {
+  int wstatus;
+  if (kill(j->initpid, SIGTERM) != 0)
+    return errno;
+  if (waitpid(j->initpid, &wstatus, 0) < 0)
+    return errno;
+  return wstatus;
+}
+
+/* Waits for the process started by minijail_run() to terminate.
+ * Returns its exit code if it exited normally, MINIJAIL_ERR_JAIL if it did
+ * not, or the errno value left by waitpid() if the wait itself fails. */
+int minijail_wait(struct minijail *j) {
+  int wstatus;
+  if (waitpid(j->initpid, &wstatus, 0) < 0)
+    return errno;
+  return WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : MINIJAIL_ERR_JAIL;
+}
+
+/* Frees a jail allocated by minijail_new(). Only the structure itself is
+ * released; the user name string it may point to belongs to the caller. */
+void minijail_destroy(struct minijail *j) {
+  free(j);
+}
+