RFC: minijail: add libminijail.

Drewry requested an implementation of minijail that:

1) Would be linkable against C programs
2) Not depend on libbase
3) Supply the necessary LD_PRELOAD hacks to use his syscall-filtering framework
   without the apply-after-exec hack and to use ptrace-disable.

Thoughts?

BUG=chromium-os:17937
TEST=Adhoc (extremely ;)). Proper test suite to be written; crosbug.com/18834

Change-Id: I8b34557a9a231dad75827c1a3d11f235f712648d
Signed-off-by: Elly Jones <ellyjones@chromium.org>
Reviewed-on: http://gerrit.chromium.org/gerrit/4585
Reviewed-by: Will Drewry <wad@chromium.org>
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0a124ca
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,20 @@
+# Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+PRELOADPATH ?= \"/lib/libminijailpreload.so\"
+CFLAGS += -fPIC -Wall -Wextra -Werror -DPRELOADPATH="$(PRELOADPATH)"
+
+all : minijail0 libminijailpreload.so
+
+minijail0 : libminijail.o minijail0.c
+	$(CC) $(CFLAGS) -o $@ $^ -lcap
+
+libminijailpreload.so : libminijailpreload.c libminijail.o
+	$(CC) $(CFLAGS) -shared -o $@ $^ -ldl -lcap
+
+libminijail.o : libminijail.c libminijail.h libminijail-private.h
+# DESTDIR is an optional staging root; the '/' keeps paths absolute without it.
+install : minijail0 libminijailpreload.so
+	install -D minijail0 $(DESTDIR)/usr/sbin/minijail0
+	install -D libminijailpreload.so $(DESTDIR)/lib/libminijailpreload.so
diff --git a/libminijail-private.h b/libminijail-private.h
new file mode 100644
index 0000000..a304485
--- /dev/null
+++ b/libminijail-private.h
@@ -0,0 +1,15 @@
+/* libminijail-private.h
+ * Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * Values shared between libminijailpreload and libminijail, but not visible to
+ * the outside world.
+ */
+
+#ifndef LIBMINIJAIL_PRIVATE_H
+#define LIBMINIJAIL_PRIVATE_H
+/* Environment variable carrying the serialized post-exec jail commands. */
+static const char *kCommandEnvVar = "__MINIJAIL_PRELOAD";
+
+#endif /* !LIBMINIJAIL_PRIVATE_H */
diff --git a/libminijail.c b/libminijail.c
new file mode 100644
index 0000000..3c5ec58
--- /dev/null
+++ b/libminijail.c
@@ -0,0 +1,387 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file. */
+
+#define _BSD_SOURCE
+#define _GNU_SOURCE
+#include <errno.h>
+#include <grp.h>
+#include <inttypes.h>
+#include <linux/capability.h>
+#include <linux/securebits.h>
+#include <pwd.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/capability.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <syslog.h>
+#include <unistd.h>
+
+#include "libminijail.h"
+#include "libminijail-private.h"
+
+struct minijail {  /* restrictions recorded by the setters; applied on enter/run */
+  struct {  /* unsigned on purpose: a signed 1-bit field stores 1 as -1 */
+    unsigned int uid : 1;         /* 'uid' below is valid */
+    unsigned int gid : 1;         /* 'gid' below is valid */
+    unsigned int caps : 1;        /* 'caps' below is valid */
+    unsigned int vfs : 1;         /* unshare the mount namespace */
+    unsigned int pids : 1;        /* run in a new pid namespace */
+    unsigned int seccomp : 1;     /* enable seccomp mode 1 */
+    unsigned int readonly : 1;    /* remount /proc read-only */
+    unsigned int usergroups : 1;  /* initgroups() for 'user' */
+    unsigned int ptrace : 1;      /* set by minijail_disable_ptrace() */
+  } flags;
+  uid_t uid;
+  gid_t gid;
+  gid_t usergid;     /* primary gid of 'user', taken from its passwd entry */
+  const char *user;  /* caller-owned name; stored by pointer, not copied */
+  uint64_t caps;     /* capability mask: bit i stands for capability i */
+  pid_t initpid;     /* pid of the child created by minijail_run() */
+};
+
+static void pdie(const char *failed) {
+  syslog(LOG_ERR, "libminijail: %s failed: %s", failed, strerror(errno));
+  abort();  /* fatal by design: this function never returns to its caller */
+}
+
+static void die(const char *failed) {
+  syslog(LOG_ERR, "libminijail: %s", failed);
+  abort();  /* like pdie(), but 'failed' is the whole message (no errno text) */
+}
+
+struct minijail *minijail_new(void) {
+  /* A zero-filled jail has every flag clear, i.e. no restrictions at all.
+   * Yields NULL when the allocation fails. */
+  struct minijail *jail = calloc(1, sizeof(*jail));
+  return jail;
+}
+
+void minijail_change_uid(struct minijail *j, uid_t uid) {
+  if (uid == 0)
+    die("useless change to uid 0");  /* die() aborts the process */
+  j->uid = uid;
+  j->flags.uid = 1;  /* marks j->uid as valid for minijail_enter() */
+}
+
+void minijail_change_gid(struct minijail *j, gid_t gid) {
+  if (gid == 0)
+    die("useless change to gid 0");  /* die() aborts the process */
+  j->gid = gid;
+  j->flags.gid = 1;  /* marks j->gid as valid for minijail_enter() */
+}
+
+int minijail_change_user(struct minijail *j, const char *user) {
+  /* Records the uid and primary gid of 'user' in the jail; 'user' itself is
+   * stored by pointer, not copied. Returns 0 or a positive errno value.
+   * getpwnam() is used rather than getpwnam_r() because the latter is not
+   * truly reentrant either and needs a buffer-growing loop (sysconf() may
+   * report a size that is too small).
+   * getpwnam() may leave errno at 0 when the user simply does not exist, so
+   * clear it first and report ENOENT instead of returning 0 (success). */
+  struct passwd *pw;
+  errno = 0;
+  pw = getpwnam(user);
+  if (!pw)
+    return errno ? errno : ENOENT;
+  minijail_change_uid(j, pw->pw_uid);
+  j->user = user;
+  j->usergid = pw->pw_gid;
+  return 0;
+}
+
+int minijail_change_group(struct minijail *j, const char *group) {
+  /* Records the gid of 'group' in the jail. Returns 0 or a positive errno
+   * value. getgrnam() is used rather than getgrnam_r() because the latter is
+   * not truly reentrant either and needs a buffer-growing loop (sysconf() may
+   * report a size that is too small).
+   * getgrnam() may leave errno at 0 when the group simply does not exist, so
+   * clear it first and report ENOENT instead of returning 0 (success). */
+  struct group *gr;
+  errno = 0;
+  gr = getgrnam(group);
+  if (!gr)
+    return errno ? errno : ENOENT;
+  minijail_change_gid(j, gr->gr_gid);
+  return 0;
+}
+
+void minijail_use_seccomp(struct minijail *j) {
+  j->flags.seccomp = 1;  /* minijail_enter() calls prctl(PR_SET_SECCOMP, 1) */
+}
+
+void minijail_use_caps(struct minijail *j, uint64_t capmask) {
+  j->caps = capmask;  /* bit i set == keep capability i (see drop_caps()) */
+  j->flags.caps = 1;
+}
+
+void minijail_namespace_vfs(struct minijail *j) {
+  j->flags.vfs = 1;  /* minijail_enter() will unshare(CLONE_NEWNS) */
+}
+
+void minijail_namespace_pids(struct minijail *j) {
+  j->flags.pids = 1;  /* honored by minijail_run(); minijail_enter() dies on it */
+}
+
+void minijail_remount_readonly(struct minijail *j) {
+  j->flags.vfs = 1;  /* the read-only remount implies a VFS namespace */
+  j->flags.readonly = 1;
+}
+
+void minijail_inherit_usergroups(struct minijail *j) {
+  j->flags.usergroups = 1;  /* minijail_enter() dies unless j->user is set */
+}
+
+void minijail_disable_ptrace(struct minijail *j) {
+  j->flags.ptrace = 1;  /* NOTE(review): minijail_enter() never reads this flag */
+}
+
+static int remount_readonly(void) {
+  const char *kProcPath = "/proc";
+  const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
+  /* Returns 0, or the errno of the call that failed. We still reference our
+   * parent's /proc mount, so MS_REMOUNT would mutate the parent's mount too,
+   * even inside a VFS namespace (!). Instead, drop that mount from our
+   * namespace and mount a fresh read-only proc of our own. */
+  if (umount(kProcPath))
+    return errno;
+  if (mount("", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
+    return errno;
+  return 0;
+}
+
+static void drop_caps(const struct minijail *j) {  /* keep only caps in j->caps */
+  cap_t caps = cap_get_proc();
+  cap_value_t raise_flag[1];
+  unsigned int i;
+  if (!caps)
+    die("can't get process caps");
+  if (cap_clear_flag(caps, CAP_INHERITABLE))
+    die("can't clear inheritable caps");
+  if (cap_clear_flag(caps, CAP_EFFECTIVE))
+    die("can't clear effective caps");
+  if (cap_clear_flag(caps, CAP_PERMITTED))
+    die("can't clear permitted caps");
+  for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
+    if (i != CAP_SETPCAP && !(j->caps & ((uint64_t)1 << i)))
+      continue;  /* (uint64_t)1: an int shift is undefined once i reaches 31 */
+    raise_flag[0] = i;  /* CAP_SETPCAP is always kept for PR_CAPBSET_DROP below */
+    if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
+      die("can't add effective cap");
+    if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
+      die("can't add permitted cap");
+    if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
+      die("can't add inheritable cap");
+  }
+  if (cap_set_proc(caps))
+    die("can't apply cleaned capset");
+  cap_free(caps);
+  for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
+    if (j->caps & ((uint64_t)1 << i))
+      continue;  /* requested capability: leave it in the bounding set */
+    if (prctl(PR_CAPBSET_DROP, i))
+      pdie("prctl(PR_CAPBSET_DROP)");
+  }
+}
+
+void minijail_enter(const struct minijail *j) {
+  if (j->flags.pids)
+    die("tried to enter a pid-namespaced jail; try minijail_run()?");
+  /* NOTE(review): nothing below acts on j->flags.ptrace. */
+  if (j->flags.usergroups && !j->user)
+    die("usergroup inheritance without username");
+
+  /* We can't recover from failures if we've dropped privileges partially,
+   * so we don't even try. If any of our operations fail, we abort() the
+   * entire process. */
+  if (j->flags.vfs && unshare(CLONE_NEWNS))
+    pdie("unshare");
+
+  if (j->flags.readonly && remount_readonly())
+    pdie("remount");
+
+  if (j->flags.caps) {
+    /* POSIX capabilities are a bit tricky. If we drop our capability to change
+     * uids, our attempt to use setuid() below will fail. Hang on to root caps
+     * across setuid(), then lock securebits. */
+    if (prctl(PR_SET_KEEPCAPS, 1))
+      pdie("prctl(PR_SET_KEEPCAPS)");
+    if (prctl(PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
+      pdie("prctl(PR_SET_SECUREBITS)");
+  }
+
+  if (j->flags.usergroups && initgroups(j->user, j->usergid))
+    pdie("initgroups");
+  else if (!j->flags.usergroups && setgroups(0, NULL))
+    pdie("setgroups");
+
+  if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
+    pdie("setresgid");
+
+  if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
+    pdie("setresuid");
+
+  if (j->flags.caps)
+    drop_caps(j);
+
+  /* seccomp has to come last since it cuts off all the other
+   * privilege-dropping syscalls :) */
+  if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
+    pdie("prctl(PR_SET_SECCOMP)");
+}
+
+static int init_exitstatus = 0;
+/* NOTE(review): holds a raw wait() status, yet is passed to _exit() as-is. */
+static void init_term(int __attribute__((unused)) sig) {
+  _exit(init_exitstatus);
+}
+
+static int init(pid_t rootpid) {  /* never returns: every path ends in _exit() */
+  pid_t pid;
+  int status;
+  signal(SIGTERM, init_term); /* so that we exit with the right status */
+  while ((pid = wait(&status)) > 0) {
+    /* This loop will only end when either there are no processes left inside
+     * our pid namespace or we get a signal. */
+    if (pid == rootpid)
+      init_exitstatus = status;  /* remember how the process we forked ended */
+  }
+  if (!WIFEXITED(init_exitstatus))
+    _exit(MINIJAIL_ERR_INIT);
+  _exit(WEXITSTATUS(init_exitstatus));
+}
+
+/** @brief Move any commands that need to be done post-exec into an environment
+ *         variable
+ *  @param j Jail to move commands from.
+ *  @return 0 on success, or a negative errno value (-E2BIG, -ENOMEM, ...).
+ *
+ *  Serializes post-exec() commands into a string, removes them from the jail,
+ *  and adds them to the environment (kCommandEnvVar, plus PRELOADPATH appended
+ *  to LD_PRELOAD); they are deserialized later by the preload library and
+ *  executed inside the execve()'d process.
+ */
+static int move_commands_to_env(struct minijail *j) {
+  const char *ptrace = j->flags.ptrace ? "ptrace " : "";
+  const char *seccomp = j->flags.seccomp ? "seccomp " : "";
+  char setuid[64] = "";
+  char caps[32] = "";
+  char envbuf[256];
+  const char *oldenv;
+  char *newenv;
+  int r;
+
+  if (j->flags.caps)
+    snprintf(caps, sizeof(caps), "caps=%" PRIx64 " ", j->caps);
+
+  /* The uid change is only deferred when caps are in use; otherwise it stays
+   * in the jail and minijail_enter() performs it before exec. */
+  if (j->flags.uid && j->flags.caps) {
+    snprintf(setuid, sizeof(setuid), "uid=%d ", (int)j->uid);
+    j->flags.uid = 0;
+  }
+
+  j->flags.caps = 0;
+  j->flags.ptrace = 0;
+  j->flags.seccomp = 0;
+
+  r = snprintf(envbuf, sizeof(envbuf), "%s%s%s%s", setuid, ptrace, seccomp,
+               caps);
+  if (!r)
+    return 0;  /* No commands generated, so no preload needed :) */
+  /* snprintf() returns the untruncated length, so >= means truncation. */
+  if (r < 0 || (size_t)r >= sizeof(envbuf))
+    return -E2BIG;
+
+  oldenv = getenv("LD_PRELOAD");
+  if (!oldenv)
+    oldenv = "";
+  /* Room for: oldenv, one separating space, PRELOADPATH, terminating NUL. */
+  newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
+  if (!newenv)
+    return -ENOMEM;
+
+  /* Only insert a separating space if we have something to separate... */
+  sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", PRELOADPATH);
+
+  /* setenv() makes a copy of the string we give it. A failure must be
+   * reported: the flags were already cleared from the jail, so carrying on
+   * would silently run the target without these restrictions. */
+  r = 0;
+  if (setenv("LD_PRELOAD", newenv, 1) || setenv(kCommandEnvVar, envbuf, 1))
+    r = -errno;
+  free(newenv);
+  return r;
+}
+
+int minijail_run(struct minijail *j, const char *filename, char *const argv[]) {
+  unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
+  pid_t r;
+  r = move_commands_to_env(j);
+  if (r)
+    return r;
+  /* Parent: record the child's pid and return 0. Child: continues below. */
+  r = syscall(SYS_clone, pidns | SIGCHLD, NULL);
+  if (r > 0) {
+    j->initpid = r;
+    return 0;
+  }
+  if (r < 0)
+    return r;
+  /* Child only. Clear the flag so minijail_enter() does not refuse the jail. */
+  j->flags.pids = 0;
+
+  /* Jail this process and its descendants... */
+  minijail_enter(j);
+
+  if (pidns) {
+    /* pid namespace: this process will become init inside the new namespace, so
+     * fork off a child to actually run the program (we don't want all programs
+     * we might exec to have to know how to be init). */
+    r = fork();
+    if (r < 0)
+      _exit(r);
+    else if (r > 0)
+      init(r);  /* never returns */
+  }
+
+  /* If we aren't pid-namespaced:
+   *   calling process
+   *   -> execve()-ing process
+   * If we are:
+   *   calling process
+   *   -> init()-ing process
+   *      -> execve()-ing process
+   */
+  _exit(execve(filename, argv, environ));
+}
+
+int minijail_kill(struct minijail *j) {
+  int st;
+  if (kill(j->initpid, SIGTERM))
+    return errno;
+  if (waitpid(j->initpid, &st, 0) < 0)
+    return errno;
+  return st;  /* raw wait status; minijail_wait() decodes it instead */
+}
+
+int minijail_wait(struct minijail *j) {
+  int st;
+  if (waitpid(j->initpid, &st, 0) < 0)
+    return errno;
+  if (!WIFEXITED(st))
+    return MINIJAIL_ERR_JAIL;  /* the child did not exit normally */
+  return WEXITSTATUS(st);
+}
+
+void minijail_destroy(struct minijail *j) {
+  free(j);  /* j->user is caller-owned and is not freed here */
+}
+
diff --git a/libminijail.h b/libminijail.h
new file mode 100644
index 0000000..0df119e
--- /dev/null
+++ b/libminijail.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file. */
+
+/* The general pattern of use here:
+ * 1) Construct a minijail with minijail_new()
+ * 2) Apply the desired restrictions to it
+ * 3a) Enter it with minijail_enter(), locking the current process inside, or
+ * 3b) Run a process inside it with minijail_run()
+ * 4) Destroy it with minijail_destroy().
+ */
+
+#ifndef LIBMINIJAIL_H_
+#define LIBMINIJAIL_H_
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum {
+  MINIJAIL_ERR_PRELOAD = 252,  /* returned by the preload library's fake main() */
+  MINIJAIL_ERR_JAIL = 253,     /* minijail_wait(): child did not exit normally */
+  MINIJAIL_ERR_INIT = 254,     /* jail's init: root process had no exit code */
+};
+
+struct minijail;
+
+/* Allocates a new minijail with no restrictions. */
+struct minijail *minijail_new(void);
+
+/* These functions add restrictions to the minijail. They are not applied until
+ * minijail_enter() is called. See the documentation in minijail0.1 for
+ * explanations in detail of what the restrictions do. */
+void minijail_change_uid(struct minijail *j, uid_t uid);
+void minijail_change_gid(struct minijail *j, gid_t gid);
+/* 'user' should be kept valid until minijail_destroy() */
+int minijail_change_user(struct minijail *j, const char *user);
+/* 'group' should be kept valid until minijail_destroy() */
+int minijail_change_group(struct minijail *j, const char *group);
+void minijail_use_seccomp(struct minijail *j);
+void minijail_use_caps(struct minijail *j, uint64_t capmask);
+void minijail_namespace_vfs(struct minijail *j);
+void minijail_namespace_pids(struct minijail *j);
+void minijail_remount_readonly(struct minijail *j);
+void minijail_inherit_usergroups(struct minijail *j);
+void minijail_disable_ptrace(struct minijail *j);
+
+/* Lock this process into the given minijail. Note that this procedure cannot fail,
+ * since there is no way to undo privilege-dropping; therefore, if any part of
+ * the privilege-drop fails, minijail_enter() will abort the entire process.
+ *
+ * Some restrictions cannot be enabled this way (pid namespaces) and attempting
+ * to do so will cause an abort.
+ */
+void minijail_enter(const struct minijail *j);
+
+/* Run the specified command in the given minijail, execve(3)-style. This is
+ * required if minijail_namespace_pids() was used. */
+int minijail_run(struct minijail *j, const char *filename, char *const argv[]);
+
+/* Kill the specified minijail. The minijail must have been created with pid
+ * namespacing; if it was, all processes inside it are atomically killed. */
+int minijail_kill(struct minijail *j);
+
+/* Wait for all processes in the specified minijail to exit. Returns the exit
+ * status of the _first_ process spawned in the jail. */
+int minijail_wait(struct minijail *j);
+
+/* Frees the given minijail. It does not matter if the process is inside the minijail or
+ * not. */
+void minijail_destroy(struct minijail *j);
+
+#ifdef __cplusplus
+};  /* extern "C" */
+#endif
+
+#endif /* !LIBMINIJAIL_H_ */
diff --git a/libminijailpreload.c b/libminijailpreload.c
new file mode 100644
index 0000000..975c335
--- /dev/null
+++ b/libminijailpreload.c
@@ -0,0 +1,146 @@
+/* libminijailpreload.c - preload hack library
+ * Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ *
+ * This library is preloaded into every program launched by minijail_run().
+ * DO NOT EXPORT ANY SYMBOLS FROM THIS LIBRARY. They will replace other symbols
+ * in the programs it is preloaded into and cause impossible-to-debug failures.
+ * See the minijail0.1 for a design explanation. */
+
+#include "libminijail.h"
+#include "libminijail-private.h"
+
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <unistd.h>
+
+static int (*real_main)(int, char **, char **) = NULL;
+static void *libc_handle = NULL;
+
+static void die(const char *failed) {
+  syslog(LOG_ERR, "libminijail: %s", failed);
+  abort();  /* fatal: the preload cannot continue without its jail */
+}
+
+static void unset_in_env(char **envp, const char *name) {
+  /* Blank, in place, every "name=..." entry; "nameX=..." must not match. */
+  size_t len = strlen(name);
+  for (; *envp; envp++)
+    if (!strncmp(*envp, name, len) && (*envp)[len] == '=') (*envp)[0] = '\0';
+}
+
+static void splitarg(char *str, char **key, char **val) {
+  *key = strsep(&str, "=");  /* text before the first '='; 'str' is modified */
+  *val = strsep(&str, "");   /* the remainder, or NULL if there was no '=' */
+}
+
+/** @brief Fake main(), spliced in before the real call to main() by
+ *         __libc_start_main (see below).
+ *  We get serialized commands from our invoking process in an environment
+ *  variable (kCommandEnvVar). The environment variable is a list of key=value
+ *  pairs (see move_commands_to_env); we use them to construct a jail, then
+ *  enter it.
+ */
+static int fake_main(int argc, char **argv, char **envp) {
+  char *args = getenv(kCommandEnvVar);
+  char *copy, *oldcopy;
+  char *arg;
+  struct minijail *j;
+  if (geteuid() != getuid() || getegid() != getgid())
+    /* If we didn't do this check, an attacker could set kCommandEnvVar for
+     * any setuid program that uses libminijail to cause it to get capabilities
+     * or a uid it did not expect. */
+    return MINIJAIL_ERR_PRELOAD;
+  if (!args)
+    return MINIJAIL_ERR_PRELOAD;
+  if (!(copy = strdup(args)))
+    die("preload: out of memory");
+  oldcopy = copy;  /* strsep() advances 'copy'; keep the base pointer to free */
+  j = minijail_new();
+  if (!j)
+    die("preload: out of memory");
+  while ((arg = strsep(&copy, " "))) {
+    char *key, *val;
+    unsigned long long v;
+    splitarg(arg, &key, &val);
+    if (!strcmp(key, "caps")) {
+      v = strtoull(val, NULL, 16);  /* 64-bit mask; unsigned long may be 32 */
+      minijail_use_caps(j, v);
+    }
+    else if (!strcmp(key, "ptrace"))
+      minijail_disable_ptrace(j);
+    else if (!strcmp(key, "uid")) {
+      v = atoi(val);
+      minijail_change_uid(j, v);
+    }
+    else if (!strcmp(key, "seccomp"))
+      minijail_use_seccomp(j);
+  }
+  /* TODO(ellyjones): this trashes existing preloads, so one can't do:
+   * LD_PRELOAD="/tmp/test.so libminijailpreload.so" prog; the descendants of
+   * prog will have no LD_PRELOAD set at all. */
+  unset_in_env(envp, "LD_PRELOAD");
+  minijail_enter(j);
+  minijail_destroy(j);
+  free(oldcopy);
+  dlclose(libc_handle);
+  return real_main(argc, argv, envp);
+}
+
+/** @brief LD_PRELOAD override of __libc_start_main.
+ *
+ *  It is really best if you do not look too closely at this function.
+ *  We need to ensure that some of our code runs before the target program (see
+ *  the minijail0.1 file in this directory for high-level details about this), and
+ *  the only available place to hook is this function, which is normally
+ *  responsible for calling main(). Our LD_PRELOAD will overwrite the real
+ *  __libc_start_main with this one, so we have to look up the real one from
+ *  libc and invoke it with a pointer to the fake main() we'd like to run before
+ *  the real main(). We can't just run our setup code *here* because
+ *  __libc_start_main is responsible for setting up the C runtime environment,
+ *  so we can't rely on things like malloc() being available yet.
+ */
+
+int __libc_start_main(int (*main) (int, char **, char **),
+                      int argc, char ** ubp_av, void (*init) (void),
+                      void (*fini) (void), void (*rtld_fini) (void),
+                      void (* stack_end)) {
+  void *sym;
+  /* This hack is unfortunately required by C99 - casting directly from void* to
+   * function pointers is left undefined. See POSIX.1-2003, the Rationale for
+   * the specification of dlsym(), and dlsym(3). This deliberately violates
+   * strict-aliasing rules, but gcc can't tell. */
+  union {
+    int (*fn)(int (*main) (int, char **, char **), int argc,
+                     char **ubp_av, void (*init) (void), void (*fini) (void),
+                     void (*rtld_fini) (void), void (* stack_end));
+    void *symval;
+  } real_libc_start_main;
+
+  /* We hold this handle for the duration of the real __libc_start_main();
+   * fake_main() dlclose()s it just before calling the real main(). */
+  libc_handle = dlopen("libc.so.6", RTLD_NOW);
+
+  if (!libc_handle) {
+    syslog(LOG_ERR, "can't dlopen() libc");
+    /* We dare not use abort() here because it will run atexit() handlers and
+     * try to flush stdio. */
+    _exit(1);
+  }
+  sym = dlsym(libc_handle, "__libc_start_main");
+  if (!sym) {
+    syslog(LOG_ERR, "can't find the real __libc_start_main()");
+    _exit(1);
+  }
+  real_libc_start_main.symval = sym;  /* stored as void*, read back via .fn */
+  real_main = main;  /* fake_main() chains to this once the jail is entered */
+
+  /* Note that we swap fake_main in for main - fake_main knows that it should
+   * call real_main after it's done. */
+  return real_libc_start_main.fn(fake_main, argc, ubp_av, init, fini, rtld_fini,
+                                 stack_end);
+}
diff --git a/minijail0.1 b/minijail0.1
new file mode 100644
index 0000000..15ceeca
--- /dev/null
+++ b/minijail0.1
@@ -0,0 +1,71 @@
+.TH MINIJAIL0 "1" "July 2011" "Chromium OS" "User Commands"
+.SH NAME
+minijail0 \- sandbox a process
+.SH SYNOPSIS
+.B minijail0
+[\fIOPTION\fR]... <\fIprogram\fR> [\fIargs\fR]...
+.SH DESCRIPTION
+.PP
+Runs PROGRAM inside a sandbox.
+.TP
+\fB-c <caps>\fR
+Restrict capabilities to \fIcaps\fR. When used in conjunction with \fB-u\fR and
+\fB-g\fR, this allows a program to have access to only certain parts of root's
+default privileges while running as another user and group ID altogether. Note
+that these capabilities are not inherited by subprocesses of the process given
+capabilities unless those subprocesses have POSIX file capabilities. See
+\fBcapabilities\fR(7).
+.TP
+\fB-G\fR
+Inherit all the supplementary groups of the user specified with \fB-u\fR. It
+is an error to use this option without having specified a \fBuser name\fR to
+\fB-u\fR.
+.TP
+\fB-g <group>\fR
+Change groups to \fIgroup\fR, which may be either a group name or a numeric
+group ID.
+.TP
+\fB-h\fR
+Print a help message.
+.TP
+\fB-p\fR
+Run inside a new PID namespace. This option will make it impossible for the
+program to see or affect processes that are not its descendants.
+.TP
+\fB-r\fR
+Remount certain filesystems readonly. Currently this only remounts /proc. This
+implies \fB-v\fR. Remounting /proc readonly means that even if the process has
+write access to a system config knob in /proc (e.g., in /proc/sys/kernel), it
+cannot change the value.
+.TP
+\fB-s\fR
+Enable seccomp(2) in mode 1, which restricts the child process to a very small
+set of system calls. Support for more elaborate syscall filtering is coming.
+.TP
+\fB-u <user>\fR
+Change users to \fIuser\fR, which may be either a user name or a numeric user
+ID.
+.TP
+\fB-v\fR
+Run inside a new VFS namespace. This option makes the program's mountpoints
+independent of the rest of the system's.
+.SH IMPLEMENTATION
+This program is broken up into two parts: \fBminijail0\fR (the frontend) and a helper
+library called \fBlibminijailpreload\fR. Some jailings can only be achieved from
+the process to which they will actually apply - specifically capability use
+(since capabilities are not inherited to an exec'd process unless the exec'd
+process has POSIX file capabilities), seccomp (since we can't exec() once we're
+seccomp'd), and ptrace-disable (which is always cleared on exec()).
+
+To this end, \fBlibminijailpreload\fR is forcibly loaded into all
+dynamically-linked target programs if any of these restrictions are in effect;
+we pass the specific restrictions in an environment variable which the preloaded
+library looks for. The forcibly-loaded library then applies the restrictions
+to the newly-loaded program.
+.SH AUTHOR
+Written by Elly Jones (ellyjones@chromium.org)
+.SH COPYRIGHT
+Copyright \(co 2011 The Chromium OS Authors
+License BSD-like.
+.SH "SEE ALSO"
+\fBlibminijail.h\fR
diff --git a/minijail0.c b/minijail0.c
new file mode 100644
index 0000000..654f332
--- /dev/null
+++ b/minijail0.c
@@ -0,0 +1,110 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libminijail.h"
+
+static void set_user(struct minijail *j, const char *arg) {
+  char *end = NULL;
+  unsigned long uid = strtoul(arg, &end, 10);  /* decimal only; no 1e3/0x10/inf */
+  if (!*end && *arg) {
+    minijail_change_uid(j, uid);
+    return;
+  }
+  /* Not a plain decimal number: treat 'arg' as a user name. */
+  if (minijail_change_user(j, arg)) {
+    fprintf(stderr, "Bad user: '%s'\n", arg);
+    exit(1);
+  }
+}
+
+static void set_group(struct minijail *j, const char *arg) {
+  char *end = NULL;
+  unsigned long gid = strtoul(arg, &end, 10);  /* decimal only; no 1e3/0x10/inf */
+  if (!*end && *arg) {
+    minijail_change_gid(j, gid);
+    return;
+  }
+  /* Not a plain decimal number: treat 'arg' as a group name. */
+  if (minijail_change_group(j, arg)) {
+    fprintf(stderr, "Bad group: '%s'\n", arg);
+    exit(1);
+  }
+}
+
+static void use_caps(struct minijail *j, const char *arg) {
+  uint64_t caps;  /* parsed from 'arg' as a hexadecimal bitmask */
+  char *end = NULL;
+  caps = strtoull(arg, &end, 16);
+  if (!*arg || *end) {  /* also reject "", which strtoull() reads as 0 */
+    fprintf(stderr, "Invalid cap set: '%s'\n", arg);
+    exit(1);
+  }
+  minijail_use_caps(j, caps);
+}
+
+static void usage(const char *progn) {  /* progn: name shown in the synopsis */
+  printf("Usage: %s [-Ghprsv] [-c <caps>] [-g <group>] [-u <user>] <program> [args...]\n"
+         "  -c: restrict caps to <caps>\n"
+         "  -G: inherit groups from uid\n"
+         "  -g: change gid to <group>\n"
+         "  -h: help (this message)\n"
+         "  -p: use pid namespace\n"
+         "  -r: remount filesystems readonly (implies -v)\n"
+         "  -s: use seccomp\n"
+         "  -u: change uid to <user>\n"
+         "  -v: use vfs namespace\n", progn);
+}
+
+int main(int argc, char *argv[]) {
+  struct minijail *j = minijail_new();
+  int opt;
+  if (!j)
+    return 1;  /* minijail_new() failed: out of memory */
+  while ((opt = getopt(argc, argv, "u:g:sc:vrGhp")) != -1) {
+    switch (opt) {
+      case 'u':
+        set_user(j, optarg);
+        break;
+      case 'g':
+        set_group(j, optarg);
+        break;
+      case 's':
+        minijail_use_seccomp(j);
+        break;
+      case 'c':
+        use_caps(j, optarg);
+        break;
+      case 'v':
+        minijail_namespace_vfs(j);
+        break;
+      case 'r':
+        minijail_remount_readonly(j);
+        break;
+      case 'G':
+        minijail_inherit_usergroups(j);
+        break;
+      case 'p':
+        minijail_namespace_pids(j);
+        break;
+      default:  /* covers -h as well as unknown options */
+        usage(argv[0]);
+        exit(1);
+    }
+  }
+
+  if (argc == optind) {  /* no program to run was given */
+    usage(argv[0]);
+    exit(1);
+  }
+  if (minijail_run(j, argv[optind], &argv[optind])) {
+    fprintf(stderr, "Failed to run '%s'\n", argv[optind]);
+    exit(1);  /* no child was started, so there is nothing to minijail_wait() */
+  }
+  return minijail_wait(j);
+}