Integrate BPF seccomp filters into Minijail.

BUG=chromium-os:25429
BUG=chromium-os:27878
TEST=security_Minijail_seccomp
CQ-DEPEND=I13a9b22ac8d55f02d5a77b5beedb955386b63723

Change-Id: I5fa8f40b9a539a61d69439cad778c926fc934cb1
Reviewed-on: https://gerrit.chromium.org/gerrit/19527
Tested-by: Jorge Lucangeli Obes <jorgelo@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Commit-Ready: Jorge Lucangeli Obes <jorgelo@chromium.org>
diff --git a/libminijail.c b/libminijail.c
index 0080c49..78cfbc1 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -6,6 +6,7 @@
 
 #define _BSD_SOURCE
 #define _GNU_SOURCE
+#include <asm/unistd.h>
 #include <ctype.h>
 #include <errno.h>
 #include <grp.h>
@@ -17,6 +18,7 @@
 #include <sched.h>
 #include <signal.h>
 #include <stdarg.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -25,6 +27,7 @@
 #include <sys/mount.h>
 #include <sys/param.h>
 #include <sys/prctl.h>
+#include <sys/user.h>
 #include <sys/wait.h>
 #include <syslog.h>
 #include <unistd.h>
@@ -33,13 +36,19 @@
 #include "libsyscalls.h"
 #include "libminijail-private.h"
 
+#include "syscall_filter.h"
+
 /* Until these are reliably available in linux/prctl.h */
-#ifndef PR_SET_SECCOMP_FILTER
-# define PR_SECCOMP_FILTER_SYSCALL 0
-# define PR_SECCOMP_FILTER_EVENT 1
-# define PR_GET_SECCOMP_FILTER 35
-# define PR_SET_SECCOMP_FILTER 36
-# define PR_CLEAR_SECCOMP_FILTER 37
+#ifndef PR_SET_SECCOMP
+# define PR_SET_SECCOMP	22
+#endif
+
+/* For seccomp_filter using BPF. */
+#ifndef PR_SET_NO_NEW_PRIVS
+# define PR_SET_NO_NEW_PRIVS 38
+#endif
+#ifndef SECCOMP_MODE_FILTER
+# define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
 #endif
 
 #define die(_msg, ...) do { \
@@ -53,12 +62,6 @@
 #define warn(_msg, ...) \
 	syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
 
-struct seccomp_filter {
-	int nr;
-	char *filter;
-	struct seccomp_filter *next, *prev;
-};
-
 struct binding {
 	char *src;
 	char *dest;
@@ -86,10 +89,10 @@
 	char *user;
 	uint64_t caps;
 	pid_t initpid;
-	int filter_count;
+	int filter_len;
 	int binding_count;
 	char *chrootdir;
-	struct seccomp_filter *filters;
+	struct sock_fprog *filter_prog;
 	struct binding *bindings_head;
 	struct binding *bindings_tail;
 };
@@ -279,106 +282,21 @@
 	return -ENOMEM;
 }
 
-int API minijail_add_seccomp_filter(struct minijail *j, int nr,
-                                    const char *filter)
-{
-	struct seccomp_filter *sf;
-	if (!filter || nr < 0)
-		return -EINVAL;
-
-	sf = malloc(sizeof(*sf));
-	if (!sf)
-		return -ENOMEM;
-	sf->nr = nr;
-	sf->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
-	if (!sf->filter) {
-		free(sf);
-		return -ENOMEM;
-	}
-
-	j->filter_count++;
-
-	if (!j->filters) {
-		j->filters = sf;
-		sf->next = sf;
-		sf->prev = sf;
-		return 0;
-	}
-	sf->next = j->filters;
-	sf->prev = j->filters->prev;
-	sf->prev->next = sf;
-	j->filters->prev = sf;
-	return 0;
-}
-
-int API minijail_lookup_syscall(const char *name)
-{
-	const struct syscall_entry *entry = syscall_table;
-	for (; entry->name && entry->nr >= 0; ++entry)
-		if (!strcmp(entry->name, name))
-			return entry->nr;
-	return -1;
-}
-
-char *strip(char *s)
-{
-	char *end;
-	while (*s && isblank(*s))
-		s++;
-	end = s + strlen(s) - 1;
-	while (*end && (isblank(*end) || *end == '\n'))
-		end--;
-	*(end + 1) = '\0';
-	return s;
-}
-
 void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
 {
 	FILE *file = fopen(path, "r");
-	char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
-	int count = 0;
-	if (!file)
-		pdie("failed to open seccomp filters file");
-
-	/*
-	 * Format is simple:
-	 * syscall_name<COLON><FILTER STRING>[\n|EOF]
-	 * #...comment...
-	 * <empty line?
-	 */
-	while (fgets(line, sizeof(line), file)) {
-		char *filter = line;
-		char *name = strsep(&filter, ":");
-		char *name_end = NULL;
-		int nr = -1;
-		count++;
-
-		/* Allow comment lines */
-		if (*name == '#')
-			continue;
-
-		name = strip(name);
-
-		if (!filter) {
-			if (strlen(name))
-				die("invalid filter on line %d", count);
-			/* Allow empty lines */
-			continue;
-		}
-
-		filter = strip(filter);
-
-		/* Take direct syscall numbers */
-		nr = strtol(name, &name_end, 0);
-		/* Or fail-over to using names */
-		if (*name_end != '\0')
-			nr = minijail_lookup_syscall(name);
-		if (nr < 0)
-			die("syscall '%s' unknown", name);
-
-		if (minijail_add_seccomp_filter(j, nr, filter))
-			pdie("failed to add filter for syscall '%s'", name);
+	if (!file) {
+		pdie("failed to open seccomp filters file '%s'", path);
 	}
+	/* Compile the policy file into a BPF program kept on the jail. */
+	struct sock_fprog *fprog = malloc(sizeof(*fprog));
+	if (!fprog)
+		die("failed to allocate seccomp filter program");
+	if (compile_filter(file, fprog))
+		die("failed to compile seccomp filters BPF program in '%s'", path);
+
+	j->filter_len = fprog->len;
+	j->filter_prog = fprog;
 	fclose(file);
 }
 
@@ -420,13 +338,10 @@
 		marshal_append(state, j->user, strlen(j->user) + 1);
 	if (j->chrootdir)
 		marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
-	if (j->flags.seccomp_filter && j->filters) {
-		struct seccomp_filter *f = j->filters;
-		do {
-			marshal_append(state, (char *)&f->nr, sizeof(f->nr));
-			marshal_append(state, f->filter, strlen(f->filter) + 1);
-			f = f->next;
-		} while (f != j->filters);
+	if (j->flags.seccomp_filter && j->filter_prog) {
+		struct sock_fprog *fp = j->filter_prog;
+		marshal_append(state, (char *)fp->filter,
+				fp->len * sizeof(struct sock_filter));
 	}
 	for (b = j->bindings_head; b; b = b->next) {
 		marshal_append(state, b->src, strlen(b->src) + 1);
@@ -496,7 +411,7 @@
 	/* Potentially stale pointers not used as signals. */
 	j->bindings_head = NULL;
 	j->bindings_tail = NULL;
-	j->filters = NULL;
+	j->filter_prog = NULL;
 
 	if (j->user) {		/* stale pointer */
 		char *user = consumestr(&serialized, &length);
@@ -516,22 +431,21 @@
 			goto bad_chrootdir;
 	}
 
-	if (j->flags.seccomp_filter && j->filter_count) {
-		count = j->filter_count;
-		/* Let add_seccomp_filter recompute the value. */
-		j->filter_count = 0;
-		for (; count > 0; --count) {
-			int *nr = (int *)consumebytes(sizeof(*nr), &serialized,
-			                              &length);
-			char *filter;
-			if (!nr)
-				goto bad_filters;
-			filter = consumestr(&serialized, &length);
-			if (!filter)
-				goto bad_filters;
-			if (minijail_add_seccomp_filter(j, *nr, filter))
-				goto bad_filters;
-		}
+	if (j->flags.seccomp_filter && j->filter_len > 0) {
+		size_t ninstrs = j->filter_len;
+		if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
+		    ninstrs > USHRT_MAX)
+			goto bad_filters;
+
+		size_t program_len = ninstrs * sizeof(struct sock_filter);
+		void *program = consumebytes(program_len, &serialized, &length);
+		if (!program)
+			goto bad_filters;
+
+		j->filter_prog = malloc(sizeof(struct sock_fprog));
+		j->filter_prog->len = ninstrs;
+		j->filter_prog->filter = malloc(program_len);
+		memcpy(j->filter_prog->filter, program, program_len);
 	}
 
 	count = j->binding_count;
@@ -555,6 +469,10 @@
 	return 0;
 
 bad_bindings:
+	if (j->flags.seccomp_filter && j->filter_len > 0) {
+		free(j->filter_prog->filter);
+		free(j->filter_prog);
+	}
 bad_filters:
 	if (j->chrootdir)
 		free(j->chrootdir);
@@ -686,62 +604,12 @@
 	}
 }
 
-int setup_seccomp_filters(const struct minijail *j)
-{
-	const struct seccomp_filter *sf = j->filters;
-	int ret = 0;
-	int broaden = 0;
-
-	/* No filters installed isn't necessarily an error. */
-	if (!sf)
-		return ret;
-
-	do {
-		errno = 0;
-		ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
-			    sf->nr, broaden ? "1" : sf->filter);
-		if (ret) {
-			switch (errno) {
-			case ENOSYS:
-				/* TODO(wad) make this a config option */
-				if (broaden)
-					die("CONFIG_SECCOMP_FILTER is not"
-					    "supported by your kernel");
-				warn("missing CONFIG_FTRACE_SYSCALLS; relaxing"
-				     "the filter for %d", sf->nr);
-				broaden = 1;
-				continue;
-			case E2BIG:
-				warn("seccomp filter too long: %d", sf->nr);
-				pdie("filter too long");
-			case ENOSPC:
-				pdie("too many seccomp filters");
-			case EPERM:
-				warn("syscall filter disallowed for %d",
-				     sf->nr);
-				pdie("failed to install seccomp filter");
-			case EINVAL:
-				warn("seccomp filter or call method is"
-				     " invalid. %d:'%s'", sf->nr, sf->filter);
-			default:
-				pdie("failed to install seccomp filter");
-			}
-		}
-		sf = sf->next;
-		broaden = 0;
-	} while (sf != j->filters);
-	return ret;
-}
-
 void API minijail_enter(const struct minijail *j)
 {
 	if (j->flags.pids)
 		die("tried to enter a pid-namespaced jail;"
 		    "try minijail_run()?");
 
-	if (j->flags.seccomp_filter && setup_seccomp_filters(j))
-		pdie("failed to configure seccomp filters");
-
 	if (j->flags.usergroups && !j->user)
 		die("usergroup inheritance without username");
 
@@ -796,8 +664,15 @@
 	 * seccomp has to come last since it cuts off all the other
 	 * privilege-dropping syscalls :)
 	 */
-	if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
-		pdie("prctl(PR_SET_SECCOMP, 13)");
+	if (j->flags.seccomp_filter) {
+		/* TODO(jorgelo): document call to PR_SET_NO_NEW_PRIVS. */
+		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+			pdie("prctl(PR_SET_NO_NEW_PRIVS)");
+		}
+		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, j->filter_prog)) {
+			pdie("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER)");
+		}
+	}
 
 	if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
 		pdie("prctl(PR_SET_SECCOMP)");
@@ -1028,15 +903,9 @@
 
 void API minijail_destroy(struct minijail *j)
 {
-	struct seccomp_filter *f = j->filters;
-	/* Unlink the tail and head */
-	if (f)
-		f->prev->next = NULL;
-	while (f) {
-		struct seccomp_filter *next = f->next;
-		free(f->filter);
-		free(f);
-		f = next;
+	if (j->flags.seccomp_filter && j->filter_prog) {
+		free(j->filter_prog->filter);
+		free(j->filter_prog);
 	}
 	while (j->bindings_head) {
 		struct binding *b = j->bindings_head;