blob: 7ddeb6b63ed48140cc4ae55f8d3795587c8d76a2 [file] [log] [blame]
Elly Jonescd7a9042011-07-22 13:56:51 -04001/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Will Drewry32ac9f52011-08-18 21:36:27 -05008#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -04009#include <errno.h>
10#include <grp.h>
11#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050012#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <linux/capability.h>
14#include <linux/securebits.h>
15#include <pwd.h>
16#include <sched.h>
17#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050018#include <stdarg.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040019#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <syscall.h>
23#include <sys/capability.h>
24#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050025#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040026#include <sys/prctl.h>
27#include <sys/wait.h>
28#include <syslog.h>
29#include <unistd.h>
30
31#include "libminijail.h"
Will Drewry32ac9f52011-08-18 21:36:27 -050032#include "libsyscalls.h"
Elly Jonescd7a9042011-07-22 13:56:51 -040033#include "libminijail-private.h"
34
/*
 * Fallback values for the seccomp-filter prctl(2) options. They are only
 * defined here when the included headers do not already provide
 * PR_SET_SECCOMP_FILTER.
 */
#ifndef PR_SET_SECCOMP_FILTER
#define PR_SECCOMP_FILTER_SYSCALL	0
#define PR_SECCOMP_FILTER_EVENT		1
#define PR_GET_SECCOMP_FILTER		35
#define PR_SET_SECCOMP_FILTER		36
#define PR_CLEAR_SECCOMP_FILTER		37
#endif
43
/*
 * die: logs a printf-style message to syslog at LOG_ERR, prefixed with
 * "libminijail: ", and then abort()s the process.
 */
#define die(_msg, ...)							\
	do {								\
		syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__);	\
		abort();						\
	} while (0)

/* pdie: die() with ": " and strerror(errno) appended to the message. */
#define pdie(_msg, ...)							\
	die(_msg ": %s", ## __VA_ARGS__, strerror(errno))

/* warn: logs a printf-style message to syslog at LOG_WARNING and returns. */
#define warn(_msg, ...)							\
	syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
Elly Jonescd7a9042011-07-22 13:56:51 -040054
/* seccomp_filter: one node of the circular, doubly linked filter list. */
struct seccomp_filter {
	int nr;				/* syscall number */
	char *filter;			/* heap-allocated filter string */
	struct seccomp_filter *next;
	struct seccomp_filter *prev;
};
60
/* binding: one bind mount request, kept in a singly linked list. */
struct binding {
	char *src;		/* path to bind from (heap copy) */
	char *dest;		/* absolute path to bind to (heap copy) */
	int writeable;		/* non-zero if the binding should be writeable */
	struct binding *next;	/* next request, or NULL at the tail */
};
67
/*
 * minijail: the complete jail configuration.
 *
 * The structure is copied byte-for-byte when it is marshalled, so the field
 * order must not change. The flags are unsigned one-bit fields: a signed
 * one-bit field cannot represent the value 1 that the setters store in it.
 */
struct minijail {
	struct {
		unsigned int uid:1;		/* change uid */
		unsigned int gid:1;		/* change gid */
		unsigned int caps:1;		/* restrict capabilities */
		unsigned int vfs:1;		/* new vfs namespace */
		unsigned int pids:1;		/* new pid namespace */
		unsigned int seccomp:1;		/* seccomp mode 1 */
		unsigned int readonly:1;	/* remount /proc read-only */
		unsigned int usergroups:1;	/* inherit the user's groups */
		unsigned int ptrace:1;		/* disable ptrace */
		unsigned int seccomp_filter:1;	/* install seccomp filters */
		unsigned int chroot:1;		/* chroot into chrootdir */
	} flags;
	uid_t uid;			/* uid to switch to */
	gid_t gid;			/* gid to switch to */
	gid_t usergid;			/* primary gid of |user| */
	char *user;			/* heap copy of the user name, or NULL */
	uint64_t caps;			/* capability mask to keep */
	pid_t initpid;			/* pid of the child started for the jail */
	int filter_count;		/* number of entries in |filters| */
	int binding_count;		/* number of entries in the bindings list */
	char *chrootdir;		/* heap copy of the chroot path, or NULL */
	struct seccomp_filter *filters;	/* circular list, or NULL */
	struct binding *bindings_head;	/* first binding, or NULL */
	struct binding *bindings_tail;	/* last binding, or NULL */
};
95
Elly Jonese1749eb2011-10-07 13:54:59 -040096struct minijail *minijail_new(void)
97{
Elly Jones51a5b6c2011-10-12 19:09:26 -040098 return calloc(1, sizeof(struct minijail));
Elly Jonescd7a9042011-07-22 13:56:51 -040099}
100
Elly Jonese1749eb2011-10-07 13:54:59 -0400101void minijail_change_uid(struct minijail *j, uid_t uid)
102{
103 if (uid == 0)
104 die("useless change to uid 0");
105 j->uid = uid;
106 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400107}
108
Elly Jonese1749eb2011-10-07 13:54:59 -0400109void minijail_change_gid(struct minijail *j, gid_t gid)
110{
111 if (gid == 0)
112 die("useless change to gid 0");
113 j->gid = gid;
114 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400115}
116
Elly Jonese1749eb2011-10-07 13:54:59 -0400117int minijail_change_user(struct minijail *j, const char *user)
118{
119 char *buf = NULL;
120 struct passwd pw;
121 struct passwd *ppw = NULL;
122 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
123 if (sz == -1)
124 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400125
Elly Jonese1749eb2011-10-07 13:54:59 -0400126 /* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
127 * the maximum needed size of the buffer, so we don't have to search.
128 */
129 buf = malloc(sz);
130 if (!buf)
131 return -ENOMEM;
132 getpwnam_r(user, &pw, buf, sz, &ppw);
133 free(buf);
134 if (!ppw)
135 return -errno;
136 minijail_change_uid(j, ppw->pw_uid);
137 j->user = strdup(user);
138 if (!j->user)
139 return -ENOMEM;
140 j->usergid = ppw->pw_gid;
141 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400142}
143
/* minijail_change_group: looks up @group and records its gid.
 * @j     Minijail to configure
 * @group Name of the group to switch to
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENOENT if no such
 * group exists, or the negated error code reported by getgrnam_r().
 */
int minijail_change_group(struct minijail *j, const char *group)
{
	struct group gr;
	struct group *pgr = NULL;
	char *buf;
	gid_t gid;
	int err;
	long sz = sysconf(_SC_GETGR_R_SIZE_MAX);

	if (sz <= 0)
		sz = 65536;	/* and mine is as good as yours, really */

	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	/* getgrnam_r() reports failure through its return value, not errno,
	 * and signals "no such group" by returning 0 with a NULL result.
	 */
	err = getgrnam_r(group, &gr, buf, sz, &pgr);
	if (err || !pgr) {
		free(buf);
		return err ? -err : -ENOENT;
	}
	gid = pgr->gr_gid;
	free(buf);

	minijail_change_gid(j, gid);
	return 0;
}
166
Elly Jonese1749eb2011-10-07 13:54:59 -0400167void minijail_use_seccomp(struct minijail *j)
168{
169 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400170}
171
Elly Jonese1749eb2011-10-07 13:54:59 -0400172void minijail_use_seccomp_filter(struct minijail *j)
173{
174 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500175}
176
Elly Jonese1749eb2011-10-07 13:54:59 -0400177void minijail_use_caps(struct minijail *j, uint64_t capmask)
178{
179 j->caps = capmask;
180 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400181}
182
Elly Jonese1749eb2011-10-07 13:54:59 -0400183void minijail_namespace_vfs(struct minijail *j)
184{
185 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400186}
187
Elly Jonese1749eb2011-10-07 13:54:59 -0400188void minijail_namespace_pids(struct minijail *j)
189{
190 j->flags.pids = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400191}
192
Elly Jonese1749eb2011-10-07 13:54:59 -0400193void minijail_remount_readonly(struct minijail *j)
194{
195 j->flags.vfs = 1;
196 j->flags.readonly = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400197}
198
Elly Jonese1749eb2011-10-07 13:54:59 -0400199void minijail_inherit_usergroups(struct minijail *j)
200{
201 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400202}
203
Elly Jonese1749eb2011-10-07 13:54:59 -0400204void minijail_disable_ptrace(struct minijail *j)
205{
206 j->flags.ptrace = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400207}
208
Elly Jones51a5b6c2011-10-12 19:09:26 -0400209int minijail_enter_chroot(struct minijail *j, const char *dir) {
210 if (j->chrootdir)
211 return -EINVAL;
212 j->chrootdir = strdup(dir);
213 if (!j->chrootdir)
214 return -ENOMEM;
215 j->flags.chroot = 1;
216 return 0;
217}
218
219int minijail_bind(struct minijail *j, const char *src, const char *dest,
220 int writeable) {
221 struct binding *b;
222
223 if (*dest != '/')
224 return -EINVAL;
225 b = calloc(1, sizeof(*b));
226 if (!b)
227 return -ENOMEM;
228 b->dest = strdup(dest);
229 if (!b->dest)
230 goto error;
231 b->src = strdup(src);
232 if (!b->src)
233 goto error;
234 b->writeable = writeable;
235
236 syslog(LOG_INFO, "libminijail: bind %s -> %s", src, dest);
237
238 /* Force vfs namespacing so the bind mounts don't leak out into the
239 * containing vfs namespace.
240 */
241 minijail_namespace_vfs(j);
242
243 if (j->bindings_tail)
244 j->bindings_tail->next = b;
245 else
246 j->bindings_head = b;
247 j->bindings_tail = b;
248 j->binding_count++;
249
250 return 0;
251
252error:
253 free(b->src);
254 free(b->dest);
255 free(b);
256 return -ENOMEM;
257}
258
Elly Jonese1749eb2011-10-07 13:54:59 -0400259int minijail_add_seccomp_filter(struct minijail *j, int nr, const char *filter)
260{
261 struct seccomp_filter *sf;
262 if (!filter || nr < 0)
263 return -EINVAL;
Will Drewry32ac9f52011-08-18 21:36:27 -0500264
Elly Jonese1749eb2011-10-07 13:54:59 -0400265 sf = malloc(sizeof(*sf));
266 if (!sf)
267 return -ENOMEM;
268 sf->nr = nr;
269 sf->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
270 if (!sf->filter) {
271 free(sf);
272 return -ENOMEM;
273 }
Will Drewry32ac9f52011-08-18 21:36:27 -0500274
Elly Jonese1749eb2011-10-07 13:54:59 -0400275 j->filter_count++;
Will Drewryf89aef52011-09-16 16:48:57 -0500276
Elly Jonese1749eb2011-10-07 13:54:59 -0400277 if (!j->filters) {
278 j->filters = sf;
279 sf->next = sf;
280 sf->prev = sf;
281 return 0;
282 }
283 sf->next = j->filters;
284 sf->prev = j->filters->prev;
285 sf->prev->next = sf;
286 j->filters->prev = sf;
287 return 0;
Will Drewry32ac9f52011-08-18 21:36:27 -0500288}
289
Elly Jonese1749eb2011-10-07 13:54:59 -0400290int minijail_lookup_syscall(const char *name)
291{
292 const struct syscall_entry *entry = syscall_table;
293 for (; entry->name && entry->nr >= 0; ++entry)
294 if (!strcmp(entry->name, name))
295 return entry->nr;
296 return -1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500297}
298
/* strip: trims @s in place.
 * @s String to trim; modified
 *
 * Leading blanks are skipped and trailing blanks and newlines are cut off by
 * writing a terminator. The trailing scan never moves before the first
 * non-blank character, so empty and all-whitespace strings are safe.
 *
 * Returns a pointer to the first non-blank character within @s.
 */
static char *strip(char *s)
{
	char *end;

	/* <ctype.h> functions need an unsigned char value. */
	while (*s && isblank((unsigned char)*s))
		s++;

	end = s + strlen(s);
	while (end > s &&
	       (isblank((unsigned char)end[-1]) || end[-1] == '\n'))
		end--;
	*end = '\0';
	return s;
}
310
Elly Jonese1749eb2011-10-07 13:54:59 -0400311void minijail_parse_seccomp_filters(struct minijail *j, const char *path)
312{
313 FILE *file = fopen(path, "r");
314 char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
Ben Chan1d697932011-10-14 10:53:32 -0700315 int count = 0;
Elly Jonese1749eb2011-10-07 13:54:59 -0400316 if (!file)
317 pdie("failed to open seccomp filters file");
Will Drewry32ac9f52011-08-18 21:36:27 -0500318
Elly Jonese1749eb2011-10-07 13:54:59 -0400319 /* Format is simple:
320 * syscall_name<COLON><FILTER STRING>[\n|EOF]
321 * #...comment...
322 * <empty line?
323 */
324 while (fgets(line, sizeof(line), file)) {
325 char *filter = line;
326 char *name = strsep(&filter, ":");
327 char *name_end = NULL;
328 int nr = -1;
Ben Chan1d697932011-10-14 10:53:32 -0700329 count++;
Will Drewry32ac9f52011-08-18 21:36:27 -0500330
Ben Chan1d697932011-10-14 10:53:32 -0700331 /* Allow comment lines */
332 if (*name == '#')
333 continue;
Will Drewry32ac9f52011-08-18 21:36:27 -0500334
Elly Jonese1749eb2011-10-07 13:54:59 -0400335 name = strip(name);
Will Drewry32ac9f52011-08-18 21:36:27 -0500336
Elly Jonese1749eb2011-10-07 13:54:59 -0400337 if (!filter) {
338 if (strlen(name))
339 die("invalid filter on line %d", count);
340 /* Allow empty lines */
341 continue;
342 }
Will Drewry32ac9f52011-08-18 21:36:27 -0500343
Elly Jonese1749eb2011-10-07 13:54:59 -0400344 filter = strip(filter);
Will Drewry32ac9f52011-08-18 21:36:27 -0500345
Elly Jonese1749eb2011-10-07 13:54:59 -0400346 /* Take direct syscall numbers */
347 nr = strtol(name, &name_end, 0);
348 /* Or fail-over to using names */
349 if (*name_end != '\0')
350 nr = minijail_lookup_syscall(name);
351 if (nr < 0)
352 die("syscall '%s' unknown", name);
Will Drewry32ac9f52011-08-18 21:36:27 -0500353
Elly Jonese1749eb2011-10-07 13:54:59 -0400354 if (minijail_add_seccomp_filter(j, nr, filter))
355 pdie("failed to add filter for syscall '%s'", name);
356 }
357 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500358}
359
/* marshal_state: progress of one serialization pass. */
struct marshal_state {
	size_t available;	/* bytes that may still be written to |buf| */
	size_t total;		/* bytes the full serialization requires */
	char *buf;		/* current write position */
};
365
366static void marshal_state_init(struct marshal_state *state,
Elly Jonese1749eb2011-10-07 13:54:59 -0400367 char *buf, size_t available)
368{
369 state->available = available;
370 state->buf = buf;
371 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500372}
373
374static void marshal_append(struct marshal_state *state,
Elly Jonese1749eb2011-10-07 13:54:59 -0400375 char *src, size_t length)
376{
377 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500378
Elly Jonese1749eb2011-10-07 13:54:59 -0400379 /* Up to |available| will be written. */
380 if (copy_len) {
381 memcpy(state->buf, src, copy_len);
382 state->buf += copy_len;
383 state->available -= copy_len;
384 }
385 /* |total| will contain the expected length. */
386 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500387}
388
389static void minijail_marshal_helper(struct marshal_state *state,
Elly Jonese1749eb2011-10-07 13:54:59 -0400390 const struct minijail *j)
391{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400392 struct binding *b = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -0400393 marshal_append(state, (char *)j, sizeof(*j));
394 if (j->user)
395 marshal_append(state, j->user, strlen(j->user) + 1);
Elly Jones51a5b6c2011-10-12 19:09:26 -0400396 if (j->chrootdir)
397 marshal_append(state, j->chrootdir, strlen(j->chrootdir) + 1);
Elly Jonese1749eb2011-10-07 13:54:59 -0400398 if (j->flags.seccomp_filter && j->filters) {
399 struct seccomp_filter *f = j->filters;
400 do {
401 marshal_append(state, (char *)&f->nr, sizeof(f->nr));
402 marshal_append(state, f->filter, strlen(f->filter) + 1);
403 f = f->next;
404 } while (f != j->filters);
405 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400406 for (b = j->bindings_head; b; b = b->next) {
407 marshal_append(state, b->src, strlen(b->src) + 1);
408 marshal_append(state, b->dest, strlen(b->dest) + 1);
409 marshal_append(state, (char *)&b->writeable, sizeof(b->writeable));
410 }
Will Drewryf89aef52011-09-16 16:48:57 -0500411}
412
Elly Jonese1749eb2011-10-07 13:54:59 -0400413size_t minijail_size(const struct minijail *j)
414{
415 struct marshal_state state;
416 marshal_state_init(&state, NULL, 0);
417 minijail_marshal_helper(&state, j);
418 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500419}
420
Elly Jonese1749eb2011-10-07 13:54:59 -0400421int minijail_marshal(const struct minijail *j, char *buf, size_t available)
422{
423 struct marshal_state state;
424 marshal_state_init(&state, buf, available);
425 minijail_marshal_helper(&state, j);
426 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500427}
428
/* consumebytes: takes @length bytes off the front of a buffer
 * @length    Number of bytes to take
 * @buf       In/out: current position in the buffer, advanced on success
 * @buflength In/out: bytes left at *@buf, reduced on success
 *
 * Returns a pointer to the first taken byte, or NULL, leaving @buf and
 * @buflength untouched, if fewer than @length bytes are left.
 */
static void *consumebytes(size_t length, char **buf, size_t *buflength)
{
	char *base;

	if (*buflength < length)
		return NULL;

	base = *buf;
	*buf = base + length;
	*buflength -= length;
	return base;
}
444
/* consumestr: takes a NUL-terminated string off the front of a buffer
 * @buf       In/out: current position in the buffer
 * @buflength In/out: bytes left at *@buf
 *
 * Returns a pointer to the string, or NULL if no terminator is found within
 * the remaining bytes.
 */
static char *consumestr(char **buf, size_t *buflength)
{
	size_t avail = *buflength;
	size_t len = strnlen(*buf, avail);

	/* strnlen() hitting the limit means there is no terminator. */
	if (len == avail)
		return NULL;
	return consumebytes(len + 1, buf, buflength);
}
458
Elly Jonese1749eb2011-10-07 13:54:59 -0400459int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
460{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400461 int i;
462 int count;
Will Drewrybee7ba72011-10-21 20:47:01 -0500463 int ret = -EINVAL;
464
Elly Jonese1749eb2011-10-07 13:54:59 -0400465 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -0500466 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -0400467 memcpy((void *)j, serialized, sizeof(*j));
468 serialized += sizeof(*j);
469 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500470
Will Drewrybee7ba72011-10-21 20:47:01 -0500471 /* Potentially stale pointers not used as signals. */
472 j->bindings_head = NULL;
473 j->bindings_tail = NULL;
474 j->filters = NULL;
475
Elly Jonese1749eb2011-10-07 13:54:59 -0400476 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -0400477 char *user = consumestr(&serialized, &length);
478 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -0500479 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400480 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -0500481 if (!j->user)
482 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -0400483 }
Will Drewryf89aef52011-09-16 16:48:57 -0500484
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400485 if (j->chrootdir) { /* stale pointer */
486 char *chrootdir = consumestr(&serialized, &length);
487 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -0500488 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400489 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -0500490 if (!j->chrootdir)
491 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -0400492 }
493
Elly Jonese1749eb2011-10-07 13:54:59 -0400494 if (j->flags.seccomp_filter && j->filter_count) {
Elly Jones51a5b6c2011-10-12 19:09:26 -0400495 count = j->filter_count;
Elly Jonese1749eb2011-10-07 13:54:59 -0400496 /* Let add_seccomp_filter recompute the value. */
497 j->filter_count = 0;
Elly Jonese1749eb2011-10-07 13:54:59 -0400498 for (; count > 0; --count) {
Elly Jones51a5b6c2011-10-12 19:09:26 -0400499 int *nr = (int *)consumebytes(sizeof(*nr), &serialized,
500 &length);
Elly Jonese1749eb2011-10-07 13:54:59 -0400501 char *filter;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400502 if (!nr)
Will Drewrybee7ba72011-10-21 20:47:01 -0500503 goto bad_filters;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400504 filter = consumestr(&serialized, &length);
505 if (!filter)
Will Drewrybee7ba72011-10-21 20:47:01 -0500506 goto bad_filters;
Elly Jonese1749eb2011-10-07 13:54:59 -0400507 if (minijail_add_seccomp_filter(j, *nr, filter))
Will Drewrybee7ba72011-10-21 20:47:01 -0500508 goto bad_filters;
Elly Jonese1749eb2011-10-07 13:54:59 -0400509 }
510 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400511
512 count = j->binding_count;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400513 j->binding_count = 0;
514 for (i = 0; i < count; ++i) {
515 int *writeable;
516 const char *dest;
517 const char *src = consumestr(&serialized, &length);
518 if (!src)
Will Drewrybee7ba72011-10-21 20:47:01 -0500519 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400520 dest = consumestr(&serialized, &length);
521 if (!dest)
Will Drewrybee7ba72011-10-21 20:47:01 -0500522 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400523 writeable = consumebytes(sizeof(*writeable), &serialized, &length);
524 if (!writeable)
Will Drewrybee7ba72011-10-21 20:47:01 -0500525 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400526 if (minijail_bind(j, src, dest, *writeable))
Will Drewrybee7ba72011-10-21 20:47:01 -0500527 goto bad_bindings;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400528 }
529
Elly Jonese1749eb2011-10-07 13:54:59 -0400530 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -0500531
532bad_bindings:
533bad_filters:
534 if (j->chrootdir)
535 free(j->chrootdir);
536bad_chrootdir:
537 if (j->user)
538 free(j->user);
539clear_pointers:
540 j->user = NULL;
541 j->chrootdir = NULL;
542out:
543 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -0500544}
545
Elly Jonese1749eb2011-10-07 13:54:59 -0400546void minijail_preenter(struct minijail *j)
547{
548 /* Strip out options which are minijail_run() only. */
549 j->flags.vfs = 0;
550 j->flags.readonly = 0;
551 j->flags.pids = 0;
Will Drewryfe4a3722011-09-16 14:50:50 -0500552}
553
Elly Jonese1749eb2011-10-07 13:54:59 -0400554void minijail_preexec(struct minijail *j)
555{
556 int vfs = j->flags.vfs;
557 int readonly = j->flags.readonly;
558 if (j->user)
559 free(j->user);
560 j->user = NULL;
561 memset(&j->flags, 0, sizeof(j->flags));
562 /* Now restore anything we meant to keep. */
563 j->flags.vfs = vfs;
564 j->flags.readonly = readonly;
565 /* Note, pidns will already have been used before this call. */
Will Drewry2ddaad02011-09-16 11:36:08 -0500566}
567
Elly Jones51a5b6c2011-10-12 19:09:26 -0400568/* bind_one: Applies bindings from @b for @j, recursing as needed.
569 * @j Minijail these bindings are for
570 * @b Head of list of bindings
571 *
572 * Returns 0 for success.
573 */
574static int bind_one(const struct minijail *j, struct binding *b) {
575 int ret = 0;
576 char *dest = NULL;
577 int mflags = MS_BIND | (b->writeable ? 0 : MS_RDONLY);
578 if (ret)
579 return ret;
580 /* dest has a leading "/" */
581 if (asprintf(&dest, "%s%s", j->chrootdir, b->dest) < 0)
582 return -ENOMEM;
583 ret = mount(b->src, dest, NULL, mflags, NULL);
584 if (ret)
585 pdie("bind: %s -> %s", b->src, dest);
586 free(dest);
587 if (b->next)
588 return bind_one(j, b->next);
589 return ret;
590}
591
592static int enter_chroot(const struct minijail *j) {
593 int ret;
594 if (j->bindings_head && (ret = bind_one(j, j->bindings_head)))
595 return ret;
596
597 if (chroot(j->chrootdir))
598 return -errno;
599
600 if (chdir("/"))
601 return -errno;
602
603 return 0;
604}
605
/* remount_readonly: replaces /proc with a fresh read-only mount.
 *
 * Our namespace still refers to the parent's mount of /proc, so an
 * MS_REMOUNT here would change the parent's mount as well. The inherited
 * mount is therefore removed and a new one is created in its place.
 *
 * Returns 0 for success, or -errno if umount() or mount() fails.
 */
static int remount_readonly(void)
{
	const char *proc_path = "/proc";
	const unsigned int safe_flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;

	if (umount(proc_path) != 0)
		return -errno;
	if (mount("", proc_path, "proc", safe_flags | MS_RDONLY, "") != 0)
		return -errno;
	return 0;
}
622
Elly Jonese1749eb2011-10-07 13:54:59 -0400623static void drop_caps(const struct minijail *j)
624{
625 cap_t caps = cap_get_proc();
626 cap_value_t raise_flag[1];
627 unsigned int i;
628 if (!caps)
629 die("can't get process caps");
630 if (cap_clear_flag(caps, CAP_INHERITABLE))
631 die("can't clear inheritable caps");
632 if (cap_clear_flag(caps, CAP_EFFECTIVE))
633 die("can't clear effective caps");
634 if (cap_clear_flag(caps, CAP_PERMITTED))
635 die("can't clear permitted caps");
636 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
637 if (i != CAP_SETPCAP && !(j->caps & (1 << i)))
638 continue;
639 raise_flag[0] = i;
640 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
641 die("can't add effective cap");
642 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
643 die("can't add permitted cap");
644 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
645 die("can't add inheritable cap");
646 }
647 if (cap_set_proc(caps))
648 die("can't apply cleaned capset");
649 cap_free(caps);
650 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
651 if (j->caps & (1 << i))
652 continue;
653 if (prctl(PR_CAPBSET_DROP, i))
654 pdie("prctl(PR_CAPBSET_DROP)");
655 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400656}
657
Elly Jonese1749eb2011-10-07 13:54:59 -0400658static int setup_seccomp_filters(const struct minijail *j)
659{
660 const struct seccomp_filter *sf = j->filters;
661 int ret = 0;
662 int broaden = 0;
Will Drewry32ac9f52011-08-18 21:36:27 -0500663
Elly Jonese1749eb2011-10-07 13:54:59 -0400664 /* No filters installed isn't necessarily an error. */
665 if (!sf)
666 return ret;
Will Drewry32ac9f52011-08-18 21:36:27 -0500667
Elly Jonese1749eb2011-10-07 13:54:59 -0400668 do {
669 errno = 0;
670 ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
671 sf->nr, broaden ? "1" : sf->filter);
672 if (ret) {
673 switch (errno) {
674 case ENOSYS:
675 /* TODO(wad) make this a config option */
676 if (broaden)
677 die("CONFIG_SECCOMP_FILTER is not"
678 "supported by your kernel");
679 warn("missing CONFIG_FTRACE_SYSCALLS; relaxing"
680 "the filter for %d", sf->nr);
681 broaden = 1;
682 continue;
683 case E2BIG:
684 warn("seccomp filter too long: %d", sf->nr);
685 pdie("filter too long");
686 case ENOSPC:
687 pdie("too many seccomp filters");
688 case EPERM:
689 warn("syscall filter disallowed for %d",
690 sf->nr);
691 pdie("failed to install seccomp filter");
692 case EINVAL:
693 warn("seccomp filter or call method is"
694 " invalid. %d:'%s'", sf->nr, sf->filter);
695 default:
696 pdie("failed to install seccomp filter");
697 }
698 }
699 sf = sf->next;
700 broaden = 0;
701 } while (sf != j->filters);
702 return ret;
Will Drewry32ac9f52011-08-18 21:36:27 -0500703}
704
Elly Jonese1749eb2011-10-07 13:54:59 -0400705void minijail_enter(const struct minijail *j)
706{
707 if (j->flags.pids)
708 die("tried to enter a pid-namespaced jail;"
709 "try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -0400710
Elly Jonese1749eb2011-10-07 13:54:59 -0400711 if (j->flags.seccomp_filter && setup_seccomp_filters(j))
712 pdie("failed to configure seccomp filters");
Will Drewry32ac9f52011-08-18 21:36:27 -0500713
Elly Jonese1749eb2011-10-07 13:54:59 -0400714 if (j->flags.usergroups && !j->user)
715 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -0400716
Elly Jonese1749eb2011-10-07 13:54:59 -0400717 /* We can't recover from failures if we've dropped privileges partially,
718 * so we don't even try. If any of our operations fail, we abort() the
719 * entire process.
720 */
721 if (j->flags.vfs && unshare(CLONE_NEWNS))
722 pdie("unshare");
Elly Jonescd7a9042011-07-22 13:56:51 -0400723
Elly Jones51a5b6c2011-10-12 19:09:26 -0400724 if (j->flags.chroot && enter_chroot(j))
725 pdie("chroot");
726
Elly Jonese1749eb2011-10-07 13:54:59 -0400727 if (j->flags.readonly && remount_readonly())
728 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -0400729
Elly Jonese1749eb2011-10-07 13:54:59 -0400730 if (j->flags.caps) {
731 /* POSIX capabilities are a bit tricky. If we drop our
732 * capability to change uids, our attempt to use setuid()
733 * below will fail. Hang on to root caps across setuid(), then
734 * lock securebits.
735 */
736 if (prctl(PR_SET_KEEPCAPS, 1))
737 pdie("prctl(PR_SET_KEEPCAPS)");
738 if (prctl
739 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
740 pdie("prctl(PR_SET_SECUREBITS)");
741 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400742
Elly Jonese1749eb2011-10-07 13:54:59 -0400743 if (j->flags.usergroups) {
744 if (initgroups(j->user, j->usergid))
745 pdie("initgroups");
746 } else {
747 /* Only attempt to clear supplemental groups if we are changing
748 * users. */
749 if ((j->uid || j->gid) && setgroups(0, NULL))
750 pdie("setgroups");
751 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400752
Elly Jonese1749eb2011-10-07 13:54:59 -0400753 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
754 pdie("setresgid");
Elly Jonescd7a9042011-07-22 13:56:51 -0400755
Elly Jonese1749eb2011-10-07 13:54:59 -0400756 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
757 pdie("setresuid");
Elly Jonescd7a9042011-07-22 13:56:51 -0400758
Elly Jonese1749eb2011-10-07 13:54:59 -0400759 if (j->flags.caps)
760 drop_caps(j);
Elly Jonescd7a9042011-07-22 13:56:51 -0400761
Elly Jonese1749eb2011-10-07 13:54:59 -0400762 /* seccomp has to come last since it cuts off all the other
763 * privilege-dropping syscalls :)
764 */
765 if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
766 pdie("prctl(PR_SET_SECCOMP, 13)");
Will Drewry32ac9f52011-08-18 21:36:27 -0500767
Elly Jonese1749eb2011-10-07 13:54:59 -0400768 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
769 pdie("prctl(PR_SET_SECCOMP)");
Elly Jonescd7a9042011-07-22 13:56:51 -0400770}
771
/* Wait status of the root process, as last recorded by init(). */
static int init_exitstatus;

/* init_term: SIGTERM handler installed by init(); exits immediately with
 * the recorded status.
 */
static void init_term(int __attribute__ ((unused)) sig)
{
	_exit(init_exitstatus);
}
778
Elly Jonese1749eb2011-10-07 13:54:59 -0400779static int init(pid_t rootpid)
780{
781 pid_t pid;
782 int status;
783 /* so that we exit with the right status */
784 signal(SIGTERM, init_term);
785 /* TODO(wad) self jail with seccomp_filters here. */
786 while ((pid = wait(&status)) > 0) {
787 /* This loop will only end when either there are no processes
788 * left inside our pid namespace or we get a signal.
789 */
790 if (pid == rootpid)
791 init_exitstatus = status;
792 }
793 if (!WIFEXITED(init_exitstatus))
794 _exit(MINIJAIL_ERR_INIT);
795 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -0400796}
797
/* from_fd_read_all: reads exactly @want bytes from @fd into @dst.
 *
 * Retries after EINTR and after short reads. Returns 0 for success, or -1
 * on a read error or end of file.
 */
static int from_fd_read_all(int fd, void *dst, size_t want)
{
	char *pos = dst;

	while (want > 0) {
		ssize_t got = read(fd, pos, want);

		if (got < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (got == 0)
			return -1;
		pos += got;
		want -= (size_t)got;
	}
	return 0;
}

/* minijail_from_fd: reads a [size][minijail] message and unmarshals it.
 * @fd File descriptor to read from
 * @j  Minijail to fill in
 *
 * Returns 0 on success, -EINVAL if the message is missing, truncated or
 * malformed, -E2BIG if the announced size exceeds USHRT_MAX, or -ENOMEM.
 */
int minijail_from_fd(int fd, struct minijail *j)
{
	size_t sz = 0;
	char *buf;
	int r;

	if (from_fd_read_all(fd, &sz, sizeof(sz)))
		return -EINVAL;
	if (sz > USHRT_MAX)	/* Arbitrary sanity check */
		return -E2BIG;
	/* An empty message cannot hold a minijail; skip malloc(0). */
	if (sz == 0)
		return -EINVAL;
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	if (from_fd_read_all(fd, buf, sz)) {
		free(buf);
		return -EINVAL;
	}
	r = minijail_unmarshal(j, buf, sz);
	free(buf);
	return r;
}
820
/* minijail_to_fd: marshals @j and writes it to @fd as [size][minijail].
 * @j  Minijail to send
 * @fd File descriptor to write to
 *
 * Returns 0 on success, -EINVAL if the serialized size is zero, -ENOMEM if
 * the buffer cannot be allocated, the non-zero result of
 * minijail_marshal(), or -EFAULT if either write is incomplete.
 */
int minijail_to_fd(struct minijail *j, int fd)
{
	char *buf;
	size_t sz = minijail_size(j);
	ssize_t written;
	int r;

	if (!sz)
		return -EINVAL;
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;

	r = minijail_marshal(j, buf, sz);
	if (r)
		goto done;

	/* Sends [size][minijail]. */
	written = write(fd, &sz, sizeof(sz));
	if (written != sizeof(sz)) {
		r = -EFAULT;
		goto done;
	}
	written = write(fd, buf, sz);
	if (written < 0 || (size_t)written != sz)
		r = -EFAULT;

done:
	free(buf);
	return r;
}
Elly Jonescd7a9042011-07-22 13:56:51 -0400850
Elly Jonese1749eb2011-10-07 13:54:59 -0400851static int setup_preload(void)
852{
853 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
854 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
855 if (!newenv)
856 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -0400857
Elly Jonese1749eb2011-10-07 13:54:59 -0400858 /* Only insert a separating space if we have something to separate... */
859 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
860 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -0400861
Elly Jonese1749eb2011-10-07 13:54:59 -0400862 /* setenv() makes a copy of the string we give it */
863 setenv(kLdPreloadEnvVar, newenv, 1);
864 free(newenv);
865 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400866}
867
Elly Jonese1749eb2011-10-07 13:54:59 -0400868static int setup_pipe(int fds[2])
869{
870 int r = pipe(fds);
871 char fd_buf[11];
872 if (r)
873 return r;
874 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
875 if (r <= 0)
876 return -EINVAL;
877 setenv(kFdEnvVar, fd_buf, 1);
878 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500879}
880
Elly Jonese1749eb2011-10-07 13:54:59 -0400881int minijail_run(struct minijail *j, const char *filename, char *const argv[])
882{
883 unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
884 char *oldenv, *oldenv_copy = NULL;
885 pid_t child_pid;
886 int pipe_fds[2];
887 int ret;
Ben Chan541c7e52011-08-26 14:55:53 -0700888
Elly Jonese1749eb2011-10-07 13:54:59 -0400889 oldenv = getenv(kLdPreloadEnvVar);
890 if (oldenv) {
891 oldenv_copy = strdup(oldenv);
892 if (!oldenv_copy)
893 return -ENOMEM;
894 }
Will Drewryf89aef52011-09-16 16:48:57 -0500895
Elly Jonese1749eb2011-10-07 13:54:59 -0400896 if (setup_preload())
897 return -EFAULT;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500898
Elly Jonese1749eb2011-10-07 13:54:59 -0400899 /* Before we fork(2) and execve(2) the child process, we need to open
900 * a pipe(2) to send the minijail configuration over.
901 */
902 if (setup_pipe(pipe_fds))
903 return -EFAULT;
Elly Jonescd7a9042011-07-22 13:56:51 -0400904
Elly Jonese1749eb2011-10-07 13:54:59 -0400905 child_pid = syscall(SYS_clone, pidns | SIGCHLD, NULL);
906 if (child_pid < 0) {
907 free(oldenv_copy);
908 return child_pid;
909 }
Will Drewryf89aef52011-09-16 16:48:57 -0500910
Elly Jonese1749eb2011-10-07 13:54:59 -0400911 if (child_pid) {
912 /* Restore parent's LD_PRELOAD. */
913 if (oldenv_copy) {
914 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
915 free(oldenv_copy);
916 } else {
917 unsetenv(kLdPreloadEnvVar);
918 }
919 unsetenv(kFdEnvVar);
920 j->initpid = child_pid;
921 close(pipe_fds[0]); /* read endpoint */
922 ret = minijail_to_fd(j, pipe_fds[1]);
923 close(pipe_fds[1]); /* write endpoint */
924 if (ret) {
925 kill(j->initpid, SIGKILL);
926 die("failed to send marshalled minijail");
927 }
928 return 0;
929 }
930 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -0700931
Elly Jonese1749eb2011-10-07 13:54:59 -0400932 /* Drop everything that cannot be inherited across execve. */
933 minijail_preexec(j);
934 /* Jail this process and its descendants... */
935 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -0400936
Elly Jonese1749eb2011-10-07 13:54:59 -0400937 if (pidns) {
938 /* pid namespace: this process will become init inside the new
939 * namespace, so fork off a child to actually run the program
940 * (we don't want all programs we might exec to have to know
941 * how to be init).
942 */
943 child_pid = fork();
944 if (child_pid < 0)
945 _exit(child_pid);
946 else if (child_pid > 0)
947 init(child_pid); /* never returns */
948 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400949
Elly Jonese1749eb2011-10-07 13:54:59 -0400950 /* If we aren't pid-namespaced:
951 * calling process
952 * -> execve()-ing process
953 * If we are:
954 * calling process
955 * -> init()-ing process
956 * -> execve()-ing process
957 */
958 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -0400959}
960
Elly Jonese1749eb2011-10-07 13:54:59 -0400961int minijail_kill(struct minijail *j)
962{
963 int st;
964 if (kill(j->initpid, SIGTERM))
965 return -errno;
966 if (waitpid(j->initpid, &st, 0) < 0)
967 return -errno;
968 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -0400969}
970
Elly Jonese1749eb2011-10-07 13:54:59 -0400971int minijail_wait(struct minijail *j)
972{
973 int st;
974 if (waitpid(j->initpid, &st, 0) < 0)
975 return -errno;
976 if (!WIFEXITED(st))
977 return MINIJAIL_ERR_JAIL;
978 return WEXITSTATUS(st);
Elly Jonescd7a9042011-07-22 13:56:51 -0400979}
980
Elly Jonese1749eb2011-10-07 13:54:59 -0400981void minijail_destroy(struct minijail *j)
982{
983 struct seccomp_filter *f = j->filters;
984 /* Unlink the tail and head */
985 if (f)
986 f->prev->next = NULL;
987 while (f) {
988 struct seccomp_filter *next = f->next;
989 free(f->filter);
990 free(f);
991 f = next;
992 }
Elly Jones51a5b6c2011-10-12 19:09:26 -0400993 while (j->bindings_head) {
994 struct binding *b = j->bindings_head;
995 j->bindings_head = j->bindings_head->next;
996 free(b->dest);
997 free(b->src);
998 free(b);
999 }
1000 j->bindings_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04001001 if (j->user)
1002 free(j->user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001003 if (j->chrootdir)
1004 free(j->chrootdir);
Elly Jonese1749eb2011-10-07 13:54:59 -04001005 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04001006}