blob: 94c6a9da43df4b329d1a898afd7d2000f2efc404 [file] [log] [blame]
Elly Jonescd7a9042011-07-22 13:56:51 -04001/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
7#define _GNU_SOURCE
Will Drewry32ac9f52011-08-18 21:36:27 -05008#include <ctype.h>
Elly Jonescd7a9042011-07-22 13:56:51 -04009#include <errno.h>
10#include <grp.h>
11#include <inttypes.h>
Will Drewryfe4a3722011-09-16 14:50:50 -050012#include <limits.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <linux/capability.h>
14#include <linux/securebits.h>
15#include <pwd.h>
16#include <sched.h>
17#include <signal.h>
Will Drewry2f54b6a2011-09-16 13:45:31 -050018#include <stdarg.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040019#include <stdio.h>
20#include <stdlib.h>
21#include <string.h>
22#include <syscall.h>
23#include <sys/capability.h>
24#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050025#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040026#include <sys/prctl.h>
27#include <sys/wait.h>
28#include <syslog.h>
29#include <unistd.h>
30
31#include "libminijail.h"
Will Drewry32ac9f52011-08-18 21:36:27 -050032#include "libsyscalls.h"
Elly Jonescd7a9042011-07-22 13:56:51 -040033#include "libminijail-private.h"
34
/*
 * Fallback values for the seccomp-filter prctl(2) interface. They are only
 * defined here when the system headers do not provide PR_SET_SECCOMP_FILTER.
 */
#ifndef PR_SET_SECCOMP_FILTER
/* prctl() option numbers. */
# define PR_GET_SECCOMP_FILTER 35
# define PR_SET_SECCOMP_FILTER 36
# define PR_CLEAR_SECCOMP_FILTER 37
/* Filter kinds, passed as the second prctl() argument. */
# define PR_SECCOMP_FILTER_SYSCALL 0
# define PR_SECCOMP_FILTER_EVENT 1
#endif
43
/*
 * die: log a printf-style message to syslog at LOG_ERR with a
 * "libminijail: " prefix, then abort() the process.
 * |_msg| must be a string literal: it is pasted onto the prefix.
 */
#define die(_msg, ...) \
	do { \
		syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__); \
		abort(); \
	} while (0)
Elly Jonescd7a9042011-07-22 13:56:51 -040048
/*
 * pdie: same as die(), with ": " and strerror(errno) appended to the message.
 * errno is read at the point of expansion.
 */
#define pdie(_msg, ...) \
	die(_msg ": %s", ## __VA_ARGS__, strerror(errno))
Will Drewry32ac9f52011-08-18 21:36:27 -050051
/*
 * warn: log a printf-style message to syslog at LOG_WARNING with a
 * "libminijail: " prefix. Unlike die(), execution continues.
 */
#define warn(_msg, ...) \
	syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
Elly Jonescd7a9042011-07-22 13:56:51 -040054
/*
 * One seccomp filter entry. Entries are kept in a circular doubly-linked
 * list whose head is stored in struct minijail's |filters| member.
 */
struct seccomp_filter {
	int nr;				/* syscall number the filter applies to */
	char *filter;			/* heap-allocated filter string */
	struct seccomp_filter *next;	/* following entry (the head, for the tail) */
	struct seccomp_filter *prev;	/* preceding entry (the tail, for the head) */
};
60
/*
 * Configuration of a jail. The struct is copied byte-for-byte by
 * minijail_marshal()/minijail_unmarshal(), so the pointer members (|user|,
 * |filters|) are only meaningful in the process that allocated them.
 */
struct minijail {
	/*
	 * One bit per requested restriction. The fields are unsigned so that
	 * storing 1 reads back as 1; a signed one-bit field can only hold 0
	 * and -1 on common compilers.
	 */
	struct {
		unsigned int uid:1;		/* change uid to |uid| */
		unsigned int gid:1;		/* change gid to |gid| */
		unsigned int caps:1;		/* restrict capabilities to |caps| */
		unsigned int vfs:1;		/* unshare the mount namespace */
		unsigned int pids:1;		/* run in a new pid namespace */
		unsigned int seccomp:1;		/* enter seccomp mode 1 */
		unsigned int readonly:1;	/* remount /proc read-only */
		unsigned int usergroups:1;	/* initgroups() for |user| */
		unsigned int ptrace:1;		/* set by minijail_disable_ptrace() */
		unsigned int seccomp_filter:1;	/* install |filters| */
	} flags;
	uid_t uid;
	gid_t gid;
	gid_t usergid;			/* primary gid of |user| */
	char *user;			/* heap copy of the user name, or NULL */
	uint64_t caps;			/* bitmask of capabilities to keep */
	pid_t initpid;			/* pid of the child created by minijail_run() */
	int filter_count;		/* number of entries in |filters| */
	struct seccomp_filter *filters;	/* head of the circular filter list */
};
83
Elly Jonese1749eb2011-10-07 13:54:59 -040084struct minijail *minijail_new(void)
85{
86 struct minijail *j = malloc(sizeof(*j));
87 if (j)
88 memset(j, 0, sizeof(*j));
89 return j;
Elly Jonescd7a9042011-07-22 13:56:51 -040090}
91
Elly Jonese1749eb2011-10-07 13:54:59 -040092void minijail_change_uid(struct minijail *j, uid_t uid)
93{
94 if (uid == 0)
95 die("useless change to uid 0");
96 j->uid = uid;
97 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -040098}
99
Elly Jonese1749eb2011-10-07 13:54:59 -0400100void minijail_change_gid(struct minijail *j, gid_t gid)
101{
102 if (gid == 0)
103 die("useless change to gid 0");
104 j->gid = gid;
105 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400106}
107
Elly Jonese1749eb2011-10-07 13:54:59 -0400108int minijail_change_user(struct minijail *j, const char *user)
109{
110 char *buf = NULL;
111 struct passwd pw;
112 struct passwd *ppw = NULL;
113 ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX);
114 if (sz == -1)
115 sz = 65536; /* your guess is as good as mine... */
Elly Joneseb300c52011-09-22 14:35:43 -0400116
Elly Jonese1749eb2011-10-07 13:54:59 -0400117 /* sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return
118 * the maximum needed size of the buffer, so we don't have to search.
119 */
120 buf = malloc(sz);
121 if (!buf)
122 return -ENOMEM;
123 getpwnam_r(user, &pw, buf, sz, &ppw);
124 free(buf);
125 if (!ppw)
126 return -errno;
127 minijail_change_uid(j, ppw->pw_uid);
128 j->user = strdup(user);
129 if (!j->user)
130 return -ENOMEM;
131 j->usergid = ppw->pw_gid;
132 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400133}
134
Elly Jonese1749eb2011-10-07 13:54:59 -0400135int minijail_change_group(struct minijail *j, const char *group)
136{
137 char *buf = NULL;
138 struct group gr;
139 struct group *pgr = NULL;
140 ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX);
141 if (sz == -1)
142 sz = 65536; /* and mine is as good as yours, really */
Elly Joneseb300c52011-09-22 14:35:43 -0400143
Elly Jonese1749eb2011-10-07 13:54:59 -0400144 /* sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return
145 * the maximum needed size of the buffer, so we don't have to search.
146 */
147 buf = malloc(sz);
148 if (!buf)
149 return -ENOMEM;
150 getgrnam_r(group, &gr, buf, sz, &pgr);
151 free(buf);
152 if (!pgr)
153 return -errno;
154 minijail_change_gid(j, pgr->gr_gid);
155 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400156}
157
Elly Jonese1749eb2011-10-07 13:54:59 -0400158void minijail_use_seccomp(struct minijail *j)
159{
160 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400161}
162
Elly Jonese1749eb2011-10-07 13:54:59 -0400163void minijail_use_seccomp_filter(struct minijail *j)
164{
165 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500166}
167
Elly Jonese1749eb2011-10-07 13:54:59 -0400168void minijail_use_caps(struct minijail *j, uint64_t capmask)
169{
170 j->caps = capmask;
171 j->flags.caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400172}
173
Elly Jonese1749eb2011-10-07 13:54:59 -0400174void minijail_namespace_vfs(struct minijail *j)
175{
176 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400177}
178
Elly Jonese1749eb2011-10-07 13:54:59 -0400179void minijail_namespace_pids(struct minijail *j)
180{
181 j->flags.pids = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400182}
183
Elly Jonese1749eb2011-10-07 13:54:59 -0400184void minijail_remount_readonly(struct minijail *j)
185{
186 j->flags.vfs = 1;
187 j->flags.readonly = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400188}
189
Elly Jonese1749eb2011-10-07 13:54:59 -0400190void minijail_inherit_usergroups(struct minijail *j)
191{
192 j->flags.usergroups = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400193}
194
Elly Jonese1749eb2011-10-07 13:54:59 -0400195void minijail_disable_ptrace(struct minijail *j)
196{
197 j->flags.ptrace = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400198}
199
Elly Jonese1749eb2011-10-07 13:54:59 -0400200int minijail_add_seccomp_filter(struct minijail *j, int nr, const char *filter)
201{
202 struct seccomp_filter *sf;
203 if (!filter || nr < 0)
204 return -EINVAL;
Will Drewry32ac9f52011-08-18 21:36:27 -0500205
Elly Jonese1749eb2011-10-07 13:54:59 -0400206 sf = malloc(sizeof(*sf));
207 if (!sf)
208 return -ENOMEM;
209 sf->nr = nr;
210 sf->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
211 if (!sf->filter) {
212 free(sf);
213 return -ENOMEM;
214 }
Will Drewry32ac9f52011-08-18 21:36:27 -0500215
Elly Jonese1749eb2011-10-07 13:54:59 -0400216 j->filter_count++;
Will Drewryf89aef52011-09-16 16:48:57 -0500217
Elly Jonese1749eb2011-10-07 13:54:59 -0400218 if (!j->filters) {
219 j->filters = sf;
220 sf->next = sf;
221 sf->prev = sf;
222 return 0;
223 }
224 sf->next = j->filters;
225 sf->prev = j->filters->prev;
226 sf->prev->next = sf;
227 j->filters->prev = sf;
228 return 0;
Will Drewry32ac9f52011-08-18 21:36:27 -0500229}
230
Elly Jonese1749eb2011-10-07 13:54:59 -0400231int minijail_lookup_syscall(const char *name)
232{
233 const struct syscall_entry *entry = syscall_table;
234 for (; entry->name && entry->nr >= 0; ++entry)
235 if (!strcmp(entry->name, name))
236 return entry->nr;
237 return -1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500238}
239
/*
 * Trims |s| in place: skips leading blanks (space/tab) and cuts off trailing
 * blanks and newlines by writing a NUL terminator.
 *
 * Returns a pointer into |s| at the first non-blank character. Safe for empty
 * and all-blank strings: the backwards scan never moves before the returned
 * start.
 */
static char *strip(char *s)
{
	char *end;

	/* <ctype.h> functions require an unsigned char value. */
	while (*s && isblank((unsigned char)*s))
		s++;

	/* |end| points one past the last character to keep. */
	end = s + strlen(s);
	while (end > s &&
	       (isblank((unsigned char)end[-1]) || end[-1] == '\n'))
		end--;
	*end = '\0';
	return s;
}
251
Elly Jonese1749eb2011-10-07 13:54:59 -0400252void minijail_parse_seccomp_filters(struct minijail *j, const char *path)
253{
254 FILE *file = fopen(path, "r");
255 char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
256 int count = 1;
257 if (!file)
258 pdie("failed to open seccomp filters file");
Will Drewry32ac9f52011-08-18 21:36:27 -0500259
Elly Jonese1749eb2011-10-07 13:54:59 -0400260 /* Format is simple:
261 * syscall_name<COLON><FILTER STRING>[\n|EOF]
262 * #...comment...
263 * <empty line?
264 */
265 while (fgets(line, sizeof(line), file)) {
266 char *filter = line;
267 char *name = strsep(&filter, ":");
268 char *name_end = NULL;
269 int nr = -1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500270
Elly Jonese1749eb2011-10-07 13:54:59 -0400271 if (!name)
272 die("invalid filter on line %d", count);
Will Drewry32ac9f52011-08-18 21:36:27 -0500273
Elly Jonese1749eb2011-10-07 13:54:59 -0400274 name = strip(name);
Will Drewry32ac9f52011-08-18 21:36:27 -0500275
Elly Jonese1749eb2011-10-07 13:54:59 -0400276 if (!filter) {
277 if (strlen(name))
278 die("invalid filter on line %d", count);
279 /* Allow empty lines */
280 continue;
281 }
Will Drewry32ac9f52011-08-18 21:36:27 -0500282
Elly Jonese1749eb2011-10-07 13:54:59 -0400283 /* Allow comment lines */
284 if (*name == '#')
285 continue;
Will Drewry32ac9f52011-08-18 21:36:27 -0500286
Elly Jonese1749eb2011-10-07 13:54:59 -0400287 filter = strip(filter);
Will Drewry32ac9f52011-08-18 21:36:27 -0500288
Elly Jonese1749eb2011-10-07 13:54:59 -0400289 /* Take direct syscall numbers */
290 nr = strtol(name, &name_end, 0);
291 /* Or fail-over to using names */
292 if (*name_end != '\0')
293 nr = minijail_lookup_syscall(name);
294 if (nr < 0)
295 die("syscall '%s' unknown", name);
Will Drewry32ac9f52011-08-18 21:36:27 -0500296
Elly Jonese1749eb2011-10-07 13:54:59 -0400297 if (minijail_add_seccomp_filter(j, nr, filter))
298 pdie("failed to add filter for syscall '%s'", name);
299 }
300 fclose(file);
Will Drewry32ac9f52011-08-18 21:36:27 -0500301}
302
/* Cursor used while serialising a jail into a caller-supplied buffer. */
struct marshal_state {
	size_t available;	/* bytes still free at |buf| */
	size_t total;		/* bytes requested so far, whether or not they fit */
	char *buf;		/* next write position */
};
308
309static void marshal_state_init(struct marshal_state *state,
Elly Jonese1749eb2011-10-07 13:54:59 -0400310 char *buf, size_t available)
311{
312 state->available = available;
313 state->buf = buf;
314 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500315}
316
317static void marshal_append(struct marshal_state *state,
Elly Jonese1749eb2011-10-07 13:54:59 -0400318 char *src, size_t length)
319{
320 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -0500321
Elly Jonese1749eb2011-10-07 13:54:59 -0400322 /* Up to |available| will be written. */
323 if (copy_len) {
324 memcpy(state->buf, src, copy_len);
325 state->buf += copy_len;
326 state->available -= copy_len;
327 }
328 /* |total| will contain the expected length. */
329 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -0500330}
331
332static void minijail_marshal_helper(struct marshal_state *state,
Elly Jonese1749eb2011-10-07 13:54:59 -0400333 const struct minijail *j)
334{
335 marshal_append(state, (char *)j, sizeof(*j));
336 if (j->user)
337 marshal_append(state, j->user, strlen(j->user) + 1);
338 if (j->flags.seccomp_filter && j->filters) {
339 struct seccomp_filter *f = j->filters;
340 do {
341 marshal_append(state, (char *)&f->nr, sizeof(f->nr));
342 marshal_append(state, f->filter, strlen(f->filter) + 1);
343 f = f->next;
344 } while (f != j->filters);
345 }
Will Drewryf89aef52011-09-16 16:48:57 -0500346}
347
Elly Jonese1749eb2011-10-07 13:54:59 -0400348size_t minijail_size(const struct minijail *j)
349{
350 struct marshal_state state;
351 marshal_state_init(&state, NULL, 0);
352 minijail_marshal_helper(&state, j);
353 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -0500354}
355
Elly Jonese1749eb2011-10-07 13:54:59 -0400356int minijail_marshal(const struct minijail *j, char *buf, size_t available)
357{
358 struct marshal_state state;
359 marshal_state_init(&state, buf, available);
360 minijail_marshal_helper(&state, j);
361 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -0500362}
363
Elly Jonese1749eb2011-10-07 13:54:59 -0400364int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
365{
366 if (length < sizeof(*j))
367 return -EINVAL;
368 memcpy((void *)j, serialized, sizeof(*j));
369 serialized += sizeof(*j);
370 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -0500371
Elly Jonese1749eb2011-10-07 13:54:59 -0400372 if (j->user) { /* stale pointer */
373 if (!length)
374 return -EINVAL;
375 j->user = strndup(serialized, length);
376 length -= strlen(j->user) + 1;
377 serialized += strlen(j->user) + 1;
378 }
Will Drewryf89aef52011-09-16 16:48:57 -0500379
Elly Jonese1749eb2011-10-07 13:54:59 -0400380 if (j->flags.seccomp_filter && j->filter_count) {
381 int count = j->filter_count;
382 /* Let add_seccomp_filter recompute the value. */
383 j->filter_count = 0;
384 j->filters = NULL; /* Don't follow the stale pointer. */
385 for (; count > 0; --count) {
386 int *nr = (int *)serialized;
387 char *filter;
388 if (length < sizeof(*nr))
389 return -EINVAL;
390 length -= sizeof(*nr);
391 serialized += sizeof(*nr);
392 if (!length)
393 return -EINVAL;
394 filter = serialized;
395 if (minijail_add_seccomp_filter(j, *nr, filter))
396 return -EINVAL;
397 length -= strlen(filter) + 1;
398 serialized += strlen(filter) + 1;
399 }
400 }
401 return 0;
Will Drewry2ddaad02011-09-16 11:36:08 -0500402}
403
Elly Jonese1749eb2011-10-07 13:54:59 -0400404void minijail_preenter(struct minijail *j)
405{
406 /* Strip out options which are minijail_run() only. */
407 j->flags.vfs = 0;
408 j->flags.readonly = 0;
409 j->flags.pids = 0;
Will Drewryfe4a3722011-09-16 14:50:50 -0500410}
411
Elly Jonese1749eb2011-10-07 13:54:59 -0400412void minijail_preexec(struct minijail *j)
413{
414 int vfs = j->flags.vfs;
415 int readonly = j->flags.readonly;
416 if (j->user)
417 free(j->user);
418 j->user = NULL;
419 memset(&j->flags, 0, sizeof(j->flags));
420 /* Now restore anything we meant to keep. */
421 j->flags.vfs = vfs;
422 j->flags.readonly = readonly;
423 /* Note, pidns will already have been used before this call. */
Will Drewry2ddaad02011-09-16 11:36:08 -0500424}
425
/*
 * Replaces the /proc mount in the current namespace with a fresh read-only
 * one (also nodev, noexec, nosuid).
 * Returns 0 on success or -errno from the failing umount()/mount().
 */
static int remount_readonly(void)
{
	static const char proc_path[] = "/proc";
	const unsigned long mount_flags =
		MS_RDONLY | MS_NODEV | MS_NOEXEC | MS_NOSUID;

	/* Right now, we're holding a reference to our parent's old mount of
	 * /proc in our namespace, which means using MS_REMOUNT here would
	 * mutate our parent's mount as well, even though we're in a VFS
	 * namespace (!). Instead, remove their mount from our namespace
	 * and make our own.
	 */
	if (umount(proc_path) != 0)
		return -errno;
	if (mount("", proc_path, "proc", mount_flags, "") != 0)
		return -errno;
	return 0;
}
442
Elly Jonese1749eb2011-10-07 13:54:59 -0400443static void drop_caps(const struct minijail *j)
444{
445 cap_t caps = cap_get_proc();
446 cap_value_t raise_flag[1];
447 unsigned int i;
448 if (!caps)
449 die("can't get process caps");
450 if (cap_clear_flag(caps, CAP_INHERITABLE))
451 die("can't clear inheritable caps");
452 if (cap_clear_flag(caps, CAP_EFFECTIVE))
453 die("can't clear effective caps");
454 if (cap_clear_flag(caps, CAP_PERMITTED))
455 die("can't clear permitted caps");
456 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
457 if (i != CAP_SETPCAP && !(j->caps & (1 << i)))
458 continue;
459 raise_flag[0] = i;
460 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
461 die("can't add effective cap");
462 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
463 die("can't add permitted cap");
464 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
465 die("can't add inheritable cap");
466 }
467 if (cap_set_proc(caps))
468 die("can't apply cleaned capset");
469 cap_free(caps);
470 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
471 if (j->caps & (1 << i))
472 continue;
473 if (prctl(PR_CAPBSET_DROP, i))
474 pdie("prctl(PR_CAPBSET_DROP)");
475 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400476}
477
Elly Jonese1749eb2011-10-07 13:54:59 -0400478static int setup_seccomp_filters(const struct minijail *j)
479{
480 const struct seccomp_filter *sf = j->filters;
481 int ret = 0;
482 int broaden = 0;
Will Drewry32ac9f52011-08-18 21:36:27 -0500483
Elly Jonese1749eb2011-10-07 13:54:59 -0400484 /* No filters installed isn't necessarily an error. */
485 if (!sf)
486 return ret;
Will Drewry32ac9f52011-08-18 21:36:27 -0500487
Elly Jonese1749eb2011-10-07 13:54:59 -0400488 do {
489 errno = 0;
490 ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
491 sf->nr, broaden ? "1" : sf->filter);
492 if (ret) {
493 switch (errno) {
494 case ENOSYS:
495 /* TODO(wad) make this a config option */
496 if (broaden)
497 die("CONFIG_SECCOMP_FILTER is not"
498 "supported by your kernel");
499 warn("missing CONFIG_FTRACE_SYSCALLS; relaxing"
500 "the filter for %d", sf->nr);
501 broaden = 1;
502 continue;
503 case E2BIG:
504 warn("seccomp filter too long: %d", sf->nr);
505 pdie("filter too long");
506 case ENOSPC:
507 pdie("too many seccomp filters");
508 case EPERM:
509 warn("syscall filter disallowed for %d",
510 sf->nr);
511 pdie("failed to install seccomp filter");
512 case EINVAL:
513 warn("seccomp filter or call method is"
514 " invalid. %d:'%s'", sf->nr, sf->filter);
515 default:
516 pdie("failed to install seccomp filter");
517 }
518 }
519 sf = sf->next;
520 broaden = 0;
521 } while (sf != j->filters);
522 return ret;
Will Drewry32ac9f52011-08-18 21:36:27 -0500523}
524
Elly Jonese1749eb2011-10-07 13:54:59 -0400525void minijail_enter(const struct minijail *j)
526{
527 if (j->flags.pids)
528 die("tried to enter a pid-namespaced jail;"
529 "try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -0400530
Elly Jonese1749eb2011-10-07 13:54:59 -0400531 if (j->flags.seccomp_filter && setup_seccomp_filters(j))
532 pdie("failed to configure seccomp filters");
Will Drewry32ac9f52011-08-18 21:36:27 -0500533
Elly Jonese1749eb2011-10-07 13:54:59 -0400534 if (j->flags.usergroups && !j->user)
535 die("usergroup inheritance without username");
Elly Jonescd7a9042011-07-22 13:56:51 -0400536
Elly Jonese1749eb2011-10-07 13:54:59 -0400537 /* We can't recover from failures if we've dropped privileges partially,
538 * so we don't even try. If any of our operations fail, we abort() the
539 * entire process.
540 */
541 if (j->flags.vfs && unshare(CLONE_NEWNS))
542 pdie("unshare");
Elly Jonescd7a9042011-07-22 13:56:51 -0400543
Elly Jonese1749eb2011-10-07 13:54:59 -0400544 if (j->flags.readonly && remount_readonly())
545 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -0400546
Elly Jonese1749eb2011-10-07 13:54:59 -0400547 if (j->flags.caps) {
548 /* POSIX capabilities are a bit tricky. If we drop our
549 * capability to change uids, our attempt to use setuid()
550 * below will fail. Hang on to root caps across setuid(), then
551 * lock securebits.
552 */
553 if (prctl(PR_SET_KEEPCAPS, 1))
554 pdie("prctl(PR_SET_KEEPCAPS)");
555 if (prctl
556 (PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
557 pdie("prctl(PR_SET_SECUREBITS)");
558 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400559
Elly Jonese1749eb2011-10-07 13:54:59 -0400560 if (j->flags.usergroups) {
561 if (initgroups(j->user, j->usergid))
562 pdie("initgroups");
563 } else {
564 /* Only attempt to clear supplemental groups if we are changing
565 * users. */
566 if ((j->uid || j->gid) && setgroups(0, NULL))
567 pdie("setgroups");
568 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400569
Elly Jonese1749eb2011-10-07 13:54:59 -0400570 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
571 pdie("setresgid");
Elly Jonescd7a9042011-07-22 13:56:51 -0400572
Elly Jonese1749eb2011-10-07 13:54:59 -0400573 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
574 pdie("setresuid");
Elly Jonescd7a9042011-07-22 13:56:51 -0400575
Elly Jonese1749eb2011-10-07 13:54:59 -0400576 if (j->flags.caps)
577 drop_caps(j);
Elly Jonescd7a9042011-07-22 13:56:51 -0400578
Elly Jonese1749eb2011-10-07 13:54:59 -0400579 /* seccomp has to come last since it cuts off all the other
580 * privilege-dropping syscalls :)
581 */
582 if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
583 pdie("prctl(PR_SET_SECCOMP, 13)");
Will Drewry32ac9f52011-08-18 21:36:27 -0500584
Elly Jonese1749eb2011-10-07 13:54:59 -0400585 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
586 pdie("prctl(PR_SET_SECCOMP)");
Elly Jonescd7a9042011-07-22 13:56:51 -0400587}
588
/*
 * Wait status of the jailed root process, recorded by init() and read by the
 * SIGTERM handler. It is shared with a signal handler, hence
 * volatile sig_atomic_t.
 */
static volatile sig_atomic_t init_exitstatus = 0;
590
Elly Jonese1749eb2011-10-07 13:54:59 -0400591static void init_term(int __attribute__ ((unused)) sig)
592{
593 _exit(init_exitstatus);
Elly Jonescd7a9042011-07-22 13:56:51 -0400594}
595
Elly Jonese1749eb2011-10-07 13:54:59 -0400596static int init(pid_t rootpid)
597{
598 pid_t pid;
599 int status;
600 /* so that we exit with the right status */
601 signal(SIGTERM, init_term);
602 /* TODO(wad) self jail with seccomp_filters here. */
603 while ((pid = wait(&status)) > 0) {
604 /* This loop will only end when either there are no processes
605 * left inside our pid namespace or we get a signal.
606 */
607 if (pid == rootpid)
608 init_exitstatus = status;
609 }
610 if (!WIFEXITED(init_exitstatus))
611 _exit(MINIJAIL_ERR_INIT);
612 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -0400613}
614
/*
 * Reads exactly |count| bytes from |fd| into |dst|, retrying after EINTR and
 * after short reads. Returns 0 on success, -1 on a read error or if end of
 * file arrives first.
 */
static int minijail_read_fully(int fd, void *dst, size_t count)
{
	char *p = dst;

	while (count > 0) {
		ssize_t n = read(fd, p, count);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (n == 0)
			return -1;
		p += n;
		count -= (size_t)n;
	}
	return 0;
}

/*
 * Reads a jail sent by minijail_to_fd() from |fd| into |j|. The stream is a
 * size_t length followed by that many bytes of marshalled jail.
 *
 * Returns 0 on success, -EINVAL on a short or failed read or an empty
 * payload, -E2BIG if the announced size exceeds USHRT_MAX, -ENOMEM on
 * allocation failure, or the error from minijail_unmarshal().
 */
int minijail_from_fd(int fd, struct minijail *j)
{
	size_t sz = 0;
	char *buf;
	int r;

	if (minijail_read_fully(fd, &sz, sizeof(sz)))
		return -EINVAL;
	if (sz == 0)
		return -EINVAL;
	if (sz > USHRT_MAX)	/* Arbitrary sanity check */
		return -E2BIG;
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	if (minijail_read_fully(fd, buf, sz)) {
		free(buf);
		return -EINVAL;
	}
	r = minijail_unmarshal(j, buf, sz);
	free(buf);
	return r;
}
637
/*
 * Writes exactly |count| bytes from |src| to |fd|, retrying after EINTR and
 * after short writes. Returns 0 on success, -1 on a write error.
 */
static int minijail_write_fully(int fd, const void *src, size_t count)
{
	const char *p = src;

	while (count > 0) {
		ssize_t n = write(fd, p, count);

		if (n < 0) {
			if (errno == EINTR)
				continue;
			return -1;
		}
		p += n;
		count -= (size_t)n;
	}
	return 0;
}

/*
 * Sends |j| over |fd| as [size][marshalled minijail], the format
 * minijail_from_fd() reads.
 *
 * Returns 0 on success, -EINVAL if the jail marshals to zero bytes, -ENOMEM
 * on allocation failure, -EFAULT if a write fails, or the non-zero result of
 * minijail_marshal().
 */
int minijail_to_fd(struct minijail *j, int fd)
{
	char *buf;
	size_t sz = minijail_size(j);
	int r;

	if (!sz)
		return -EINVAL;
	buf = malloc(sz);
	if (!buf)
		return -ENOMEM;
	r = minijail_marshal(j, buf, sz);
	if (r)
		goto out;
	/* Sends [size][minijail]. */
	if (minijail_write_fully(fd, &sz, sizeof(sz)) ||
	    minijail_write_fully(fd, buf, sz))
		r = -EFAULT;
out:
	free(buf);
	return r;
}
Elly Jonescd7a9042011-07-22 13:56:51 -0400667
Elly Jonese1749eb2011-10-07 13:54:59 -0400668static int setup_preload(void)
669{
670 char *oldenv = getenv(kLdPreloadEnvVar) ? : "";
671 char *newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
672 if (!newenv)
673 return -ENOMEM;
Elly Jonescd7a9042011-07-22 13:56:51 -0400674
Elly Jonese1749eb2011-10-07 13:54:59 -0400675 /* Only insert a separating space if we have something to separate... */
676 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "",
677 PRELOADPATH);
Elly Jonescd7a9042011-07-22 13:56:51 -0400678
Elly Jonese1749eb2011-10-07 13:54:59 -0400679 /* setenv() makes a copy of the string we give it */
680 setenv(kLdPreloadEnvVar, newenv, 1);
681 free(newenv);
682 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400683}
684
Elly Jonese1749eb2011-10-07 13:54:59 -0400685static int setup_pipe(int fds[2])
686{
687 int r = pipe(fds);
688 char fd_buf[11];
689 if (r)
690 return r;
691 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
692 if (r <= 0)
693 return -EINVAL;
694 setenv(kFdEnvVar, fd_buf, 1);
695 return 0;
Will Drewryf89aef52011-09-16 16:48:57 -0500696}
697
Elly Jonese1749eb2011-10-07 13:54:59 -0400698int minijail_run(struct minijail *j, const char *filename, char *const argv[])
699{
700 unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
701 char *oldenv, *oldenv_copy = NULL;
702 pid_t child_pid;
703 int pipe_fds[2];
704 int ret;
Ben Chan541c7e52011-08-26 14:55:53 -0700705
Elly Jonese1749eb2011-10-07 13:54:59 -0400706 oldenv = getenv(kLdPreloadEnvVar);
707 if (oldenv) {
708 oldenv_copy = strdup(oldenv);
709 if (!oldenv_copy)
710 return -ENOMEM;
711 }
Will Drewryf89aef52011-09-16 16:48:57 -0500712
Elly Jonese1749eb2011-10-07 13:54:59 -0400713 if (setup_preload())
714 return -EFAULT;
Will Drewry2f54b6a2011-09-16 13:45:31 -0500715
Elly Jonese1749eb2011-10-07 13:54:59 -0400716 /* Before we fork(2) and execve(2) the child process, we need to open
717 * a pipe(2) to send the minijail configuration over.
718 */
719 if (setup_pipe(pipe_fds))
720 return -EFAULT;
Elly Jonescd7a9042011-07-22 13:56:51 -0400721
Elly Jonese1749eb2011-10-07 13:54:59 -0400722 child_pid = syscall(SYS_clone, pidns | SIGCHLD, NULL);
723 if (child_pid < 0) {
724 free(oldenv_copy);
725 return child_pid;
726 }
Will Drewryf89aef52011-09-16 16:48:57 -0500727
Elly Jonese1749eb2011-10-07 13:54:59 -0400728 if (child_pid) {
729 /* Restore parent's LD_PRELOAD. */
730 if (oldenv_copy) {
731 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
732 free(oldenv_copy);
733 } else {
734 unsetenv(kLdPreloadEnvVar);
735 }
736 unsetenv(kFdEnvVar);
737 j->initpid = child_pid;
738 close(pipe_fds[0]); /* read endpoint */
739 ret = minijail_to_fd(j, pipe_fds[1]);
740 close(pipe_fds[1]); /* write endpoint */
741 if (ret) {
742 kill(j->initpid, SIGKILL);
743 die("failed to send marshalled minijail");
744 }
745 return 0;
746 }
747 free(oldenv_copy);
Ben Chan541c7e52011-08-26 14:55:53 -0700748
Elly Jonese1749eb2011-10-07 13:54:59 -0400749 /* Drop everything that cannot be inherited across execve. */
750 minijail_preexec(j);
751 /* Jail this process and its descendants... */
752 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -0400753
Elly Jonese1749eb2011-10-07 13:54:59 -0400754 if (pidns) {
755 /* pid namespace: this process will become init inside the new
756 * namespace, so fork off a child to actually run the program
757 * (we don't want all programs we might exec to have to know
758 * how to be init).
759 */
760 child_pid = fork();
761 if (child_pid < 0)
762 _exit(child_pid);
763 else if (child_pid > 0)
764 init(child_pid); /* never returns */
765 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400766
Elly Jonese1749eb2011-10-07 13:54:59 -0400767 /* If we aren't pid-namespaced:
768 * calling process
769 * -> execve()-ing process
770 * If we are:
771 * calling process
772 * -> init()-ing process
773 * -> execve()-ing process
774 */
775 _exit(execve(filename, argv, environ));
Elly Jonescd7a9042011-07-22 13:56:51 -0400776}
777
Elly Jonese1749eb2011-10-07 13:54:59 -0400778int minijail_kill(struct minijail *j)
779{
780 int st;
781 if (kill(j->initpid, SIGTERM))
782 return -errno;
783 if (waitpid(j->initpid, &st, 0) < 0)
784 return -errno;
785 return st;
Elly Jonescd7a9042011-07-22 13:56:51 -0400786}
787
Elly Jonese1749eb2011-10-07 13:54:59 -0400788int minijail_wait(struct minijail *j)
789{
790 int st;
791 if (waitpid(j->initpid, &st, 0) < 0)
792 return -errno;
793 if (!WIFEXITED(st))
794 return MINIJAIL_ERR_JAIL;
795 return WEXITSTATUS(st);
Elly Jonescd7a9042011-07-22 13:56:51 -0400796}
797
Elly Jonese1749eb2011-10-07 13:54:59 -0400798void minijail_destroy(struct minijail *j)
799{
800 struct seccomp_filter *f = j->filters;
801 /* Unlink the tail and head */
802 if (f)
803 f->prev->next = NULL;
804 while (f) {
805 struct seccomp_filter *next = f->next;
806 free(f->filter);
807 free(f);
808 f = next;
809 }
810 if (j->user)
811 free(j->user);
812 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -0400813}