blob: 2afa236e60ce1b2b6b4a91f3c356c5b2ca8a3623 [file] [log] [blame]
Elly Jonescd7a9042011-07-22 13:56:51 -04001/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE file. */
4
5#define _BSD_SOURCE
6#define _GNU_SOURCE
7#include <errno.h>
8#include <grp.h>
9#include <inttypes.h>
10#include <linux/capability.h>
11#include <linux/securebits.h>
12#include <pwd.h>
13#include <sched.h>
14#include <signal.h>
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <syscall.h>
19#include <sys/capability.h>
20#include <sys/mount.h>
21#include <sys/prctl.h>
22#include <sys/wait.h>
23#include <syslog.h>
24#include <unistd.h>
25
26#include "libminijail.h"
27#include "libminijail-private.h"
28
/* Description of a jail: which restrictions were requested (flags) and the
 * parameters those restrictions need. */
struct minijail {
  /* One bit per requested restriction.  The fields are unsigned on purpose: a
   * signed 1-bit bitfield can only represent 0 and -1, so storing 1 in it
   * would read back as -1. */
  struct {
    unsigned int uid : 1;        /* change uid to |uid| */
    unsigned int gid : 1;        /* change gid to |gid| */
    unsigned int caps : 1;       /* restrict capabilities to |caps| */
    unsigned int vfs : 1;        /* enter a new mount (VFS) namespace */
    unsigned int pids : 1;       /* enter a new pid namespace */
    unsigned int seccomp : 1;    /* enable seccomp */
    unsigned int readonly : 1;   /* mount a fresh read-only /proc */
    unsigned int usergroups : 1; /* inherit |user|'s supplementary groups */
    unsigned int ptrace : 1;     /* disable ptrace (applied post-exec) */
  } flags;
  uid_t uid;        /* target uid, valid when flags.uid is set */
  gid_t gid;        /* target gid, valid when flags.gid is set */
  gid_t usergid;    /* primary gid of |user|, passed to initgroups() */
  const char *user; /* name given to minijail_change_user(); not copied */
  uint64_t caps;    /* capability mask, bit N <=> capability number N */
  pid_t initpid;    /* pid of the child created by minijail_run() */
};
48
/* Logs "<failed> failed: <description of errno>" to syslog at LOG_ERR and then
 * abort()s the process.  Does not return. */
static void pdie(const char *failed) {
  const char *reason = strerror(errno);

  syslog(LOG_ERR, "libminijail: %s failed: %s", failed, reason);
  abort();
}
53
/* Logs |failed| verbatim to syslog at LOG_ERR and then abort()s the process.
 * Unlike pdie(), errno is not consulted.  Does not return. */
static void die(const char *failed) {
  const char *message = failed;

  syslog(LOG_ERR, "libminijail: %s", message);
  abort();
}
58
59struct minijail *minijail_new(void) {
60 struct minijail *j = malloc(sizeof(*j));
61 if (j)
62 memset(j, 0, sizeof(*j));
63 return j;
64}
65
66void minijail_change_uid(struct minijail *j, uid_t uid) {
67 if (uid == 0)
68 die("useless change to uid 0");
69 j->uid = uid;
70 j->flags.uid = 1;
71}
72
73void minijail_change_gid(struct minijail *j, gid_t gid) {
74 if (gid == 0)
75 die("useless change to gid 0");
76 j->gid = gid;
77 j->flags.gid = 1;
78}
79
80int minijail_change_user(struct minijail *j, const char *user) {
81 /* In principle this should use getpwnam(), but:
82 * 1) getpwnam_r() isn't actually reentrant anyway, since it uses a
83 * statically-allocated file descriptor internally
84 * 2) fgetpwnam() (by analogy with fgetpwent) would solve (1) except that it
85 * doesn't exist
86 * 3) sysconf() (see getpwnam_r(3)) is allowed to return a size that is not
87 * large enough, which means having to loop on growing the buffer we pass
88 * in
89 */
90 struct passwd *pw = getpwnam(user);
91 if (!pw)
92 return errno;
93 minijail_change_uid(j, pw->pw_uid);
94 j->user = user;
95 j->usergid = pw->pw_gid;
96 return 0;
97}
98
/* Looks up |group| in the group database and requests a gid change to that
 * group's gid.
 *
 * Returns 0 on success, or a positive errno value on failure (ENOENT when the
 * group simply does not exist).  If the group resolves to gid 0 the process
 * abort()s, see minijail_change_gid().
 */
int minijail_change_group(struct minijail *j, const char *group) {
  /* In principle this should use getgrnam_r(), but:
   * 1) getgrnam_r() isn't actually reentrant anyway, since it uses a
   *    statically-allocated file descriptor internally
   * 2) fgetgrnam() (by analogy with fgetgrent) would solve (1) except that it
   *    doesn't exist
   * 3) sysconf() (see getgrnam_r(3)) is allowed to return a size that is not
   *    large enough, which means having to loop on growing the buffer we pass
   *    in
   */
  struct group *gr;

  /* getgrnam() leaves errno untouched when the entry is merely missing, so
   * clear it first; otherwise a "not found" could be reported as success (0)
   * or as some stale, unrelated error. */
  errno = 0;
  gr = getgrnam(group);
  if (!gr)
    return errno ? errno : ENOENT;
  minijail_change_gid(j, gr->gr_gid);
  return 0;
}
115
116void minijail_use_seccomp(struct minijail *j) {
117 j->flags.seccomp = 1;
118}
119
120void minijail_use_caps(struct minijail *j, uint64_t capmask) {
121 j->caps = capmask;
122 j->flags.caps = 1;
123}
124
125void minijail_namespace_vfs(struct minijail *j) {
126 j->flags.vfs = 1;
127}
128
129void minijail_namespace_pids(struct minijail *j) {
130 j->flags.pids = 1;
131}
132
133void minijail_remount_readonly(struct minijail *j) {
134 j->flags.vfs = 1;
135 j->flags.readonly = 1;
136}
137
138void minijail_inherit_usergroups(struct minijail *j) {
139 j->flags.usergroups = 1;
140}
141
142void minijail_disable_ptrace(struct minijail *j) {
143 j->flags.ptrace = 1;
144}
145
/* Replaces the /proc mount visible to this process with a new read-only,
 * nodev, noexec, nosuid procfs mount.
 *
 * Returns 0 on success or the errno of the failing umount()/mount() call.
 */
static int remount_readonly(void) {
  static const char proc_path[] = "/proc";
  const unsigned long mount_flags =
      MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY;

  /* Right now, we're holding a reference to our parent's old mount of /proc in
   * our namespace, which means using MS_REMOUNT here would mutate our parent's
   * mount as well, even though we're in a VFS namespace (!). Instead, remove
   * their mount from our namespace and make our own. */
  if (umount(proc_path) != 0)
    return errno;
  if (mount("", proc_path, "proc", mount_flags, "") != 0)
    return errno;
  return 0;
}
159
160static void drop_caps(const struct minijail *j) {
161 cap_t caps = cap_get_proc();
162 cap_value_t raise_flag[1];
163 unsigned int i;
164 if (!caps)
165 die("can't get process caps");
166 if (cap_clear_flag(caps, CAP_INHERITABLE))
167 die("can't clear inheritable caps");
168 if (cap_clear_flag(caps, CAP_EFFECTIVE))
169 die("can't clear effective caps");
170 if (cap_clear_flag(caps, CAP_PERMITTED))
171 die("can't clear permitted caps");
172 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
173 if (i != CAP_SETPCAP && !(j->caps & (1 << i)))
174 continue;
175 raise_flag[0] = i;
176 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, raise_flag, CAP_SET))
177 die("can't add effective cap");
178 if (cap_set_flag(caps, CAP_PERMITTED, 1, raise_flag, CAP_SET))
179 die("can't add permitted cap");
180 if (cap_set_flag(caps, CAP_INHERITABLE, 1, raise_flag, CAP_SET))
181 die("can't add inheritable cap");
182 }
183 if (cap_set_proc(caps))
184 die("can't apply cleaned capset");
185 cap_free(caps);
186 for (i = 0; i < sizeof(j->caps) * 8 && cap_valid((int)i); ++i) {
187 if (j->caps & (1 << i))
188 continue;
189 if (prctl(PR_CAPBSET_DROP, i))
190 pdie("prctl(PR_CAPBSET_DROP)");
191 }
192}
193
194void minijail_enter(const struct minijail *j) {
195 if (j->flags.pids)
196 die("tried to enter a pid-namespaced jail; try minijail_run()?");
197
198 if (j->flags.usergroups && !j->user)
199 die("usergroup inheritance without username");
200
201 /* We can't recover from failures if we've dropped privileges partially,
202 * so we don't even try. If any of our operations fail, we abort() the
203 * entire process. */
204 if (j->flags.vfs && unshare(CLONE_NEWNS))
205 pdie("unshare");
206
207 if (j->flags.readonly && remount_readonly())
208 pdie("remount");
209
210 if (j->flags.caps) {
211 /* POSIX capabilities are a bit tricky. If we drop our capability to change
212 * uids, our attempt to use setuid() below will fail. Hang on to root caps
213 * across setuid(), then lock securebits. */
214 if (prctl(PR_SET_KEEPCAPS, 1))
215 pdie("prctl(PR_SET_KEEPCAPS)");
216 if (prctl(PR_SET_SECUREBITS, SECURE_ALL_BITS | SECURE_ALL_LOCKS))
217 pdie("prctl(PR_SET_SECUREBITS)");
218 }
219
220 if (j->flags.usergroups && initgroups(j->user, j->usergid))
221 pdie("initgroups");
222 else if (!j->flags.usergroups && setgroups(0, NULL))
223 pdie("setgroups");
224
225 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
226 pdie("setresgid");
227
228 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
229 pdie("setresuid");
230
231 if (j->flags.caps)
232 drop_caps(j);
233
234 /* seccomp has to come last since it cuts off all the other
235 * privilege-dropping syscalls :) */
236 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
237 pdie("prctl(PR_SET_SECCOMP)");
238}
239
240static int init_exitstatus = 0;
241
242static void init_term(int __attribute__((unused)) sig) {
243 _exit(init_exitstatus);
244}
245
246static int init(pid_t rootpid) {
247 pid_t pid;
248 int status;
249 signal(SIGTERM, init_term); /* so that we exit with the right status */
250 while ((pid = wait(&status)) > 0) {
251 /* This loop will only end when either there are no processes left inside
252 * our pid namespace or we get a signal. */
253 if (pid == rootpid)
254 init_exitstatus = status;
255 }
256 if (!WIFEXITED(init_exitstatus))
257 _exit(MINIJAIL_ERR_INIT);
258 _exit(WEXITSTATUS(init_exitstatus));
259}
260
261/** @brief Move any commands that need to be done post-exec into an environment
262 * variable
263 * @param j Jail to move commands from.
264 *
265 * Serializes post-exec() commands into a string, removes them from the jail,
266 * and adds them to the environment; they will be deserialized later (see
267 * __minijail_preloaded) and executed inside the execve()'d process.
268 */
269static int move_commands_to_env(struct minijail *j) {
270 const int kEnvBufSize = 256;
271 const char *ptrace = j->flags.ptrace ? "ptrace " : "";
272 const char *seccomp = j->flags.seccomp ? "seccomp " : "";
273 char setuid[64] = "";
274 char caps[32] = "";
275 char *newenv;
276 char *oldenv;
277 char *envbuf = malloc(kEnvBufSize);
278 int r;
279
280 if (!envbuf)
281 return -ENOMEM;
282
283 if (j->flags.caps)
284 snprintf(caps, sizeof(caps), "caps=%" PRIx64 " ", j->caps);
285
286 if (j->flags.uid && j->flags.caps) {
287 snprintf(setuid, sizeof(setuid), "uid=%d ", j->uid);
288 j->flags.uid = 0;
289 }
290
291 j->flags.caps = 0;
292 j->flags.ptrace = 0;
293 j->flags.seccomp = 0;
294
295 r = snprintf(envbuf, kEnvBufSize, "%s%s%s%s", setuid, ptrace, seccomp, caps);
296 if (!r) {
297 /* No commands generated, so no preload needed :) */
298 free(envbuf);
299 return 0;
300 }
301 if (r == kEnvBufSize) {
302 free(envbuf);
303 return -E2BIG;
304 }
305
Ben Chan541c7e52011-08-26 14:55:53 -0700306 oldenv = getenv(kLdPreloadEnvVar) ? : "";
Elly Jonescd7a9042011-07-22 13:56:51 -0400307 newenv = malloc(strlen(oldenv) + 2 + strlen(PRELOADPATH));
308 if (!newenv) {
309 free(envbuf);
310 return -ENOMEM;
311 }
312
313 /* Only insert a separating space if we have something to separate... */
314 sprintf(newenv, "%s%s%s", oldenv, strlen(oldenv) ? " " : "", PRELOADPATH);
315
316 /* setenv() makes a copy of the string we give it */
Ben Chan541c7e52011-08-26 14:55:53 -0700317 setenv(kLdPreloadEnvVar, newenv, 1);
Elly Jonescd7a9042011-07-22 13:56:51 -0400318 setenv(kCommandEnvVar, envbuf, 1);
319 free(newenv);
320 free(envbuf);
321 return 0;
322}
323
324int minijail_run(struct minijail *j, const char *filename, char *const argv[]) {
325 unsigned int pidns = j->flags.pids ? CLONE_NEWPID : 0;
Ben Chan541c7e52011-08-26 14:55:53 -0700326 char *oldenv, *oldenv_copy = NULL;
Elly Jonescd7a9042011-07-22 13:56:51 -0400327 pid_t r;
Ben Chan541c7e52011-08-26 14:55:53 -0700328
329 oldenv = getenv(kLdPreloadEnvVar);
330 if (oldenv) {
331 oldenv_copy = strdup(oldenv);
332 if (!oldenv_copy)
333 return -ENOMEM;
334 }
335
Elly Jonescd7a9042011-07-22 13:56:51 -0400336 r = move_commands_to_env(j);
Ben Chan541c7e52011-08-26 14:55:53 -0700337 if (r) {
338 /* No environment variable is modified if move_commands_to_env returns
339 * a non-zero value. */
340 free(oldenv_copy);
Elly Jonescd7a9042011-07-22 13:56:51 -0400341 return r;
Ben Chan541c7e52011-08-26 14:55:53 -0700342 }
Elly Jonescd7a9042011-07-22 13:56:51 -0400343
344 r = syscall(SYS_clone, pidns | SIGCHLD, NULL);
345 if (r > 0) {
Ben Chan541c7e52011-08-26 14:55:53 -0700346 if (oldenv_copy) {
347 setenv(kLdPreloadEnvVar, oldenv_copy, 1);
348 free(oldenv_copy);
349 } else {
350 unsetenv(kLdPreloadEnvVar);
351 }
352 unsetenv(kCommandEnvVar);
Elly Jonescd7a9042011-07-22 13:56:51 -0400353 j->initpid = r;
354 return 0;
355 }
Ben Chan541c7e52011-08-26 14:55:53 -0700356
357 free(oldenv_copy);
358
Elly Jonescd7a9042011-07-22 13:56:51 -0400359 if (r < 0)
360 return r;
361
362 j->flags.pids = 0;
363
364 /* Jail this process and its descendants... */
365 minijail_enter(j);
366
367 if (pidns) {
368 /* pid namespace: this process will become init inside the new namespace, so
369 * fork off a child to actually run the program (we don't want all programs
370 * we might exec to have to know how to be init). */
371 r = fork();
372 if (r < 0)
373 _exit(r);
374 else if (r > 0)
375 init(r); /* never returns */
376 }
377
378 /* If we aren't pid-namespaced:
379 * calling process
380 * -> execve()-ing process
381 * If we are:
382 * calling process
383 * -> init()-ing process
384 * -> execve()-ing process
385 */
386 _exit(execve(filename, argv, environ));
387}
388
389int minijail_kill(struct minijail *j) {
390 int st;
391 if (kill(j->initpid, SIGTERM))
392 return errno;
393 if (waitpid(j->initpid, &st, 0) < 0)
394 return errno;
395 return st;
396}
397
398int minijail_wait(struct minijail *j) {
399 int st;
400 if (waitpid(j->initpid, &st, 0) < 0)
401 return errno;
402 if (!WIFEXITED(st))
403 return MINIJAIL_ERR_JAIL;
404 return WEXITSTATUS(st);
405}
406
/* Releases a jail obtained from minijail_new().  Only the jail structure
 * itself is freed; the user name it may point to is not. */
void minijail_destroy(struct minijail *j) {
  free(j);
}