| /* Copyright 2017 The Chromium OS Authors. All rights reserved. |
| * Use of this source code is governed by a BSD-style license that can be |
| * found in the LICENSE file. |
| */ |
| |
| #include "system.h" |
| |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <grp.h> |
| #include <net/if.h> |
| #include <pwd.h> |
| #include <stdbool.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <sys/ioctl.h> |
| #include <sys/prctl.h> |
| #include <sys/socket.h> |
| #include <sys/stat.h> |
| #include <sys/statvfs.h> |
| #include <unistd.h> |
| |
| #include <linux/securebits.h> |
| |
| #include "syscall_wrapper.h" |
| #include "util.h" |
| |
/*
 * The NO_CAP_AMBIENT_RAISE securebit (and its lock bit) only appeared in
 * kernel 4.3. Provide fallback definitions for builds against securebits
 * headers that do not define them.
 */
#if !defined(SECBIT_NO_CAP_AMBIENT_RAISE)
#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(6))
#endif

#if !defined(SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED)
#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED (issecure_mask(7))
#endif
| |
/*
 * Pin the value of SECURE_ALL_BITS at build time on Android.
 * Android devices are currently compiled against 4.4 kernel headers, and
 * kernel 4.3 added a new securebit. Whenever another securebit is added, the
 * grown SECURE_ALL_BITS mask is rejected with EPERM by older kernels; this
 * assertion turns that runtime failure into a compile-time one.
 */
#ifdef __ANDROID__
_Static_assert(SECURE_ALL_BITS == 0x55, "SECURE_ALL_BITS == 0x55.");
#endif
| |
/*
 * Largest scratch-buffer size (1 MiB) that the lookup_(user|group) functions
 * will retry with before giving up.
 */
#define MAX_PWENT_SZ (1024 * 1024)
#define MAX_GRENT_SZ (1024 * 1024)
| |
/*
 * secure_noroot_set_and_locked: Returns non-zero iff both SECBIT_NOROOT and
 * SECBIT_NOROOT_LOCKED are present in |mask|.
 */
int secure_noroot_set_and_locked(uint64_t mask)
{
	const uint64_t wanted = SECBIT_NOROOT | SECBIT_NOROOT_LOCKED;

	return (mask & wanted) == wanted;
}
| |
/*
 * lock_securebits: Sets and locks the securebits of the calling process.
 *
 * |skip_mask| lists bits that must be left untouched. When
 * |require_keep_caps| is false, SECBIT_KEEP_CAPS is allowed to stay off if it
 * is already locked off.
 *
 * Returns 0 on success (including when no bits are left to set) and -1 if a
 * prctl(2) call fails.
 */
int lock_securebits(uint64_t skip_mask, bool require_keep_caps)
{
	/* Start from "everything set and locked", then carve out exceptions. */
	unsigned long bits = SECURE_ALL_BITS | SECURE_ALL_LOCKS;

	/*
	 * SECBIT_KEEP_CAPS is cleared automatically on execve(2), so a process
	 * that already has it locked (e.g. nested minijail usage) would fail
	 * when trying to set it again. Unless the caller insists on it, leave
	 * it off when it is both locked and currently clear.
	 */
	if (!require_keep_caps) {
		int cur = prctl(PR_GET_SECUREBITS);
		if (cur < 0) {
			pwarn("prctl(PR_GET_SECUREBITS) failed");
			return -1;
		}

		bool keep_caps_locked = (cur & SECBIT_KEEP_CAPS_LOCKED) != 0;
		bool keep_caps_set = (cur & SECBIT_KEEP_CAPS) != 0;
		if (keep_caps_locked && !keep_caps_set)
			bits &= ~SECBIT_KEEP_CAPS;
	}

	/*
	 * Raising an ambient capability requires it to be in both the
	 * permitted and the inheritable set already. Those sets are configured
	 * separately, so locking NO_CAP_AMBIENT_RAISE adds nothing.
	 */
	bits &= ~(SECBIT_NO_CAP_AMBIENT_RAISE |
		  SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED);

	/* Honor the caller's request to leave certain bits alone. */
	bits &= ~skip_mask;

	if (bits == 0) {
		warn("not locking any securebits");
		return 0;
	}

	if (prctl(PR_SET_SECUREBITS, bits) < 0) {
		pwarn("prctl(PR_SET_SECUREBITS) failed");
		return -1;
	}

	return 0;
}
| |
| int write_proc_file(pid_t pid, const char *content, const char *basename) |
| { |
| int fd, ret; |
| size_t sz, len; |
| ssize_t written; |
| char filename[32]; |
| |
| sz = sizeof(filename); |
| ret = snprintf(filename, sz, "/proc/%d/%s", pid, basename); |
| if (ret < 0 || (size_t)ret >= sz) { |
| warn("failed to generate %s filename", basename); |
| return -1; |
| } |
| |
| fd = open(filename, O_WRONLY | O_CLOEXEC); |
| if (fd < 0) { |
| pwarn("failed to open '%s'", filename); |
| return -errno; |
| } |
| |
| len = strlen(content); |
| written = write(fd, content, len); |
| if (written < 0) { |
| pwarn("failed to write '%s'", filename); |
| return -errno; |
| } |
| |
| if ((size_t)written < len) { |
| warn("failed to write %zu bytes to '%s'", len, filename); |
| return -1; |
| } |
| close(fd); |
| return 0; |
| } |
| |
/*
 * get_last_valid_cap: Returns the highest capability number supported by the
 * running kernel.
 *
 * We specifically do not use cap_valid() as that only tells us the last
 * valid cap we were *compiled* against (i.e. what the version of kernel
 * headers says). If we run on a different kernel version, then it's not
 * uncommon for that to be less (if an older kernel) or more (if a newer
 * kernel).
 * Normally, we read the answer from /proc. On Android, not all processes are
 * guaranteed to be able to access '/proc/sys/kernel/cap_last_cap' so we
 * programmatically find the value by calling prctl(PR_CAPBSET_READ).
 *
 * Failure to open or parse the /proc file is reported through pdie(), which
 * is expected not to return.
 */
unsigned int get_last_valid_cap(void)
{
	unsigned int last_valid_cap = 0;
	if (is_android()) {
		/* Probe upwards until the kernel rejects the capability. */
		while (prctl(PR_CAPBSET_READ, last_valid_cap, 0, 0, 0) >= 0)
			++last_valid_cap;

		/* |last_valid_cap| will be the first failing value. */
		if (last_valid_cap > 0) {
			last_valid_cap--;
		}
	} else {
		const char cap_file[] = "/proc/sys/kernel/cap_last_cap";
		FILE *fp = fopen(cap_file, "re");
		/* Never hand a NULL stream to fscanf(3). */
		if (!fp)
			pdie("fopen(%s)", cap_file);
		if (fscanf(fp, "%u", &last_valid_cap) != 1)
			pdie("fscanf(%s)", cap_file);
		fclose(fp);
	}
	return last_valid_cap;
}
| |
/*
 * cap_ambient_supported: Returns non-zero if the PR_CAP_AMBIENT prctl(2)
 * query succeeds (probed with CAP_CHOWN), zero otherwise.
 */
int cap_ambient_supported(void)
{
	int rc = prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_CHOWN, 0, 0);

	return rc >= 0;
}
| |
| int config_net_loopback(void) |
| { |
| const char ifname[] = "lo"; |
| int sock; |
| struct ifreq ifr; |
| |
| /* Make sure people don't try to add really long names. */ |
| _Static_assert(sizeof(ifname) <= IFNAMSIZ, "interface name too long"); |
| |
| sock = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0); |
| if (sock < 0) { |
| pwarn("socket(AF_LOCAL) failed"); |
| return -1; |
| } |
| |
| /* |
| * Do the equiv of `ip link set up lo`. The kernel will assign |
| * IPv4 (127.0.0.1) & IPv6 (::1) addresses automatically! |
| */ |
| strcpy(ifr.ifr_name, ifname); |
| if (ioctl(sock, SIOCGIFFLAGS, &ifr) < 0) { |
| pwarn("ioctl(SIOCGIFFLAGS) failed"); |
| return -1; |
| } |
| |
| /* The kernel preserves ifr.ifr_name for use. */ |
| ifr.ifr_flags |= IFF_UP | IFF_RUNNING; |
| if (ioctl(sock, SIOCSIFFLAGS, &ifr) < 0) { |
| pwarn("ioctl(SIOCSIFFLAGS) failed"); |
| return -1; |
| } |
| |
| close(sock); |
| return 0; |
| } |
| |
/*
 * write_pid_to_path: Writes |pid| followed by a newline to the file at
 * |path|, opened with fopen(3) mode "we".
 *
 * Returns 0 on success, -errno if fopen(3) or fclose(3) fails, and -1 if
 * fprintf(3) fails. The stream is closed on every path.
 */
int write_pid_to_path(pid_t pid, const char *path)
{
	int rc;
	FILE *fp = fopen(path, "we");

	if (!fp) {
		/* Save errno before logging, which may overwrite it. */
		rc = errno;
		pwarn("failed to open '%s'", path);
		return -rc;
	}
	if (fprintf(fp, "%d\n", (int)pid) < 0) {
		/* fprintf(3) does not set errno on failure. */
		warn("fprintf(%s) failed", path);
		/* Don't leak the stream; the write already failed. */
		fclose(fp);
		return -1;
	}
	if (fclose(fp)) {
		rc = errno;
		pwarn("fclose(%s) failed", path);
		return -rc;
	}

	return 0;
}
| |
/*
 * Create the |path| directory and its parents (if need be) with |mode|.
 * If not |isdir|, then |path| is actually a file, so the last component
 * will not be created.
 *
 * Components that already exist (EEXIST) are not treated as errors.
 * Returns 0 on success or a negative errno value on failure.
 */
int mkdir_p(const char *path, mode_t mode, bool isdir)
{
	int rc;
	char *dir = strdup(path);
	if (!dir) {
		rc = errno;
		pwarn("strdup(%s) failed", path);
		return -rc;
	}

	/*
	 * Starting from the root, work our way out to the end. The search
	 * begins at the second character so that a leading '/' is skipped.
	 * An empty |path| has no second character, so don't look past its
	 * terminator.
	 */
	char *p = dir[0] != '\0' ? strchr(dir + 1, '/') : NULL;
	while (p) {
		/* Temporarily cut the string to create this prefix. */
		*p = '\0';
		if (mkdir(dir, mode) && errno != EEXIST) {
			rc = errno;
			pwarn("mkdir(%s, 0%o) failed", dir, mode);
			free(dir);
			return -rc;
		}
		*p = '/';
		p = strchr(p + 1, '/');
	}

	/*
	 * Create the last directory. We still check EEXIST here in case
	 * of trailing slashes.
	 */
	free(dir);
	if (isdir && mkdir(path, mode) && errno != EEXIST) {
		rc = errno;
		pwarn("mkdir(%s, 0%o) failed", path, mode);
		return -rc;
	}
	return 0;
}
| |
| /* |
| * setup_mount_destination: Ensures the mount target exists. |
| * Creates it if needed and possible. |
| */ |
| int setup_mount_destination(const char *source, const char *dest, uid_t uid, |
| uid_t gid, bool bind, unsigned long *mnt_flags) |
| { |
| int rc; |
| struct stat st_buf; |
| bool domkdir; |
| |
| rc = stat(dest, &st_buf); |
| if (rc == 0) /* destination exists */ |
| return 0; |
| |
| /* |
| * Try to create the destination. |
| * Either make a directory or touch a file depending on the source type. |
| * |
| * If the source isn't an absolute path, assume it is a filesystem type |
| * such as "tmpfs" and create a directory to mount it on. The dest will |
| * be something like "none" or "proc" which we shouldn't be checking. |
| */ |
| if (source[0] == '/') { |
| /* The source is an absolute path -- it better exist! */ |
| rc = stat(source, &st_buf); |
| if (rc) { |
| rc = errno; |
| pwarn("stat(%s) failed", source); |
| return -rc; |
| } |
| |
| /* |
| * If bind mounting, we only create a directory if the source |
| * is a directory, else we always bind mount it as a file to |
| * support device nodes, sockets, etc... |
| * |
| * For all other mounts, we assume a block/char source is |
| * going to want a directory to mount to. If the source is |
| * something else (e.g. a fifo or socket), this probably will |
| * not do the right thing, but we'll fail later on when we try |
| * to mount(), so shouldn't be a big deal. |
| */ |
| domkdir = S_ISDIR(st_buf.st_mode) || |
| (!bind && (S_ISBLK(st_buf.st_mode) || |
| S_ISCHR(st_buf.st_mode))); |
| |
| /* If bind mounting, also grab the mount flags of the source. */ |
| if (bind && mnt_flags) { |
| struct statvfs stvfs_buf; |
| rc = statvfs(source, &stvfs_buf); |
| if (rc) { |
| rc = errno; |
| pwarn( |
| "failed to look up mount flags: source=%s", |
| source); |
| return -rc; |
| } |
| *mnt_flags = stvfs_buf.f_flag; |
| } |
| } else { |
| /* The source is a relative path -- assume it's a pseudo fs. */ |
| |
| /* Disallow relative bind mounts. */ |
| if (bind) { |
| warn("relative bind-mounts are not allowed: source=%s", |
| source); |
| return -EINVAL; |
| } |
| |
| domkdir = true; |
| } |
| |
| /* |
| * Now that we know what we want to do, do it! |
| * We always create the intermediate dirs and the final path with 0755 |
| * perms and root/root ownership. This shouldn't be a problem because |
| * the actual mount will set those perms/ownership on the mount point |
| * which is all people should need to access it. |
| */ |
| rc = mkdir_p(dest, 0755, domkdir); |
| if (rc) |
| return rc; |
| if (!domkdir) { |
| int fd = open(dest, O_RDWR | O_CREAT | O_CLOEXEC, 0700); |
| if (fd < 0) { |
| rc = errno; |
| pwarn("open(%s) failed", dest); |
| return -rc; |
| } |
| close(fd); |
| } |
| if (chown(dest, uid, gid)) { |
| rc = errno; |
| pwarn("chown(%s, %u, %u) failed", dest, uid, gid); |
| return -rc; |
| } |
| return 0; |
| } |
| |
| /* |
| * lookup_user: Gets the uid/gid for the given username. |
| */ |
| int lookup_user(const char *user, uid_t *uid, gid_t *gid) |
| { |
| char *buf = NULL; |
| struct passwd pw; |
| struct passwd *ppw = NULL; |
| /* |
| * sysconf(_SC_GETPW_R_SIZE_MAX), under glibc, is documented to return |
| * a suggested starting size for the buffer, so let's try getting this |
| * size first, and fallback to a default othersise. |
| */ |
| ssize_t sz = sysconf(_SC_GETPW_R_SIZE_MAX); |
| if (sz == -1) |
| sz = 65536; /* your guess is as good as mine... */ |
| |
| do { |
| buf = malloc(sz); |
| if (!buf) |
| return -ENOMEM; |
| int err = getpwnam_r(user, &pw, buf, sz, &ppw); |
| /* |
| * We're safe to free the buffer here. The strings inside |pw| |
| * point inside |buf|, but we don't use any of them; this leaves |
| * the pointers dangling but it's safe. |
| * |ppw| points at |pw| if getpwnam_r(3) succeeded. |
| */ |
| free(buf); |
| if (err == ERANGE) { |
| /* |buf| was too small, retry with a bigger one. */ |
| sz <<= 1; |
| } else if (err != 0) { |
| /* We got an error not related to the size of |buf|. */ |
| return -err; |
| } else if (!ppw) { |
| /* Not found. */ |
| return -ENOENT; |
| } else { |
| *uid = ppw->pw_uid; |
| *gid = ppw->pw_gid; |
| return 0; |
| } |
| } while (sz <= MAX_PWENT_SZ); |
| |
| /* A buffer of size MAX_PWENT_SZ is still too small, return an error. */ |
| return -ERANGE; |
| } |
| |
| /* |
| * lookup_group: Gets the gid for the given group name. |
| */ |
| int lookup_group(const char *group, gid_t *gid) |
| { |
| char *buf = NULL; |
| struct group gr; |
| struct group *pgr = NULL; |
| /* |
| * sysconf(_SC_GETGR_R_SIZE_MAX), under glibc, is documented to return |
| * a suggested starting size for the buffer, so let's try getting this |
| * size first, and fallback to a default otherwise. |
| */ |
| ssize_t sz = sysconf(_SC_GETGR_R_SIZE_MAX); |
| if (sz == -1) |
| sz = 65536; /* and mine is as good as yours, really */ |
| |
| do { |
| buf = malloc(sz); |
| if (!buf) |
| return -ENOMEM; |
| int err = getgrnam_r(group, &gr, buf, sz, &pgr); |
| /* |
| * We're safe to free the buffer here. The strings inside |gr| |
| * point inside |buf|, but we don't use any of them; this leaves |
| * the pointers dangling but it's safe. |
| * |pgr| points at |gr| if getgrnam_r(3) succeeded. |
| */ |
| free(buf); |
| if (err == ERANGE) { |
| /* |buf| was too small, retry with a bigger one. */ |
| sz <<= 1; |
| } else if (err != 0) { |
| /* We got an error not related to the size of |buf|. */ |
| return -err; |
| } else if (!pgr) { |
| /* Not found. */ |
| return -ENOENT; |
| } else { |
| *gid = pgr->gr_gid; |
| return 0; |
| } |
| } while (sz <= MAX_GRENT_SZ); |
| |
| /* A buffer of size MAX_GRENT_SZ is still too small, return an error. */ |
| return -ERANGE; |
| } |
| |
/*
 * seccomp_action_is_available: Reports whether |wanted| appears in the first
 * line of /proc/sys/kernel/seccomp/actions_avail.
 *
 * Returns false on Android (the file is not consulted there), when the file
 * cannot be opened or read, or when |wanted| is not found. The stream and the
 * line buffer are released on every path.
 */
static bool seccomp_action_is_available(const char *wanted)
{
	if (is_android()) {
		/*
		 * Accessing |actions_avail| is generating SELinux denials, so
		 * skip for now.
		 * TODO(crbug.com/978022, jorgelo): Remove once the denial is
		 * fixed.
		 */
		return false;
	}
	const char actions_avail_path[] =
	    "/proc/sys/kernel/seccomp/actions_avail";
	FILE *f = fopen(actions_avail_path, "re");

	if (!f) {
		pwarn("fopen(%s) failed", actions_avail_path);
		return false;
	}

	char *actions_avail = NULL;
	size_t buf_size = 0;
	bool available = false;
	if (getline(&actions_avail, &buf_size, f) < 0) {
		pwarn("getline() failed");
	} else {
		/*
		 * This is just substring search, which means that partial
		 * matches will match too (e.g. "action" would match
		 * "longaction"). There are no seccomp actions which include
		 * other actions though, so we're good for now. Eventually we
		 * might want to split the string by spaces.
		 */
		available = strstr(actions_avail, wanted) != NULL;
	}

	/* getline(3) may allocate even on failure, so always free. */
	free(actions_avail);
	fclose(f);
	return available;
}
| |
/*
 * seccomp_ret_log_available: Returns non-zero if the "log" seccomp action is
 * reported as available. The answer is computed once and then cached.
 */
int seccomp_ret_log_available(void)
{
	/* -1 means "not probed yet"; afterwards it holds the cached answer. */
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("log");
	return cached;
}
| |
/*
 * seccomp_ret_kill_process_available: Returns non-zero if the "kill_process"
 * seccomp action is reported as available. The answer is computed once and
 * then cached.
 */
int seccomp_ret_kill_process_available(void)
{
	/* -1 means "not probed yet"; afterwards it holds the cached answer. */
	static int cached = -1;

	if (cached != -1)
		return cached;

	cached = seccomp_action_is_available("kill_process");
	return cached;
}
| |
| bool seccomp_filter_flags_available(unsigned int flags) |
| { |
| return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1 || |
| errno != EINVAL; |
| } |