blob: 9e44c9145f1cb8c862fa54d9df8bf19bdeeb30e0 [file] [log] [blame]
Zach Reizner39aa26b2017-12-12 18:03:23 -08001// Copyright 2017 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std;
Chirantan Ekbote448516e2018-07-24 16:07:42 -07006use std::cmp::min;
Zach Reizner55a9e502018-10-03 10:22:32 -07007use std::error;
Dylan Reid059a1882018-07-23 17:58:09 -07008use std::ffi::CStr;
Zach Reizner39aa26b2017-12-12 18:03:23 -08009use std::fmt;
Dylan Reid059a1882018-07-23 17:58:09 -070010use std::fs::{File, OpenOptions};
Zach Reizner55a9e502018-10-03 10:22:32 -070011use std::io::{self, stdin, Read};
Chirantan Ekbote448516e2018-07-24 16:07:42 -070012use std::mem;
Dylan Reid059a1882018-07-23 17:58:09 -070013use std::os::unix::io::{FromRawFd, RawFd};
Zach Reizner39aa26b2017-12-12 18:03:23 -080014use std::os::unix::net::UnixDatagram;
15use std::path::{Path, PathBuf};
Chirantan Ekbote448516e2018-07-24 16:07:42 -070016use std::str;
Zach Reizner39aa26b2017-12-12 18:03:23 -080017use std::sync::atomic::{AtomicBool, Ordering};
Dylan Reid059a1882018-07-23 17:58:09 -070018use std::sync::{Arc, Barrier};
Zach Reizner39aa26b2017-12-12 18:03:23 -080019use std::thread;
20use std::thread::JoinHandle;
Zach Reizner55a9e502018-10-03 10:22:32 -070021use std::time::Duration;
Zach Reizner39aa26b2017-12-12 18:03:23 -080022
Chirantan Ekbote448516e2018-07-24 16:07:42 -070023use libc::{self, c_int};
Chirantan Ekbote448516e2018-07-24 16:07:42 -070024use rand::distributions::{IndependentSample, Range};
Zach Reizner55a9e502018-10-03 10:22:32 -070025use rand::thread_rng;
Zach Reizner39aa26b2017-12-12 18:03:23 -080026
Chirantan Ekbote448516e2018-07-24 16:07:42 -070027use byteorder::{ByteOrder, LittleEndian};
Daniel Verkamp56f283b2018-10-05 11:40:59 -070028use devices::{self, PciDevice, VirtioPciDevice};
Zach Reizner39aa26b2017-12-12 18:03:23 -080029use io_jail::{self, Minijail};
Zach Reizner39aa26b2017-12-12 18:03:23 -080030use kvm::*;
Jingkui Wange13b1802018-10-03 13:04:47 -070031use msg_socket::{MsgReceiver, MsgSender, UnlinkMsgSocket};
Jason D. Clinton865323d2017-09-27 22:04:03 -060032use net_util::Tap;
Daniel Verkampf02fdd12018-10-10 17:25:14 -070033use qcow::{self, ImageType, QcowFile};
Zach Reizner39aa26b2017-12-12 18:03:23 -080034use sys_util;
Zach Reizner55a9e502018-10-03 10:22:32 -070035use sys_util::*;
Jason D. Clinton865323d2017-09-27 22:04:03 -060036use vhost;
Jingkui Wange13b1802018-10-03 13:04:47 -070037use vm_control::{VmRequest, VmResponse};
Zach Reizner39aa26b2017-12-12 18:03:23 -080038
39use Config;
40
Dylan Reid059a1882018-07-23 17:58:09 -070041use arch::{self, LinuxArch, RunnableLinuxVm, VirtioDeviceStub, VmComponents};
Sonny Raoed517d12018-02-13 22:09:43 -080042
Sonny Rao2ffa0cb2018-02-26 17:27:40 -080043#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
44use aarch64::AArch64 as Arch;
Zach Reizner55a9e502018-10-03 10:22:32 -070045#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
46use x86_64::X8664arch as Arch;
Zach Reizner39aa26b2017-12-12 18:03:23 -080047
Dylan Reid059a1882018-07-23 17:58:09 -070048#[derive(Debug)]
Zach Reizner39aa26b2017-12-12 18:03:23 -080049pub enum Error {
Dylan Reid295ccac2017-11-06 14:06:24 -080050 BalloonDeviceNew(devices::virtio::BalloonError),
Zach Reizner39aa26b2017-12-12 18:03:23 -080051 BlockDeviceNew(sys_util::Error),
Mark Ryan6ed5aea2018-04-20 13:52:35 +010052 BlockSignal(sys_util::signal::Error),
Dylan Reid059a1882018-07-23 17:58:09 -070053 BuildingVm(Box<error::Error>),
Zach Reizner8fb52112017-12-13 16:04:39 -080054 CloneEventFd(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080055 CreateEventFd(sys_util::Error),
Zach Reizner5bed0d22018-03-28 02:31:11 -070056 CreatePollContext(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080057 CreateSignalFd(sys_util::SignalFdError),
58 CreateSocket(io::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070059 CreateTimerFd(sys_util::Error),
Daniel Verkampf02fdd12018-10-10 17:25:14 -070060 DetectImageType(qcow::Error),
Zach Reizner39aa26b2017-12-12 18:03:23 -080061 DeviceJail(io_jail::Error),
62 DevicePivotRoot(io_jail::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080063 Disk(io::Error),
Stephen Barberc79de2d2018-02-21 14:17:27 -080064 DiskImageLock(sys_util::Error),
Dylan Reid20566442018-04-02 15:06:15 -070065 FailedCLOEXECCheck,
66 FailedToDupFd,
Dylan Reid20566442018-04-02 15:06:15 -070067 InvalidFdPath,
Zach Reizner579bd2c2018-09-14 15:43:33 -070068 InvalidWaylandPath,
Zach Reizner8fb52112017-12-13 16:04:39 -080069 NetDeviceNew(devices::virtio::NetError),
70 NoVarEmpty,
71 OpenKernel(PathBuf, io::Error),
Chirantan Ekboteebd56812018-04-16 19:32:04 -070072 P9DeviceNew(devices::virtio::P9Error),
Zach Reizner5bed0d22018-03-28 02:31:11 -070073 PollContextAdd(sys_util::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070074 PollContextDelete(sys_util::Error),
Dylan Reid88624f82018-01-11 09:20:16 -080075 QcowDeviceCreate(qcow::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070076 ReadLowmemAvailable(io::Error),
77 ReadLowmemMargin(io::Error),
Dylan Reid0f579cb2018-07-09 15:39:34 -070078 RegisterBalloon(arch::DeviceRegistrationError),
79 RegisterBlock(arch::DeviceRegistrationError),
80 RegisterGpu(arch::DeviceRegistrationError),
81 RegisterNet(arch::DeviceRegistrationError),
82 RegisterP9(arch::DeviceRegistrationError),
83 RegisterRng(arch::DeviceRegistrationError),
Mark Ryan6ed5aea2018-04-20 13:52:35 +010084 RegisterSignalHandler(sys_util::Error),
Dylan Reid0f579cb2018-07-09 15:39:34 -070085 RegisterWayland(arch::DeviceRegistrationError),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070086 ResetTimerFd(sys_util::Error),
Zach Reizner39aa26b2017-12-12 18:03:23 -080087 RngDeviceNew(devices::virtio::RngError),
Zach Reizner8fb52112017-12-13 16:04:39 -080088 SettingGidMap(io_jail::Error),
89 SettingUidMap(io_jail::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080090 SignalFd(sys_util::SignalFdError),
91 SpawnVcpu(io::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070092 TimerFd(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080093 VhostNetDeviceNew(devices::virtio::vhost::Error),
94 VhostVsockDeviceNew(devices::virtio::vhost::Error),
Daniel Verkamp56f283b2018-10-05 11:40:59 -070095 VirtioPciDev(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080096 WaylandDeviceNew(sys_util::Error),
Sonny Raoed517d12018-02-13 22:09:43 -080097 LoadKernel(Box<error::Error>),
Zach Reizner39aa26b2017-12-12 18:03:23 -080098}
99
100impl fmt::Display for Error {
101 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
102 match self {
David Tolnay5bbbf612018-12-01 17:49:30 -0800103 Error::BalloonDeviceNew(e) => write!(f, "failed to create balloon: {:?}", e),
104 Error::BlockDeviceNew(e) => write!(f, "failed to create block device: {:?}", e),
105 Error::BlockSignal(e) => write!(f, "failed to block signal: {:?}", e),
106 Error::BuildingVm(e) => write!(f, "The architecture failed to build the vm: {:?}", e),
107 Error::CloneEventFd(e) => write!(f, "failed to clone eventfd: {:?}", e),
108 Error::CreateEventFd(e) => write!(f, "failed to create eventfd: {:?}", e),
109 Error::CreatePollContext(e) => write!(f, "failed to create poll context: {:?}", e),
110 Error::CreateSignalFd(e) => write!(f, "failed to create signalfd: {:?}", e),
111 Error::CreateSocket(e) => write!(f, "failed to create socket: {}", e),
112 Error::CreateTimerFd(e) => write!(f, "failed to create timerfd: {}", e),
113 Error::DetectImageType(e) => write!(f, "failed to detect disk image type: {:?}", e),
114 Error::DeviceJail(e) => write!(f, "failed to jail device: {}", e),
115 Error::DevicePivotRoot(e) => write!(f, "failed to pivot root device: {}", e),
116 Error::Disk(e) => write!(f, "failed to load disk image: {}", e),
117 Error::DiskImageLock(e) => write!(f, "failed to lock disk image: {:?}", e),
118 Error::FailedCLOEXECCheck => {
Dylan Reid20566442018-04-02 15:06:15 -0700119 write!(f, "/proc/self/fd argument failed check for CLOEXEC")
120 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800121 Error::FailedToDupFd => write!(f, "failed to dup fd from /proc/self/fd"),
122 Error::InvalidFdPath => write!(f, "failed parsing a /proc/self/fd/*"),
123 Error::InvalidWaylandPath => {
Zach Reizner579bd2c2018-09-14 15:43:33 -0700124 write!(f, "wayland socket path has no parent or file name")
125 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800126 Error::NetDeviceNew(e) => write!(f, "failed to set up virtio networking: {:?}", e),
127 Error::NoVarEmpty => write!(f, "/var/empty doesn't exist, can't jail devices."),
128 Error::OpenKernel(p, e) => write!(f, "failed to open kernel image {:?}: {}", p, e),
129 Error::P9DeviceNew(e) => write!(f, "failed to create 9p device: {}", e),
130 Error::PollContextAdd(e) => write!(f, "failed to add fd to poll context: {:?}", e),
131 Error::PollContextDelete(e) => {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700132 write!(f, "failed to remove fd from poll context: {:?}", e)
133 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800134 Error::QcowDeviceCreate(e) => write!(f, "failed to read qcow formatted file {:?}", e),
135 Error::ReadLowmemAvailable(e) => write!(
Zach Reizner55a9e502018-10-03 10:22:32 -0700136 f,
137 "failed to read /sys/kernel/mm/chromeos-low_mem/available: {}",
138 e
139 ),
David Tolnay5bbbf612018-12-01 17:49:30 -0800140 Error::ReadLowmemMargin(e) => write!(
Zach Reizner55a9e502018-10-03 10:22:32 -0700141 f,
142 "failed to read /sys/kernel/mm/chromeos-low_mem/margin: {}",
143 e
144 ),
David Tolnay5bbbf612018-12-01 17:49:30 -0800145 Error::RegisterBalloon(e) => write!(f, "error registering balloon device: {:?}", e),
146 Error::RegisterBlock(e) => write!(f, "error registering block device: {:?}", e),
147 Error::RegisterGpu(e) => write!(f, "error registering gpu device: {:?}", e),
148 Error::RegisterNet(e) => write!(f, "error registering net device: {:?}", e),
149 Error::RegisterP9(e) => write!(f, "error registering 9p device: {:?}", e),
150 Error::RegisterRng(e) => write!(f, "error registering rng device: {:?}", e),
151 Error::RegisterSignalHandler(e) => {
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100152 write!(f, "error registering signal handler: {:?}", e)
153 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800154 Error::RegisterWayland(e) => write!(f, "error registering wayland device: {}", e),
155 Error::ResetTimerFd(e) => write!(f, "failed to reset timerfd: {}", e),
156 Error::RngDeviceNew(e) => write!(f, "failed to set up rng: {:?}", e),
157 Error::SettingGidMap(e) => write!(f, "error setting GID map: {}", e),
158 Error::SettingUidMap(e) => write!(f, "error setting UID map: {}", e),
159 Error::SignalFd(e) => write!(f, "failed to read signal fd: {:?}", e),
160 Error::SpawnVcpu(e) => write!(f, "failed to spawn VCPU thread: {:?}", e),
161 Error::TimerFd(e) => write!(f, "failed to read timer fd: {:?}", e),
162 Error::VhostNetDeviceNew(e) => write!(f, "failed to set up vhost networking: {:?}", e),
163 Error::VhostVsockDeviceNew(e) => {
Zach Reizner8fb52112017-12-13 16:04:39 -0800164 write!(f, "failed to set up virtual socket device: {:?}", e)
165 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800166 Error::VirtioPciDev(e) => write!(f, "failed to create virtio pci dev: {}", e),
167 Error::WaylandDeviceNew(e) => write!(f, "failed to create wayland device: {:?}", e),
168 Error::LoadKernel(e) => write!(f, "failed to load kernel: {}", e),
Zach Reizner39aa26b2017-12-12 18:03:23 -0800169 }
170 }
171}
172
Dylan Reid059a1882018-07-23 17:58:09 -0700173impl std::error::Error for Error {
174 fn description(&self) -> &str {
175 "Some device failure"
176 }
177}
178
Zach Reizner39aa26b2017-12-12 18:03:23 -0800179type Result<T> = std::result::Result<T, Error>;
180
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700181// Verifies that |raw_fd| is actually owned by this process and duplicates it to ensure that
182// we have a unique handle to it.
Dylan Reid059a1882018-07-23 17:58:09 -0700183fn validate_raw_fd(raw_fd: RawFd) -> std::result::Result<RawFd, Box<error::Error>> {
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700184 // Checking that close-on-exec isn't set helps filter out FDs that were opened by
185 // crosvm as all crosvm FDs are close on exec.
186 // Safe because this doesn't modify any memory and we check the return value.
187 let flags = unsafe { libc::fcntl(raw_fd, libc::F_GETFD) };
188 if flags < 0 || (flags & libc::FD_CLOEXEC) != 0 {
Dylan Reid059a1882018-07-23 17:58:09 -0700189 return Err(Box::new(Error::FailedCLOEXECCheck));
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700190 }
191
192 // Duplicate the fd to ensure that we don't accidentally close an fd previously
193 // opened by another subsystem. Safe because this doesn't modify any memory and
194 // we check the return value.
195 let dup_fd = unsafe { libc::fcntl(raw_fd, libc::F_DUPFD_CLOEXEC, 0) };
196 if dup_fd < 0 {
Dylan Reid059a1882018-07-23 17:58:09 -0700197 return Err(Box::new(Error::FailedToDupFd));
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700198 }
199 Ok(dup_fd as RawFd)
200}
201
Zach Reizner39aa26b2017-12-12 18:03:23 -0800202fn create_base_minijail(root: &Path, seccomp_policy: &Path) -> Result<Minijail> {
203 // All child jails run in a new user namespace without any users mapped,
204 // they run as nobody unless otherwise configured.
David Tolnay5bbbf612018-12-01 17:49:30 -0800205 let mut j = Minijail::new().map_err(Error::DeviceJail)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800206 j.namespace_pids();
207 j.namespace_user();
208 j.namespace_user_disable_setgroups();
209 // Don't need any capabilities.
210 j.use_caps(0);
211 // Create a new mount namespace with an empty root FS.
212 j.namespace_vfs();
David Tolnay5bbbf612018-12-01 17:49:30 -0800213 j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800214 // Run in an empty network namespace.
215 j.namespace_net();
216 // Apply the block device seccomp policy.
217 j.no_new_privs();
Stephen Barber3b1d8a52018-01-06 17:34:51 -0800218 // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill
219 // the entire device process if a worker thread commits a seccomp violation.
220 j.set_seccomp_filter_tsync();
Zach Reizner043ddc52018-04-03 20:47:21 -0700221 #[cfg(debug_assertions)]
222 j.log_seccomp_filter_failures();
Zach Reizner39aa26b2017-12-12 18:03:23 -0800223 j.parse_seccomp_filters(seccomp_policy)
David Tolnay5bbbf612018-12-01 17:49:30 -0800224 .map_err(Error::DeviceJail)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800225 j.use_seccomp_filter();
226 // Don't do init setup.
227 j.run_as_init();
228 Ok(j)
229}
230
Zach Reizner55a9e502018-10-03 10:22:32 -0700231fn create_virtio_devs(
Daniel Verkampaac28132018-10-15 14:58:48 -0700232 cfg: Config,
Zach Reizner55a9e502018-10-03 10:22:32 -0700233 mem: &GuestMemory,
234 _exit_evt: &EventFd,
235 wayland_device_socket: UnixDatagram,
236 balloon_device_socket: UnixDatagram,
Daniel Verkamp8eceba32018-10-18 16:45:13 -0700237) -> std::result::Result<Vec<(Box<PciDevice + 'static>, Option<Minijail>)>, Box<error::Error>> {
David Tolnay5bbbf612018-12-01 17:49:30 -0800238 static DEFAULT_PIVOT_ROOT: &str = "/var/empty";
Dylan Reidef7352f2018-05-17 18:47:11 -0700239
Dylan Reid059a1882018-07-23 17:58:09 -0700240 let mut devs = Vec::new();
Zach Reizner39aa26b2017-12-12 18:03:23 -0800241
242 // An empty directory for jailed device's pivot root.
243 let empty_root_path = Path::new(DEFAULT_PIVOT_ROOT);
244 if cfg.multiprocess && !empty_root_path.exists() {
Dylan Reid059a1882018-07-23 17:58:09 -0700245 return Err(Box::new(Error::NoVarEmpty));
Zach Reizner39aa26b2017-12-12 18:03:23 -0800246 }
247
Zach Reizner8fb52112017-12-13 16:04:39 -0800248 for disk in &cfg.disks {
Dylan Reid20566442018-04-02 15:06:15 -0700249 // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
250 let mut raw_image: File = if disk.path.parent() == Some(Path::new("/proc/self/fd")) {
251 if !disk.path.is_file() {
Dylan Reid059a1882018-07-23 17:58:09 -0700252 return Err(Box::new(Error::InvalidFdPath));
Dylan Reid20566442018-04-02 15:06:15 -0700253 }
Zach Reizner55a9e502018-10-03 10:22:32 -0700254 let raw_fd = disk
255 .path
256 .file_name()
Dylan Reid20566442018-04-02 15:06:15 -0700257 .and_then(|fd_osstr| fd_osstr.to_str())
258 .and_then(|fd_str| fd_str.parse::<c_int>().ok())
259 .ok_or(Error::InvalidFdPath)?;
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700260 // Safe because we will validate |raw_fd|.
261 unsafe { File::from_raw_fd(validate_raw_fd(raw_fd)?) }
Dylan Reid20566442018-04-02 15:06:15 -0700262 } else {
263 OpenOptions::new()
264 .read(true)
Daniel Verkampde9ae032018-08-09 16:26:59 -0700265 .write(!disk.read_only)
Dylan Reid20566442018-04-02 15:06:15 -0700266 .open(&disk.path)
David Tolnay5bbbf612018-12-01 17:49:30 -0800267 .map_err(Error::Disk)?
Dylan Reid20566442018-04-02 15:06:15 -0700268 };
Stephen Barberc79de2d2018-02-21 14:17:27 -0800269 // Lock the disk image to prevent other crosvm instances from using it.
Daniel Verkampde9ae032018-08-09 16:26:59 -0700270 let lock_op = if disk.read_only {
Stephen Barberc79de2d2018-02-21 14:17:27 -0800271 FlockOperation::LockShared
Daniel Verkampde9ae032018-08-09 16:26:59 -0700272 } else {
273 FlockOperation::LockExclusive
Stephen Barberc79de2d2018-02-21 14:17:27 -0800274 };
275 flock(&raw_image, lock_op, true).map_err(Error::DiskImageLock)?;
276
Daniel Verkampf02fdd12018-10-10 17:25:14 -0700277 let image_type = qcow::detect_image_type(&raw_image).map_err(Error::DetectImageType)?;
278 let block_box: Box<devices::virtio::VirtioDevice> = match image_type {
279 ImageType::Raw => {
Zach Reizner55a9e502018-10-03 10:22:32 -0700280 // Access as a raw block device.
281 Box::new(
282 devices::virtio::Block::new(raw_image, disk.read_only)
David Tolnay5bbbf612018-12-01 17:49:30 -0800283 .map_err(Error::BlockDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700284 )
Dylan Reid88624f82018-01-11 09:20:16 -0800285 }
Daniel Verkampf02fdd12018-10-10 17:25:14 -0700286 ImageType::Qcow2 => {
Zach Reizner55a9e502018-10-03 10:22:32 -0700287 // Valid qcow header present
David Tolnay5bbbf612018-12-01 17:49:30 -0800288 let qcow_image = QcowFile::from(raw_image).map_err(Error::QcowDeviceCreate)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700289 Box::new(
290 devices::virtio::Block::new(qcow_image, disk.read_only)
David Tolnay5bbbf612018-12-01 17:49:30 -0800291 .map_err(Error::BlockDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700292 )
Dylan Reid88624f82018-01-11 09:20:16 -0800293 }
294 };
Zach Reizner39aa26b2017-12-12 18:03:23 -0800295 let jail = if cfg.multiprocess {
296 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("block_device.policy");
297 Some(create_base_minijail(empty_root_path, &policy_path)?)
Zach Reizner55a9e502018-10-03 10:22:32 -0700298 } else {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800299 None
300 };
301
Zach Reizner55a9e502018-10-03 10:22:32 -0700302 devs.push(VirtioDeviceStub {
303 dev: block_box,
304 jail,
305 });
Zach Reizner39aa26b2017-12-12 18:03:23 -0800306 }
307
308 let rng_box = Box::new(devices::virtio::Rng::new().map_err(Error::RngDeviceNew)?);
309 let rng_jail = if cfg.multiprocess {
310 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("rng_device.policy");
311 Some(create_base_minijail(empty_root_path, &policy_path)?)
312 } else {
313 None
314 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700315 devs.push(VirtioDeviceStub {
316 dev: rng_box,
317 jail: rng_jail,
318 });
Zach Reizner39aa26b2017-12-12 18:03:23 -0800319
Zach Reizner55a9e502018-10-03 10:22:32 -0700320 let balloon_box = Box::new(
321 devices::virtio::Balloon::new(balloon_device_socket).map_err(Error::BalloonDeviceNew)?,
322 );
Dylan Reid295ccac2017-11-06 14:06:24 -0800323 let balloon_jail = if cfg.multiprocess {
324 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("balloon_device.policy");
325 Some(create_base_minijail(empty_root_path, &policy_path)?)
326 } else {
327 None
328 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700329 devs.push(VirtioDeviceStub {
330 dev: balloon_box,
331 jail: balloon_jail,
332 });
Dylan Reid295ccac2017-11-06 14:06:24 -0800333
Zach Reizner39aa26b2017-12-12 18:03:23 -0800334 // We checked above that if the IP is defined, then the netmask is, too.
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700335 if let Some(tap_fd) = cfg.tap_fd {
336 // Safe because we ensure that we get a unique handle to the fd.
337 let tap = unsafe { Tap::from_raw_fd(validate_raw_fd(tap_fd)?) };
David Tolnay5bbbf612018-12-01 17:49:30 -0800338 let net_box = Box::new(devices::virtio::Net::from(tap).map_err(Error::NetDeviceNew)?);
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700339
340 let jail = if cfg.multiprocess {
341 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("net_device.policy");
342
343 Some(create_base_minijail(empty_root_path, &policy_path)?)
344 } else {
345 None
346 };
347
Zach Reizner55a9e502018-10-03 10:22:32 -0700348 devs.push(VirtioDeviceStub { dev: net_box, jail });
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700349 } else if let Some(host_ip) = cfg.host_ip {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800350 if let Some(netmask) = cfg.netmask {
Stephen Barber308ff602018-02-13 22:47:07 -0800351 if let Some(mac_address) = cfg.mac_address {
352 let net_box: Box<devices::virtio::VirtioDevice> = if cfg.vhost_net {
Zach Reizner55a9e502018-10-03 10:22:32 -0700353 Box::new(
354 devices::virtio::vhost::Net::<Tap, vhost::Net<Tap>>::new(
355 host_ip,
356 netmask,
357 mac_address,
358 &mem,
David Tolnay2bac1e72018-12-12 14:33:42 -0800359 )
360 .map_err(Error::VhostNetDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700361 )
Zach Reizner39aa26b2017-12-12 18:03:23 -0800362 } else {
Zach Reizner55a9e502018-10-03 10:22:32 -0700363 Box::new(
364 devices::virtio::Net::<Tap>::new(host_ip, netmask, mac_address)
David Tolnay5bbbf612018-12-01 17:49:30 -0800365 .map_err(Error::NetDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700366 )
Zach Reizner39aa26b2017-12-12 18:03:23 -0800367 };
368
Stephen Barber308ff602018-02-13 22:47:07 -0800369 let jail = if cfg.multiprocess {
370 let policy_path: PathBuf = if cfg.vhost_net {
371 cfg.seccomp_policy_dir.join("vhost_net_device.policy")
372 } else {
373 cfg.seccomp_policy_dir.join("net_device.policy")
374 };
Zach Reizner39aa26b2017-12-12 18:03:23 -0800375
Stephen Barber308ff602018-02-13 22:47:07 -0800376 Some(create_base_minijail(empty_root_path, &policy_path)?)
377 } else {
378 None
379 };
380
Zach Reizner55a9e502018-10-03 10:22:32 -0700381 devs.push(VirtioDeviceStub { dev: net_box, jail });
Stephen Barber308ff602018-02-13 22:47:07 -0800382 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800383 }
384 }
385
Zach Reizneraa575662018-08-15 10:46:32 -0700386 #[cfg(feature = "gpu")]
387 let mut resource_bridge_wl_socket: Option<
388 devices::virtio::resource_bridge::ResourceRequestSocket,
389 > = None;
Zach Reizner3a8100a2017-09-13 19:15:43 -0700390 #[cfg(feature = "gpu")]
391 {
392 if cfg.gpu {
David Rileyb22b6132018-08-20 08:11:42 -0700393 if let Some(wayland_socket_path) = cfg.wayland_socket_path.as_ref() {
Zach Reizneraa575662018-08-15 10:46:32 -0700394 let (wl_socket, gpu_socket) =
395 devices::virtio::resource_bridge::pair().map_err(Error::CreateSocket)?;
396 resource_bridge_wl_socket = Some(wl_socket);
397
David Rileyb22b6132018-08-20 08:11:42 -0700398 let jailed_wayland_path = Path::new("/wayland-0");
399
Zach Reizner55a9e502018-10-03 10:22:32 -0700400 let gpu_box = Box::new(devices::virtio::Gpu::new(
401 _exit_evt.try_clone().map_err(Error::CloneEventFd)?,
Zach Reizneraa575662018-08-15 10:46:32 -0700402 Some(gpu_socket),
Zach Reizner55a9e502018-10-03 10:22:32 -0700403 if cfg.multiprocess {
404 &jailed_wayland_path
405 } else {
406 wayland_socket_path.as_path()
407 },
408 ));
David Rileyb22b6132018-08-20 08:11:42 -0700409
410 let jail = if cfg.multiprocess {
411 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("gpu_device.policy");
412 let mut jail = create_base_minijail(empty_root_path, &policy_path)?;
413
414 // Create a tmpfs in the device's root directory so that we can bind mount the
415 // dri directory into it. The size=67108864 is size=64*1024*1024 or size=64MB.
Zach Reizner55a9e502018-10-03 10:22:32 -0700416 jail.mount_with_data(
417 Path::new("none"),
418 Path::new("/"),
419 "tmpfs",
420 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
421 "size=67108864",
David Tolnay2bac1e72018-12-12 14:33:42 -0800422 )
423 .unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700424
425 // Device nodes required for DRM.
426 let sys_dev_char_path = Path::new("/sys/dev/char");
427 jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)
428 .unwrap();
429 let sys_devices_path = Path::new("/sys/devices");
430 jail.mount_bind(sys_devices_path, sys_devices_path, false)
431 .unwrap();
432 let drm_dri_path = Path::new("/dev/dri");
Zach Reizner55a9e502018-10-03 10:22:32 -0700433 jail.mount_bind(drm_dri_path, drm_dri_path, false).unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700434
435 // Libraries that are required when mesa drivers are dynamically loaded.
436 let lib_path = Path::new("/lib64");
Zach Reizner55a9e502018-10-03 10:22:32 -0700437 jail.mount_bind(lib_path, lib_path, false).unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700438 let usr_lib_path = Path::new("/usr/lib64");
Zach Reizner55a9e502018-10-03 10:22:32 -0700439 jail.mount_bind(usr_lib_path, usr_lib_path, false).unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700440
441 // Bind mount the wayland socket into jail's root. This is necessary since each
442 // new wayland context must open() the socket.
443 jail.mount_bind(wayland_socket_path.as_path(), jailed_wayland_path, true)
444 .unwrap();
445
446 // Set the uid/gid for the jailed process, and give a basic id map. This
447 // is required for the above bind mount to work.
448 let crosvm_user_group = CStr::from_bytes_with_nul(b"crosvm\0").unwrap();
449 let crosvm_uid = match get_user_id(&crosvm_user_group) {
450 Ok(u) => u,
451 Err(e) => {
452 warn!("falling back to current user id for gpu: {:?}", e);
453 geteuid()
454 }
455 };
456 let crosvm_gid = match get_group_id(&crosvm_user_group) {
457 Ok(u) => u,
458 Err(e) => {
459 warn!("falling back to current group id for gpu: {:?}", e);
460 getegid()
461 }
462 };
463 jail.change_uid(crosvm_uid);
464 jail.change_gid(crosvm_gid);
465 jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
466 .map_err(Error::SettingUidMap)?;
467 jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
468 .map_err(Error::SettingGidMap)?;
469
470 Some(jail)
471 } else {
472 None
473 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700474 devs.push(VirtioDeviceStub { dev: gpu_box, jail });
David Rileyb22b6132018-08-20 08:11:42 -0700475 }
Zach Reizner3a8100a2017-09-13 19:15:43 -0700476 }
477 }
478
Zach Reizneraa575662018-08-15 10:46:32 -0700479 if let Some(wayland_socket_path) = cfg.wayland_socket_path.as_ref() {
480 let wayland_socket_dir = wayland_socket_path
481 .parent()
482 .ok_or(Error::InvalidWaylandPath)?;
483 let wayland_socket_name = wayland_socket_path
484 .file_name()
485 .ok_or(Error::InvalidWaylandPath)?;
486 let jailed_wayland_dir = Path::new("/wayland");
487 let jailed_wayland_path = jailed_wayland_dir.join(wayland_socket_name);
488
489 #[cfg(not(feature = "gpu"))]
490 let resource_bridge_wl_socket = None;
491
492 let wl_box = Box::new(
493 devices::virtio::Wl::new(
494 if cfg.multiprocess {
495 &jailed_wayland_path
496 } else {
497 wayland_socket_path.as_path()
498 },
499 wayland_device_socket,
500 resource_bridge_wl_socket,
David Tolnay2bac1e72018-12-12 14:33:42 -0800501 )
502 .map_err(Error::WaylandDeviceNew)?,
Zach Reizneraa575662018-08-15 10:46:32 -0700503 );
504
505 let jail = if cfg.multiprocess {
506 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("wl_device.policy");
507 let mut jail = create_base_minijail(empty_root_path, &policy_path)?;
508
509 // Create a tmpfs in the device's root directory so that we can bind mount the wayland
510 // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB.
511 jail.mount_with_data(
512 Path::new("none"),
513 Path::new("/"),
514 "tmpfs",
515 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
516 "size=67108864",
David Tolnay2bac1e72018-12-12 14:33:42 -0800517 )
518 .unwrap();
Zach Reizneraa575662018-08-15 10:46:32 -0700519
520 // Bind mount the wayland socket's directory into jail's root. This is necessary since
521 // each new wayland context must open() the socket. If the wayland socket is ever
522 // destroyed and remade in the same host directory, new connections will be possible
523 // without restarting the wayland device.
524 jail.mount_bind(wayland_socket_dir, jailed_wayland_dir, true)
525 .unwrap();
526
527 // Set the uid/gid for the jailed process, and give a basic id map. This
528 // is required for the above bind mount to work.
529 let crosvm_user_group = CStr::from_bytes_with_nul(b"crosvm\0").unwrap();
530 let crosvm_uid = match get_user_id(&crosvm_user_group) {
531 Ok(u) => u,
532 Err(e) => {
533 warn!("falling back to current user id for Wayland: {:?}", e);
534 geteuid()
535 }
536 };
537 let crosvm_gid = match get_group_id(&crosvm_user_group) {
538 Ok(u) => u,
539 Err(e) => {
540 warn!("falling back to current group id for Wayland: {:?}", e);
541 getegid()
542 }
543 };
544 jail.change_uid(crosvm_uid);
545 jail.change_gid(crosvm_gid);
546 jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
547 .map_err(Error::SettingUidMap)?;
548 jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
549 .map_err(Error::SettingGidMap)?;
550
551 Some(jail)
552 } else {
553 None
554 };
555 devs.push(VirtioDeviceStub { dev: wl_box, jail });
556 }
557
558 if let Some(cid) = cfg.cid {
559 let vsock_box = Box::new(
560 devices::virtio::vhost::Vsock::new(cid, &mem).map_err(Error::VhostVsockDeviceNew)?,
561 );
562
563 let jail = if cfg.multiprocess {
564 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("vhost_vsock_device.policy");
565
566 Some(create_base_minijail(empty_root_path, &policy_path)?)
567 } else {
568 None
569 };
570
571 devs.push(VirtioDeviceStub {
572 dev: vsock_box,
573 jail,
574 });
575 }
576
Chirantan Ekboteebd56812018-04-16 19:32:04 -0700577 let chronos_user_group = CStr::from_bytes_with_nul(b"chronos\0").unwrap();
578 let chronos_uid = match get_user_id(&chronos_user_group) {
579 Ok(u) => u,
580 Err(e) => {
581 warn!("falling back to current user id for 9p: {:?}", e);
582 geteuid()
583 }
584 };
585 let chronos_gid = match get_group_id(&chronos_user_group) {
586 Ok(u) => u,
587 Err(e) => {
588 warn!("falling back to current group id for 9p: {:?}", e);
589 getegid()
590 }
591 };
592
593 for &(ref src, ref tag) in &cfg.shared_dirs {
594 let (jail, root) = if cfg.multiprocess {
595 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("9p_device.policy");
596 let mut jail = create_base_minijail(empty_root_path, &policy_path)?;
597
598 // The shared directory becomes the root of the device's file system.
599 let root = Path::new("/");
600 jail.mount_bind(&src, root, true).unwrap();
601
602 // Set the uid/gid for the jailed process, and give a basic id map. This
603 // is required for the above bind mount to work.
604 jail.change_uid(chronos_uid);
605 jail.change_gid(chronos_gid);
606 jail.uidmap(&format!("{0} {0} 1", chronos_uid))
607 .map_err(Error::SettingUidMap)?;
608 jail.gidmap(&format!("{0} {0} 1", chronos_gid))
609 .map_err(Error::SettingGidMap)?;
610
611 (Some(jail), root)
612 } else {
613 // There's no bind mount so we tell the server to treat the source directory as the
614 // root. The double deref here converts |src| from a &PathBuf into a &Path.
615 (None, &**src)
616 };
617
618 let p9_box = Box::new(devices::virtio::P9::new(root, tag).map_err(Error::P9DeviceNew)?);
619
Zach Reizner55a9e502018-10-03 10:22:32 -0700620 devs.push(VirtioDeviceStub { dev: p9_box, jail });
Chirantan Ekboteebd56812018-04-16 19:32:04 -0700621 }
622
Daniel Verkamp8eceba32018-10-18 16:45:13 -0700623 let mut pci_devices: Vec<(Box<PciDevice + 'static>, Option<Minijail>)> = Vec::new();
Daniel Verkamp56f283b2018-10-05 11:40:59 -0700624 for stub in devs {
625 let pci_dev =
626 Box::new(VirtioPciDevice::new((*mem).clone(), stub.dev).map_err(Error::VirtioPciDev)?);
Daniel Verkamp8eceba32018-10-18 16:45:13 -0700627 pci_devices.push((pci_dev, stub.jail));
Daniel Verkamp56f283b2018-10-05 11:40:59 -0700628 }
629
630 Ok(pci_devices)
Sonny Raobb7da422018-02-13 20:37:48 -0800631}
632
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100633fn setup_vcpu_signal_handler() -> Result<()> {
634 unsafe {
635 extern "C" fn handle_signal() {}
636 // Our signal handler does nothing and is trivially async signal safe.
637 register_signal_handler(SIGRTMIN() + 0, handle_signal)
638 .map_err(Error::RegisterSignalHandler)?;
639 }
640 block_signal(SIGRTMIN() + 0).map_err(Error::BlockSignal)?;
641 Ok(())
642}
643
Zach Reizner55a9e502018-10-03 10:22:32 -0700644fn run_vcpu(
645 vcpu: Vcpu,
646 cpu_id: u32,
647 start_barrier: Arc<Barrier>,
648 io_bus: devices::Bus,
649 mmio_bus: devices::Bus,
650 exit_evt: EventFd,
651 kill_signaled: Arc<AtomicBool>,
652) -> Result<JoinHandle<()>> {
Zach Reizner8fb52112017-12-13 16:04:39 -0800653 thread::Builder::new()
654 .name(format!("crosvm_vcpu{}", cpu_id))
655 .spawn(move || {
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100656 let mut sig_ok = true;
657 match get_blocked_signals() {
658 Ok(mut v) => {
659 v.retain(|&x| x != SIGRTMIN() + 0);
660 if let Err(e) = vcpu.set_signal_mask(&v) {
661 error!(
662 "Failed to set the KVM_SIGNAL_MASK for vcpu {} : {:?}",
663 cpu_id, e
664 );
665 sig_ok = false;
666 }
667 }
668 Err(e) => {
669 error!(
670 "Failed to retrieve signal mask for vcpu {} : {:?}",
671 cpu_id, e
672 );
673 sig_ok = false;
674 }
675 };
Zach Reizner39aa26b2017-12-12 18:03:23 -0800676
Zach Reizner8fb52112017-12-13 16:04:39 -0800677 start_barrier.wait();
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100678
David Tolnay8f3a2322018-11-30 17:11:35 -0800679 if sig_ok {
680 loop {
681 match vcpu.run() {
682 Ok(VcpuExit::IoIn { port, mut size }) => {
683 let mut data = [0; 8];
684 if size > data.len() {
685 error!("unsupported IoIn size of {} bytes", size);
686 size = data.len();
Zach Reizner39aa26b2017-12-12 18:03:23 -0800687 }
David Tolnay8f3a2322018-11-30 17:11:35 -0800688 io_bus.read(port as u64, &mut data[..size]);
689 if let Err(e) = vcpu.set_data(&data[..size]) {
690 error!("failed to set return data for IoIn: {:?}", e);
Zach Reizner39aa26b2017-12-12 18:03:23 -0800691 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800692 }
David Tolnay8f3a2322018-11-30 17:11:35 -0800693 Ok(VcpuExit::IoOut {
694 port,
695 mut size,
696 data,
697 }) => {
698 if size > data.len() {
699 error!("unsupported IoOut size of {} bytes", size);
700 size = data.len();
701 }
702 io_bus.write(port as u64, &data[..size]);
703 }
704 Ok(VcpuExit::MmioRead { address, size }) => {
705 let mut data = [0; 8];
706 mmio_bus.read(address, &mut data[..size]);
707 // Setting data for mmio can not fail.
708 let _ = vcpu.set_data(&data[..size]);
709 }
710 Ok(VcpuExit::MmioWrite {
711 address,
712 size,
713 data,
714 }) => {
715 mmio_bus.write(address, &data[..size]);
716 }
717 Ok(VcpuExit::Hlt) => break,
718 Ok(VcpuExit::Shutdown) => break,
719 Ok(VcpuExit::SystemEvent(_, _)) =>
720 //TODO handle reboot and crash events
721 {
722 kill_signaled.store(true, Ordering::SeqCst)
723 }
724 Ok(r) => warn!("unexpected vcpu exit: {:?}", r),
725 Err(e) => match e.errno() {
726 libc::EAGAIN | libc::EINTR => {}
727 _ => {
728 error!("vcpu hit unknown error: {:?}", e);
729 break;
730 }
731 },
Zach Reizner39aa26b2017-12-12 18:03:23 -0800732 }
David Tolnay8f3a2322018-11-30 17:11:35 -0800733 if kill_signaled.load(Ordering::SeqCst) {
734 break;
735 }
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100736
David Tolnay8f3a2322018-11-30 17:11:35 -0800737 // Try to clear the signal that we use to kick VCPU if it is
738 // pending before attempting to handle pause requests.
739 clear_signal(SIGRTMIN() + 0).expect("failed to clear pending signal");
740 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800741 }
Zach Reizner8fb52112017-12-13 16:04:39 -0800742 exit_evt
Zach Reizner39aa26b2017-12-12 18:03:23 -0800743 .write(1)
744 .expect("failed to signal vcpu exit eventfd");
David Tolnay2bac1e72018-12-12 14:33:42 -0800745 })
746 .map_err(Error::SpawnVcpu)
Zach Reizner39aa26b2017-12-12 18:03:23 -0800747}
748
// Reads the contents of a file and converts them into a u64.
//
// At most the first 32 bytes are looked at, obtained with a single `read` call. Those bytes must
// be valid UTF-8 and, once surrounding whitespace is trimmed, a decimal `u64`. Open and read
// failures are returned unchanged; UTF-8 and number parsing failures are wrapped in an
// `io::Error` of kind `InvalidData`.
fn file_to_u64<P: AsRef<Path>>(path: P) -> io::Result<u64> {
    let mut raw = [0u8; 32];
    let len = File::open(path)?.read(&mut raw)?;

    let text = match str::from_utf8(&raw[..len]) {
        Ok(s) => s.trim(),
        Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidData, e)),
    };

    match text.parse::<u64>() {
        Ok(value) => Ok(value),
        Err(e) => Err(io::Error::new(io::ErrorKind::InvalidData, e)),
    }
}
763
Dylan Reid059a1882018-07-23 17:58:09 -0700764pub fn run_config(cfg: Config) -> Result<()> {
Daniel Verkampaac28132018-10-15 14:58:48 -0700765 if cfg.multiprocess {
Dylan Reid059a1882018-07-23 17:58:09 -0700766 // Printing something to the syslog before entering minijail so that libc's syslogger has a
767 // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
768 // access to those files will not be possible.
769 info!("crosvm entering multiprocess mode");
770 }
771
Dylan Reid059a1882018-07-23 17:58:09 -0700772 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
773 // before any jailed devices have been spawned, so that we can catch any of them that fail very
774 // quickly.
775 let sigchld_fd = SignalFd::new(libc::SIGCHLD).map_err(Error::CreateSignalFd)?;
776
777 let components = VmComponents {
Dylan Reid059a1882018-07-23 17:58:09 -0700778 memory_mb: (cfg.memory.unwrap_or(256) << 20) as u64,
779 vcpu_count: cfg.vcpu_count.unwrap_or(1),
780 kernel_image: File::open(cfg.kernel_path.as_path())
781 .map_err(|e| Error::OpenKernel(cfg.kernel_path.clone(), e))?,
Daniel Verkampaac28132018-10-15 14:58:48 -0700782 extra_kernel_params: cfg.params.clone(),
783 wayland_dmabuf: cfg.wayland_dmabuf,
Dylan Reid059a1882018-07-23 17:58:09 -0700784 };
785
786 let mut control_sockets = Vec::new();
787 if let Some(ref path_string) = cfg.socket_path {
788 let path = Path::new(path_string);
789 let dgram = UnixDatagram::bind(path).map_err(Error::CreateSocket)?;
Jingkui Wange13b1802018-10-03 13:04:47 -0700790 control_sockets.push(UnlinkMsgSocket::<VmResponse, VmRequest>::new(
791 UnlinkUnixDatagram(dgram),
792 ));
Dylan Reid059a1882018-07-23 17:58:09 -0700793 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700794 let (wayland_host_socket, wayland_device_socket) =
795 UnixDatagram::pair().map_err(Error::CreateSocket)?;
Jingkui Wange13b1802018-10-03 13:04:47 -0700796 control_sockets.push(UnlinkMsgSocket::<VmResponse, VmRequest>::new(
797 UnlinkUnixDatagram(wayland_host_socket),
798 ));
Dylan Reid059a1882018-07-23 17:58:09 -0700799 // Balloon gets a special socket so balloon requests can be forwarded from the main process.
Zach Reizner55a9e502018-10-03 10:22:32 -0700800 let (balloon_host_socket, balloon_device_socket) =
801 UnixDatagram::pair().map_err(Error::CreateSocket)?;
Dylan Reid059a1882018-07-23 17:58:09 -0700802
Zach Reizner55a9e502018-10-03 10:22:32 -0700803 let linux = Arch::build_vm(components, |m, e| {
Daniel Verkampaac28132018-10-15 14:58:48 -0700804 create_virtio_devs(cfg, m, e, wayland_device_socket, balloon_device_socket)
David Tolnay2bac1e72018-12-12 14:33:42 -0800805 })
806 .map_err(Error::BuildingVm)?;
Dylan Reid059a1882018-07-23 17:58:09 -0700807 run_control(linux, control_sockets, balloon_host_socket, sigchld_fd)
Dylan Reid0ed91ab2018-05-31 15:42:18 -0700808}
809
Zach Reizner55a9e502018-10-03 10:22:32 -0700810fn run_control(
811 mut linux: RunnableLinuxVm,
Jingkui Wange13b1802018-10-03 13:04:47 -0700812 control_sockets: Vec<UnlinkMsgSocket<VmResponse, VmRequest>>,
Zach Reizner55a9e502018-10-03 10:22:32 -0700813 balloon_host_socket: UnixDatagram,
814 sigchld_fd: SignalFd,
815) -> Result<()> {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700816 // Paths to get the currently available memory and the low memory threshold.
David Tolnay5bbbf612018-12-01 17:49:30 -0800817 const LOWMEM_MARGIN: &str = "/sys/kernel/mm/chromeos-low_mem/margin";
818 const LOWMEM_AVAILABLE: &str = "/sys/kernel/mm/chromeos-low_mem/available";
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700819
820 // The amount of additional memory to claim back from the VM whenever the system is
821 // low on memory.
822 const ONE_GB: u64 = (1 << 30);
823
Dylan Reid0ed91ab2018-05-31 15:42:18 -0700824 let max_balloon_memory = match linux.vm.get_memory().memory_size() {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700825 // If the VM has at least 1.5 GB, the balloon driver can consume all but the last 1 GB.
826 n if n >= (ONE_GB / 2) * 3 => n - ONE_GB,
827 // Otherwise, if the VM has at least 500MB the balloon driver will consume at most
828 // half of it.
829 n if n >= (ONE_GB / 2) => n / 2,
830 // Otherwise, the VM is too small for us to take memory away from it.
831 _ => 0,
832 };
833 let mut current_balloon_memory: u64 = 0;
834 let balloon_memory_increment: u64 = max_balloon_memory / 16;
835
Zach Reizner5bed0d22018-03-28 02:31:11 -0700836 #[derive(PollToken)]
837 enum Token {
838 Exit,
839 Stdin,
840 ChildSignal,
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700841 CheckAvailableMemory,
842 LowMemory,
843 LowmemTimer,
Zach Reizner5bed0d22018-03-28 02:31:11 -0700844 VmControl { index: usize },
845 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800846
847 let stdin_handle = stdin();
848 let stdin_lock = stdin_handle.lock();
849 stdin_lock
850 .set_raw_mode()
851 .expect("failed to set terminal raw mode");
852
Zach Reizner5bed0d22018-03-28 02:31:11 -0700853 let poll_ctx = PollContext::new().map_err(Error::CreatePollContext)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700854 poll_ctx
855 .add(&linux.exit_evt, Token::Exit)
856 .map_err(Error::PollContextAdd)?;
Zach Reizner5bed0d22018-03-28 02:31:11 -0700857 if let Err(e) = poll_ctx.add(&stdin_handle, Token::Stdin) {
858 warn!("failed to add stdin to poll context: {:?}", e);
859 }
Zach Reizner55a9e502018-10-03 10:22:32 -0700860 poll_ctx
861 .add(&sigchld_fd, Token::ChildSignal)
862 .map_err(Error::PollContextAdd)?;
Dylan Reid059a1882018-07-23 17:58:09 -0700863 for (index, socket) in control_sockets.iter().enumerate() {
Zach Reizner55a9e502018-10-03 10:22:32 -0700864 poll_ctx
865 .add(socket.as_ref(), Token::VmControl { index })
866 .map_err(Error::PollContextAdd)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800867 }
868
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700869 // Watch for low memory notifications and take memory back from the VM.
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700870 let low_mem = File::open("/dev/chromeos-low-mem").ok();
871 if let Some(ref low_mem) = low_mem {
Zach Reizner55a9e502018-10-03 10:22:32 -0700872 poll_ctx
873 .add(low_mem, Token::LowMemory)
874 .map_err(Error::PollContextAdd)?;
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700875 } else {
876 warn!("Unable to open low mem indicator, maybe not a chrome os kernel");
877 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700878
879 // Used to rate limit balloon requests.
880 let mut lowmem_timer = TimerFd::new().map_err(Error::CreateTimerFd)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700881 poll_ctx
882 .add(&lowmem_timer, Token::LowmemTimer)
883 .map_err(Error::PollContextAdd)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700884
885 // Used to check whether it's ok to start giving memory back to the VM.
886 let mut freemem_timer = TimerFd::new().map_err(Error::CreateTimerFd)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700887 poll_ctx
888 .add(&freemem_timer, Token::CheckAvailableMemory)
889 .map_err(Error::PollContextAdd)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700890
891 // Used to add jitter to timer values so that we don't have a thundering herd problem when
892 // multiple VMs are running.
893 let mut rng = thread_rng();
894 let lowmem_jitter_ms = Range::new(0, 200);
895 let freemem_jitter_secs = Range::new(0, 12);
896 let interval_jitter_secs = Range::new(0, 6);
897
Daniel Verkamp37c4a782019-01-04 10:44:17 -0800898 let mut vcpu_handles = Vec::with_capacity(linux.vcpus.len());
899 let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpus.len() + 1));
Dylan Reid059a1882018-07-23 17:58:09 -0700900 let kill_signaled = Arc::new(AtomicBool::new(false));
901 setup_vcpu_signal_handler()?;
902 for (cpu_id, vcpu) in linux.vcpus.into_iter().enumerate() {
Zach Reizner55a9e502018-10-03 10:22:32 -0700903 let handle = run_vcpu(
904 vcpu,
905 cpu_id as u32,
906 vcpu_thread_barrier.clone(),
907 linux.io_bus.clone(),
908 linux.mmio_bus.clone(),
909 linux.exit_evt.try_clone().map_err(Error::CloneEventFd)?,
910 kill_signaled.clone(),
911 )?;
Dylan Reid059a1882018-07-23 17:58:09 -0700912 vcpu_handles.push(handle);
913 }
914 vcpu_thread_barrier.wait();
915
Zach Reizner39aa26b2017-12-12 18:03:23 -0800916 'poll: loop {
Zach Reizner5bed0d22018-03-28 02:31:11 -0700917 let events = {
918 match poll_ctx.wait() {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800919 Ok(v) => v,
920 Err(e) => {
921 error!("failed to poll: {:?}", e);
922 break;
923 }
924 }
925 };
Zach Reizner5bed0d22018-03-28 02:31:11 -0700926 for event in events.iter_readable() {
927 match event.token() {
928 Token::Exit => {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800929 info!("vcpu requested shutdown");
930 break 'poll;
931 }
Zach Reizner5bed0d22018-03-28 02:31:11 -0700932 Token::Stdin => {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800933 let mut out = [0u8; 64];
934 match stdin_lock.read_raw(&mut out[..]) {
935 Ok(0) => {
936 // Zero-length read indicates EOF. Remove from pollables.
Zach Reizner5bed0d22018-03-28 02:31:11 -0700937 let _ = poll_ctx.delete(&stdin_handle);
Zach Reizner55a9e502018-10-03 10:22:32 -0700938 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800939 Err(e) => {
940 warn!("error while reading stdin: {:?}", e);
Zach Reizner5bed0d22018-03-28 02:31:11 -0700941 let _ = poll_ctx.delete(&stdin_handle);
Zach Reizner55a9e502018-10-03 10:22:32 -0700942 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800943 Ok(count) => {
Zach Reizner55a9e502018-10-03 10:22:32 -0700944 linux
945 .stdio_serial
Zach Reizner39aa26b2017-12-12 18:03:23 -0800946 .lock()
Zach Reizner39aa26b2017-12-12 18:03:23 -0800947 .queue_input_bytes(&out[..count])
948 .expect("failed to queue bytes into serial port");
Zach Reizner55a9e502018-10-03 10:22:32 -0700949 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800950 }
951 }
Zach Reizner5bed0d22018-03-28 02:31:11 -0700952 Token::ChildSignal => {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800953 // Print all available siginfo structs, then exit the loop.
David Tolnayf5032762018-12-03 10:46:45 -0800954 while let Some(siginfo) = sigchld_fd.read().map_err(Error::SignalFd)? {
955 error!(
956 "child {} died: signo {}, status {}, code {}",
957 siginfo.ssi_pid,
958 siginfo.ssi_signo,
959 siginfo.ssi_status,
960 siginfo.ssi_code
961 );
Zach Reizner39aa26b2017-12-12 18:03:23 -0800962 }
David Tolnayf5032762018-12-03 10:46:45 -0800963 break 'poll;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800964 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700965 Token::CheckAvailableMemory => {
966 // Acknowledge the timer.
967 freemem_timer.wait().map_err(Error::TimerFd)?;
968 if current_balloon_memory == 0 {
969 // Nothing to see here.
970 if let Err(e) = freemem_timer.clear() {
971 warn!("unable to clear available memory check timer: {}", e);
972 }
973 continue;
974 }
975
976 // Otherwise see if we can free up some memory.
977 let margin = file_to_u64(LOWMEM_MARGIN).map_err(Error::ReadLowmemMargin)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700978 let available =
979 file_to_u64(LOWMEM_AVAILABLE).map_err(Error::ReadLowmemAvailable)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700980
981 // `available` and `margin` are specified in MB while `balloon_memory_increment` is in
982 // bytes. So to correctly compare them we need to turn the increment value into MB.
Zach Reizner55a9e502018-10-03 10:22:32 -0700983 if available >= margin + 2 * (balloon_memory_increment >> 20) {
984 current_balloon_memory =
985 if current_balloon_memory >= balloon_memory_increment {
986 current_balloon_memory - balloon_memory_increment
987 } else {
988 0
989 };
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700990 let mut buf = [0u8; mem::size_of::<u64>()];
991 LittleEndian::write_u64(&mut buf, current_balloon_memory);
Dylan Reid059a1882018-07-23 17:58:09 -0700992 if let Err(e) = balloon_host_socket.send(&buf) {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700993 warn!("failed to send memory value to balloon device: {}", e);
994 }
995 }
996 }
997 Token::LowMemory => {
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700998 if let Some(ref low_mem) = low_mem {
999 let old_balloon_memory = current_balloon_memory;
Zach Reizner55a9e502018-10-03 10:22:32 -07001000 current_balloon_memory = min(
1001 current_balloon_memory + balloon_memory_increment,
1002 max_balloon_memory,
1003 );
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001004 if current_balloon_memory != old_balloon_memory {
1005 let mut buf = [0u8; mem::size_of::<u64>()];
1006 LittleEndian::write_u64(&mut buf, current_balloon_memory);
Dylan Reid059a1882018-07-23 17:58:09 -07001007 if let Err(e) = balloon_host_socket.send(&buf) {
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001008 warn!("failed to send memory value to balloon device: {}", e);
1009 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -07001010 }
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001011
1012 // Stop polling the lowmem device until the timer fires.
1013 poll_ctx.delete(low_mem).map_err(Error::PollContextDelete)?;
1014
1015 // Add some jitter to the timer so that if there are multiple VMs running
1016 // they don't all start ballooning at exactly the same time.
1017 let lowmem_dur =
1018 Duration::from_millis(1000 + lowmem_jitter_ms.ind_sample(&mut rng));
Zach Reizner55a9e502018-10-03 10:22:32 -07001019 lowmem_timer
1020 .reset(lowmem_dur, None)
1021 .map_err(Error::ResetTimerFd)?;
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001022
1023 // Also start a timer to check when we can start giving memory back. Do the
1024 // first check after a minute (with jitter) and subsequent checks after
1025 // every 30 seconds (with jitter).
1026 let freemem_dur =
1027 Duration::from_secs(60 + freemem_jitter_secs.ind_sample(&mut rng));
1028 let freemem_int =
1029 Duration::from_secs(30 + interval_jitter_secs.ind_sample(&mut rng));
1030 freemem_timer
1031 .reset(freemem_dur, Some(freemem_int))
1032 .map_err(Error::ResetTimerFd)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -07001033 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -07001034 }
1035 Token::LowmemTimer => {
1036 // Acknowledge the timer.
1037 lowmem_timer.wait().map_err(Error::TimerFd)?;
1038
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001039 if let Some(ref low_mem) = low_mem {
1040 // Start polling the lowmem device again.
Zach Reizner55a9e502018-10-03 10:22:32 -07001041 poll_ctx
1042 .add(low_mem, Token::LowMemory)
1043 .map_err(Error::PollContextAdd)?;
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001044 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -07001045 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001046 Token::VmControl { index } => {
Daniel Verkamp37c4a782019-01-04 10:44:17 -08001047 if let Some(socket) = control_sockets.get(index) {
Jingkui Wange13b1802018-10-03 13:04:47 -07001048 match socket.recv() {
Zach Reizner5bed0d22018-03-28 02:31:11 -07001049 Ok(request) => {
1050 let mut running = true;
Zach Reizner55a9e502018-10-03 10:22:32 -07001051 let response = request.execute(
1052 &mut linux.vm,
1053 &mut linux.resources,
1054 &mut running,
1055 &balloon_host_socket,
1056 );
Jingkui Wange13b1802018-10-03 13:04:47 -07001057 if let Err(e) = socket.send(&response) {
Zach Reizner5bed0d22018-03-28 02:31:11 -07001058 error!("failed to send VmResponse: {:?}", e);
1059 }
1060 if !running {
1061 info!("control socket requested exit");
1062 break 'poll;
1063 }
Zach Reizner39aa26b2017-12-12 18:03:23 -08001064 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001065 Err(e) => error!("failed to recv VmRequest: {:?}", e),
Zach Reizner39aa26b2017-12-12 18:03:23 -08001066 }
Zach Reizner39aa26b2017-12-12 18:03:23 -08001067 }
1068 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001069 }
1070 }
1071 for event in events.iter_hungup() {
1072 // It's possible more data is readable and buffered while the socket is hungup, so
1073 // don't delete the socket from the poll context until we're sure all the data is
1074 // read.
1075 if !event.readable() {
1076 match event.token() {
Zach Reizner55a9e502018-10-03 10:22:32 -07001077 Token::Exit => {}
Zach Reizner5bed0d22018-03-28 02:31:11 -07001078 Token::Stdin => {
1079 let _ = poll_ctx.delete(&stdin_handle);
Zach Reizner55a9e502018-10-03 10:22:32 -07001080 }
1081 Token::ChildSignal => {}
1082 Token::CheckAvailableMemory => {}
1083 Token::LowMemory => {}
1084 Token::LowmemTimer => {}
Zach Reizner5bed0d22018-03-28 02:31:11 -07001085 Token::VmControl { index } => {
Daniel Verkamp37c4a782019-01-04 10:44:17 -08001086 if let Some(socket) = control_sockets.get(index) {
Zach Reizner5bed0d22018-03-28 02:31:11 -07001087 let _ = poll_ctx.delete(socket.as_ref());
1088 }
Zach Reizner55a9e502018-10-03 10:22:32 -07001089 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001090 }
Zach Reizner39aa26b2017-12-12 18:03:23 -08001091 }
1092 }
1093 }
1094
1095 // vcpu threads MUST see the kill signaled flag, otherwise they may
1096 // re-enter the VM.
Dylan Reid059a1882018-07-23 17:58:09 -07001097 kill_signaled.store(true, Ordering::SeqCst);
1098 for handle in vcpu_handles {
Dmitry Torokhovcd405332018-02-16 16:25:54 -08001099 match handle.kill(SIGRTMIN() + 0) {
Zach Reizner39aa26b2017-12-12 18:03:23 -08001100 Ok(_) => {
1101 if let Err(e) = handle.join() {
1102 error!("failed to join vcpu thread: {:?}", e);
1103 }
1104 }
1105 Err(e) => error!("failed to kill vcpu thread: {:?}", e),
1106 }
1107 }
1108
1109 stdin_lock
1110 .set_canon_mode()
1111 .expect("failed to restore canonical mode for terminal");
1112
1113 Ok(())
1114}