blob: 5f607fbfa1bd070f71f9571dc969b04cebe022d7 [file] [log] [blame]
Zach Reizner39aa26b2017-12-12 18:03:23 -08001// Copyright 2017 The Chromium OS Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5use std;
Chirantan Ekbote448516e2018-07-24 16:07:42 -07006use std::cmp::min;
Zach Reizner55a9e502018-10-03 10:22:32 -07007use std::error;
Dylan Reid059a1882018-07-23 17:58:09 -07008use std::ffi::CStr;
Zach Reizner39aa26b2017-12-12 18:03:23 -08009use std::fmt;
Dylan Reid059a1882018-07-23 17:58:09 -070010use std::fs::{File, OpenOptions};
Zach Reizner55a9e502018-10-03 10:22:32 -070011use std::io::{self, stdin, Read};
Chirantan Ekbote448516e2018-07-24 16:07:42 -070012use std::mem;
Chirantan Ekbote2d292332018-11-16 11:35:24 -080013use std::os::unix::io::FromRawFd;
Zach Reizner39aa26b2017-12-12 18:03:23 -080014use std::os::unix::net::UnixDatagram;
15use std::path::{Path, PathBuf};
Chirantan Ekbote448516e2018-07-24 16:07:42 -070016use std::str;
Zach Reizner39aa26b2017-12-12 18:03:23 -080017use std::sync::atomic::{AtomicBool, Ordering};
Dylan Reid059a1882018-07-23 17:58:09 -070018use std::sync::{Arc, Barrier};
Zach Reizner39aa26b2017-12-12 18:03:23 -080019use std::thread;
20use std::thread::JoinHandle;
Zach Reizner55a9e502018-10-03 10:22:32 -070021use std::time::Duration;
Zach Reizner39aa26b2017-12-12 18:03:23 -080022
Chirantan Ekbote448516e2018-07-24 16:07:42 -070023use libc::{self, c_int};
Chirantan Ekbote448516e2018-07-24 16:07:42 -070024use rand::distributions::{IndependentSample, Range};
Zach Reizner55a9e502018-10-03 10:22:32 -070025use rand::thread_rng;
Zach Reizner39aa26b2017-12-12 18:03:23 -080026
Chirantan Ekbote448516e2018-07-24 16:07:42 -070027use byteorder::{ByteOrder, LittleEndian};
Daniel Verkamp56f283b2018-10-05 11:40:59 -070028use devices::{self, PciDevice, VirtioPciDevice};
Zach Reizner39aa26b2017-12-12 18:03:23 -080029use io_jail::{self, Minijail};
Zach Reizner39aa26b2017-12-12 18:03:23 -080030use kvm::*;
Jingkui Wange13b1802018-10-03 13:04:47 -070031use msg_socket::{MsgReceiver, MsgSender, UnlinkMsgSocket};
Jason D. Clinton865323d2017-09-27 22:04:03 -060032use net_util::Tap;
Daniel Verkampf02fdd12018-10-10 17:25:14 -070033use qcow::{self, ImageType, QcowFile};
Zach Reizner39aa26b2017-12-12 18:03:23 -080034use sys_util;
Zach Reizner55a9e502018-10-03 10:22:32 -070035use sys_util::*;
Jason D. Clinton865323d2017-09-27 22:04:03 -060036use vhost;
Jingkui Wange13b1802018-10-03 13:04:47 -070037use vm_control::{VmRequest, VmResponse};
Zach Reizner39aa26b2017-12-12 18:03:23 -080038
39use Config;
40
Dylan Reid059a1882018-07-23 17:58:09 -070041use arch::{self, LinuxArch, RunnableLinuxVm, VirtioDeviceStub, VmComponents};
Sonny Raoed517d12018-02-13 22:09:43 -080042
Sonny Rao2ffa0cb2018-02-26 17:27:40 -080043#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
44use aarch64::AArch64 as Arch;
Zach Reizner55a9e502018-10-03 10:22:32 -070045#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
46use x86_64::X8664arch as Arch;
Zach Reizner39aa26b2017-12-12 18:03:23 -080047
Dylan Reid059a1882018-07-23 17:58:09 -070048#[derive(Debug)]
Zach Reizner39aa26b2017-12-12 18:03:23 -080049pub enum Error {
Dylan Reid295ccac2017-11-06 14:06:24 -080050 BalloonDeviceNew(devices::virtio::BalloonError),
Zach Reizner39aa26b2017-12-12 18:03:23 -080051 BlockDeviceNew(sys_util::Error),
Mark Ryan6ed5aea2018-04-20 13:52:35 +010052 BlockSignal(sys_util::signal::Error),
Dylan Reid059a1882018-07-23 17:58:09 -070053 BuildingVm(Box<error::Error>),
Zach Reizner8fb52112017-12-13 16:04:39 -080054 CloneEventFd(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080055 CreateEventFd(sys_util::Error),
Zach Reizner5bed0d22018-03-28 02:31:11 -070056 CreatePollContext(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080057 CreateSignalFd(sys_util::SignalFdError),
58 CreateSocket(io::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070059 CreateTimerFd(sys_util::Error),
Daniel Verkampf02fdd12018-10-10 17:25:14 -070060 DetectImageType(qcow::Error),
Zach Reizner39aa26b2017-12-12 18:03:23 -080061 DeviceJail(io_jail::Error),
62 DevicePivotRoot(io_jail::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080063 Disk(io::Error),
Stephen Barberc79de2d2018-02-21 14:17:27 -080064 DiskImageLock(sys_util::Error),
Dylan Reid20566442018-04-02 15:06:15 -070065 InvalidFdPath,
Zach Reizner579bd2c2018-09-14 15:43:33 -070066 InvalidWaylandPath,
Zach Reizner8fb52112017-12-13 16:04:39 -080067 NetDeviceNew(devices::virtio::NetError),
68 NoVarEmpty,
69 OpenKernel(PathBuf, io::Error),
Chirantan Ekboteebd56812018-04-16 19:32:04 -070070 P9DeviceNew(devices::virtio::P9Error),
Zach Reizner5bed0d22018-03-28 02:31:11 -070071 PollContextAdd(sys_util::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070072 PollContextDelete(sys_util::Error),
Dylan Reid88624f82018-01-11 09:20:16 -080073 QcowDeviceCreate(qcow::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070074 ReadLowmemAvailable(io::Error),
75 ReadLowmemMargin(io::Error),
Dylan Reid0f579cb2018-07-09 15:39:34 -070076 RegisterBalloon(arch::DeviceRegistrationError),
77 RegisterBlock(arch::DeviceRegistrationError),
78 RegisterGpu(arch::DeviceRegistrationError),
79 RegisterNet(arch::DeviceRegistrationError),
80 RegisterP9(arch::DeviceRegistrationError),
81 RegisterRng(arch::DeviceRegistrationError),
Mark Ryan6ed5aea2018-04-20 13:52:35 +010082 RegisterSignalHandler(sys_util::Error),
Dylan Reid0f579cb2018-07-09 15:39:34 -070083 RegisterWayland(arch::DeviceRegistrationError),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070084 ResetTimerFd(sys_util::Error),
Zach Reizner39aa26b2017-12-12 18:03:23 -080085 RngDeviceNew(devices::virtio::RngError),
Zach Reizner8fb52112017-12-13 16:04:39 -080086 SettingGidMap(io_jail::Error),
87 SettingUidMap(io_jail::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080088 SignalFd(sys_util::SignalFdError),
89 SpawnVcpu(io::Error),
Chirantan Ekbote448516e2018-07-24 16:07:42 -070090 TimerFd(sys_util::Error),
Chirantan Ekbote2d292332018-11-16 11:35:24 -080091 ValidateRawFd(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080092 VhostNetDeviceNew(devices::virtio::vhost::Error),
93 VhostVsockDeviceNew(devices::virtio::vhost::Error),
Daniel Verkamp56f283b2018-10-05 11:40:59 -070094 VirtioPciDev(sys_util::Error),
Zach Reizner8fb52112017-12-13 16:04:39 -080095 WaylandDeviceNew(sys_util::Error),
Sonny Raoed517d12018-02-13 22:09:43 -080096 LoadKernel(Box<error::Error>),
Zach Reizner39aa26b2017-12-12 18:03:23 -080097}
98
99impl fmt::Display for Error {
100 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
101 match self {
David Tolnay5bbbf612018-12-01 17:49:30 -0800102 Error::BalloonDeviceNew(e) => write!(f, "failed to create balloon: {:?}", e),
103 Error::BlockDeviceNew(e) => write!(f, "failed to create block device: {:?}", e),
104 Error::BlockSignal(e) => write!(f, "failed to block signal: {:?}", e),
105 Error::BuildingVm(e) => write!(f, "The architecture failed to build the vm: {:?}", e),
106 Error::CloneEventFd(e) => write!(f, "failed to clone eventfd: {:?}", e),
107 Error::CreateEventFd(e) => write!(f, "failed to create eventfd: {:?}", e),
108 Error::CreatePollContext(e) => write!(f, "failed to create poll context: {:?}", e),
109 Error::CreateSignalFd(e) => write!(f, "failed to create signalfd: {:?}", e),
110 Error::CreateSocket(e) => write!(f, "failed to create socket: {}", e),
111 Error::CreateTimerFd(e) => write!(f, "failed to create timerfd: {}", e),
112 Error::DetectImageType(e) => write!(f, "failed to detect disk image type: {:?}", e),
113 Error::DeviceJail(e) => write!(f, "failed to jail device: {}", e),
114 Error::DevicePivotRoot(e) => write!(f, "failed to pivot root device: {}", e),
115 Error::Disk(e) => write!(f, "failed to load disk image: {}", e),
116 Error::DiskImageLock(e) => write!(f, "failed to lock disk image: {:?}", e),
David Tolnay5bbbf612018-12-01 17:49:30 -0800117 Error::InvalidFdPath => write!(f, "failed parsing a /proc/self/fd/*"),
118 Error::InvalidWaylandPath => {
Zach Reizner579bd2c2018-09-14 15:43:33 -0700119 write!(f, "wayland socket path has no parent or file name")
120 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800121 Error::NetDeviceNew(e) => write!(f, "failed to set up virtio networking: {:?}", e),
122 Error::NoVarEmpty => write!(f, "/var/empty doesn't exist, can't jail devices."),
123 Error::OpenKernel(p, e) => write!(f, "failed to open kernel image {:?}: {}", p, e),
124 Error::P9DeviceNew(e) => write!(f, "failed to create 9p device: {}", e),
125 Error::PollContextAdd(e) => write!(f, "failed to add fd to poll context: {:?}", e),
126 Error::PollContextDelete(e) => {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700127 write!(f, "failed to remove fd from poll context: {:?}", e)
128 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800129 Error::QcowDeviceCreate(e) => write!(f, "failed to read qcow formatted file {:?}", e),
130 Error::ReadLowmemAvailable(e) => write!(
Zach Reizner55a9e502018-10-03 10:22:32 -0700131 f,
132 "failed to read /sys/kernel/mm/chromeos-low_mem/available: {}",
133 e
134 ),
David Tolnay5bbbf612018-12-01 17:49:30 -0800135 Error::ReadLowmemMargin(e) => write!(
Zach Reizner55a9e502018-10-03 10:22:32 -0700136 f,
137 "failed to read /sys/kernel/mm/chromeos-low_mem/margin: {}",
138 e
139 ),
David Tolnay5bbbf612018-12-01 17:49:30 -0800140 Error::RegisterBalloon(e) => write!(f, "error registering balloon device: {:?}", e),
141 Error::RegisterBlock(e) => write!(f, "error registering block device: {:?}", e),
142 Error::RegisterGpu(e) => write!(f, "error registering gpu device: {:?}", e),
143 Error::RegisterNet(e) => write!(f, "error registering net device: {:?}", e),
144 Error::RegisterP9(e) => write!(f, "error registering 9p device: {:?}", e),
145 Error::RegisterRng(e) => write!(f, "error registering rng device: {:?}", e),
146 Error::RegisterSignalHandler(e) => {
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100147 write!(f, "error registering signal handler: {:?}", e)
148 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800149 Error::RegisterWayland(e) => write!(f, "error registering wayland device: {}", e),
150 Error::ResetTimerFd(e) => write!(f, "failed to reset timerfd: {}", e),
151 Error::RngDeviceNew(e) => write!(f, "failed to set up rng: {:?}", e),
152 Error::SettingGidMap(e) => write!(f, "error setting GID map: {}", e),
153 Error::SettingUidMap(e) => write!(f, "error setting UID map: {}", e),
154 Error::SignalFd(e) => write!(f, "failed to read signal fd: {:?}", e),
155 Error::SpawnVcpu(e) => write!(f, "failed to spawn VCPU thread: {:?}", e),
156 Error::TimerFd(e) => write!(f, "failed to read timer fd: {:?}", e),
Chirantan Ekbote2d292332018-11-16 11:35:24 -0800157 Error::ValidateRawFd(e) => write!(f, "failed to validate raw fd: {:?}", e),
David Tolnay5bbbf612018-12-01 17:49:30 -0800158 Error::VhostNetDeviceNew(e) => write!(f, "failed to set up vhost networking: {:?}", e),
159 Error::VhostVsockDeviceNew(e) => {
Zach Reizner8fb52112017-12-13 16:04:39 -0800160 write!(f, "failed to set up virtual socket device: {:?}", e)
161 }
David Tolnay5bbbf612018-12-01 17:49:30 -0800162 Error::VirtioPciDev(e) => write!(f, "failed to create virtio pci dev: {}", e),
163 Error::WaylandDeviceNew(e) => write!(f, "failed to create wayland device: {:?}", e),
164 Error::LoadKernel(e) => write!(f, "failed to load kernel: {}", e),
Zach Reizner39aa26b2017-12-12 18:03:23 -0800165 }
166 }
167}
168
Dylan Reid059a1882018-07-23 17:58:09 -0700169impl std::error::Error for Error {
170 fn description(&self) -> &str {
171 "Some device failure"
172 }
173}
174
/// Shorthand for a `std::result::Result` whose error type is this module's `Error`.
type Result<T> = std::result::Result<T, Error>;
176
Zach Reizner39aa26b2017-12-12 18:03:23 -0800177fn create_base_minijail(root: &Path, seccomp_policy: &Path) -> Result<Minijail> {
178 // All child jails run in a new user namespace without any users mapped,
179 // they run as nobody unless otherwise configured.
David Tolnay5bbbf612018-12-01 17:49:30 -0800180 let mut j = Minijail::new().map_err(Error::DeviceJail)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800181 j.namespace_pids();
182 j.namespace_user();
183 j.namespace_user_disable_setgroups();
184 // Don't need any capabilities.
185 j.use_caps(0);
186 // Create a new mount namespace with an empty root FS.
187 j.namespace_vfs();
David Tolnay5bbbf612018-12-01 17:49:30 -0800188 j.enter_pivot_root(root).map_err(Error::DevicePivotRoot)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800189 // Run in an empty network namespace.
190 j.namespace_net();
191 // Apply the block device seccomp policy.
192 j.no_new_privs();
Stephen Barber3b1d8a52018-01-06 17:34:51 -0800193 // Use TSYNC only for the side effect of it using SECCOMP_RET_TRAP, which will correctly kill
194 // the entire device process if a worker thread commits a seccomp violation.
195 j.set_seccomp_filter_tsync();
Zach Reizner043ddc52018-04-03 20:47:21 -0700196 #[cfg(debug_assertions)]
197 j.log_seccomp_filter_failures();
Zach Reizner39aa26b2017-12-12 18:03:23 -0800198 j.parse_seccomp_filters(seccomp_policy)
David Tolnay5bbbf612018-12-01 17:49:30 -0800199 .map_err(Error::DeviceJail)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800200 j.use_seccomp_filter();
201 // Don't do init setup.
202 j.run_as_init();
203 Ok(j)
204}
205
Zach Reizner55a9e502018-10-03 10:22:32 -0700206fn create_virtio_devs(
Daniel Verkampaac28132018-10-15 14:58:48 -0700207 cfg: Config,
Zach Reizner55a9e502018-10-03 10:22:32 -0700208 mem: &GuestMemory,
209 _exit_evt: &EventFd,
210 wayland_device_socket: UnixDatagram,
211 balloon_device_socket: UnixDatagram,
Daniel Verkamp8eceba32018-10-18 16:45:13 -0700212) -> std::result::Result<Vec<(Box<PciDevice + 'static>, Option<Minijail>)>, Box<error::Error>> {
David Tolnay5bbbf612018-12-01 17:49:30 -0800213 static DEFAULT_PIVOT_ROOT: &str = "/var/empty";
Dylan Reidef7352f2018-05-17 18:47:11 -0700214
Dylan Reid059a1882018-07-23 17:58:09 -0700215 let mut devs = Vec::new();
Zach Reizner39aa26b2017-12-12 18:03:23 -0800216
217 // An empty directory for jailed device's pivot root.
218 let empty_root_path = Path::new(DEFAULT_PIVOT_ROOT);
219 if cfg.multiprocess && !empty_root_path.exists() {
Dylan Reid059a1882018-07-23 17:58:09 -0700220 return Err(Box::new(Error::NoVarEmpty));
Zach Reizner39aa26b2017-12-12 18:03:23 -0800221 }
222
Zach Reizner8fb52112017-12-13 16:04:39 -0800223 for disk in &cfg.disks {
Dylan Reid20566442018-04-02 15:06:15 -0700224 // Special case '/proc/self/fd/*' paths. The FD is already open, just use it.
225 let mut raw_image: File = if disk.path.parent() == Some(Path::new("/proc/self/fd")) {
226 if !disk.path.is_file() {
Dylan Reid059a1882018-07-23 17:58:09 -0700227 return Err(Box::new(Error::InvalidFdPath));
Dylan Reid20566442018-04-02 15:06:15 -0700228 }
Zach Reizner55a9e502018-10-03 10:22:32 -0700229 let raw_fd = disk
230 .path
231 .file_name()
Dylan Reid20566442018-04-02 15:06:15 -0700232 .and_then(|fd_osstr| fd_osstr.to_str())
233 .and_then(|fd_str| fd_str.parse::<c_int>().ok())
234 .ok_or(Error::InvalidFdPath)?;
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700235 // Safe because we will validate |raw_fd|.
Chirantan Ekbote2d292332018-11-16 11:35:24 -0800236 unsafe { File::from_raw_fd(validate_raw_fd(raw_fd).map_err(Error::ValidateRawFd)?) }
Dylan Reid20566442018-04-02 15:06:15 -0700237 } else {
238 OpenOptions::new()
239 .read(true)
Daniel Verkampde9ae032018-08-09 16:26:59 -0700240 .write(!disk.read_only)
Dylan Reid20566442018-04-02 15:06:15 -0700241 .open(&disk.path)
David Tolnay5bbbf612018-12-01 17:49:30 -0800242 .map_err(Error::Disk)?
Dylan Reid20566442018-04-02 15:06:15 -0700243 };
Stephen Barberc79de2d2018-02-21 14:17:27 -0800244 // Lock the disk image to prevent other crosvm instances from using it.
Daniel Verkampde9ae032018-08-09 16:26:59 -0700245 let lock_op = if disk.read_only {
Stephen Barberc79de2d2018-02-21 14:17:27 -0800246 FlockOperation::LockShared
Daniel Verkampde9ae032018-08-09 16:26:59 -0700247 } else {
248 FlockOperation::LockExclusive
Stephen Barberc79de2d2018-02-21 14:17:27 -0800249 };
250 flock(&raw_image, lock_op, true).map_err(Error::DiskImageLock)?;
251
Daniel Verkampf02fdd12018-10-10 17:25:14 -0700252 let image_type = qcow::detect_image_type(&raw_image).map_err(Error::DetectImageType)?;
253 let block_box: Box<devices::virtio::VirtioDevice> = match image_type {
254 ImageType::Raw => {
Zach Reizner55a9e502018-10-03 10:22:32 -0700255 // Access as a raw block device.
256 Box::new(
257 devices::virtio::Block::new(raw_image, disk.read_only)
David Tolnay5bbbf612018-12-01 17:49:30 -0800258 .map_err(Error::BlockDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700259 )
Dylan Reid88624f82018-01-11 09:20:16 -0800260 }
Daniel Verkampf02fdd12018-10-10 17:25:14 -0700261 ImageType::Qcow2 => {
Zach Reizner55a9e502018-10-03 10:22:32 -0700262 // Valid qcow header present
David Tolnay5bbbf612018-12-01 17:49:30 -0800263 let qcow_image = QcowFile::from(raw_image).map_err(Error::QcowDeviceCreate)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700264 Box::new(
265 devices::virtio::Block::new(qcow_image, disk.read_only)
David Tolnay5bbbf612018-12-01 17:49:30 -0800266 .map_err(Error::BlockDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700267 )
Dylan Reid88624f82018-01-11 09:20:16 -0800268 }
269 };
Zach Reizner39aa26b2017-12-12 18:03:23 -0800270 let jail = if cfg.multiprocess {
271 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("block_device.policy");
272 Some(create_base_minijail(empty_root_path, &policy_path)?)
Zach Reizner55a9e502018-10-03 10:22:32 -0700273 } else {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800274 None
275 };
276
Zach Reizner55a9e502018-10-03 10:22:32 -0700277 devs.push(VirtioDeviceStub {
278 dev: block_box,
279 jail,
280 });
Zach Reizner39aa26b2017-12-12 18:03:23 -0800281 }
282
283 let rng_box = Box::new(devices::virtio::Rng::new().map_err(Error::RngDeviceNew)?);
284 let rng_jail = if cfg.multiprocess {
285 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("rng_device.policy");
286 Some(create_base_minijail(empty_root_path, &policy_path)?)
287 } else {
288 None
289 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700290 devs.push(VirtioDeviceStub {
291 dev: rng_box,
292 jail: rng_jail,
293 });
Zach Reizner39aa26b2017-12-12 18:03:23 -0800294
Zach Reizner55a9e502018-10-03 10:22:32 -0700295 let balloon_box = Box::new(
296 devices::virtio::Balloon::new(balloon_device_socket).map_err(Error::BalloonDeviceNew)?,
297 );
Dylan Reid295ccac2017-11-06 14:06:24 -0800298 let balloon_jail = if cfg.multiprocess {
299 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("balloon_device.policy");
300 Some(create_base_minijail(empty_root_path, &policy_path)?)
301 } else {
302 None
303 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700304 devs.push(VirtioDeviceStub {
305 dev: balloon_box,
306 jail: balloon_jail,
307 });
Dylan Reid295ccac2017-11-06 14:06:24 -0800308
Zach Reizner39aa26b2017-12-12 18:03:23 -0800309 // We checked above that if the IP is defined, then the netmask is, too.
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700310 if let Some(tap_fd) = cfg.tap_fd {
311 // Safe because we ensure that we get a unique handle to the fd.
312 let tap = unsafe { Tap::from_raw_fd(validate_raw_fd(tap_fd)?) };
David Tolnay5bbbf612018-12-01 17:49:30 -0800313 let net_box = Box::new(devices::virtio::Net::from(tap).map_err(Error::NetDeviceNew)?);
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700314
315 let jail = if cfg.multiprocess {
316 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("net_device.policy");
317
318 Some(create_base_minijail(empty_root_path, &policy_path)?)
319 } else {
320 None
321 };
322
Zach Reizner55a9e502018-10-03 10:22:32 -0700323 devs.push(VirtioDeviceStub { dev: net_box, jail });
Chirantan Ekbote5f787212018-05-31 15:31:31 -0700324 } else if let Some(host_ip) = cfg.host_ip {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800325 if let Some(netmask) = cfg.netmask {
Stephen Barber308ff602018-02-13 22:47:07 -0800326 if let Some(mac_address) = cfg.mac_address {
327 let net_box: Box<devices::virtio::VirtioDevice> = if cfg.vhost_net {
Zach Reizner55a9e502018-10-03 10:22:32 -0700328 Box::new(
329 devices::virtio::vhost::Net::<Tap, vhost::Net<Tap>>::new(
330 host_ip,
331 netmask,
332 mac_address,
333 &mem,
David Tolnay2bac1e72018-12-12 14:33:42 -0800334 )
335 .map_err(Error::VhostNetDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700336 )
Zach Reizner39aa26b2017-12-12 18:03:23 -0800337 } else {
Zach Reizner55a9e502018-10-03 10:22:32 -0700338 Box::new(
339 devices::virtio::Net::<Tap>::new(host_ip, netmask, mac_address)
David Tolnay5bbbf612018-12-01 17:49:30 -0800340 .map_err(Error::NetDeviceNew)?,
Zach Reizner55a9e502018-10-03 10:22:32 -0700341 )
Zach Reizner39aa26b2017-12-12 18:03:23 -0800342 };
343
Stephen Barber308ff602018-02-13 22:47:07 -0800344 let jail = if cfg.multiprocess {
345 let policy_path: PathBuf = if cfg.vhost_net {
346 cfg.seccomp_policy_dir.join("vhost_net_device.policy")
347 } else {
348 cfg.seccomp_policy_dir.join("net_device.policy")
349 };
Zach Reizner39aa26b2017-12-12 18:03:23 -0800350
Stephen Barber308ff602018-02-13 22:47:07 -0800351 Some(create_base_minijail(empty_root_path, &policy_path)?)
352 } else {
353 None
354 };
355
Zach Reizner55a9e502018-10-03 10:22:32 -0700356 devs.push(VirtioDeviceStub { dev: net_box, jail });
Stephen Barber308ff602018-02-13 22:47:07 -0800357 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800358 }
359 }
360
Zach Reizneraa575662018-08-15 10:46:32 -0700361 #[cfg(feature = "gpu")]
362 let mut resource_bridge_wl_socket: Option<
363 devices::virtio::resource_bridge::ResourceRequestSocket,
364 > = None;
Zach Reizner3a8100a2017-09-13 19:15:43 -0700365 #[cfg(feature = "gpu")]
366 {
367 if cfg.gpu {
David Rileyb22b6132018-08-20 08:11:42 -0700368 if let Some(wayland_socket_path) = cfg.wayland_socket_path.as_ref() {
Zach Reizneraa575662018-08-15 10:46:32 -0700369 let (wl_socket, gpu_socket) =
370 devices::virtio::resource_bridge::pair().map_err(Error::CreateSocket)?;
371 resource_bridge_wl_socket = Some(wl_socket);
372
David Rileyb22b6132018-08-20 08:11:42 -0700373 let jailed_wayland_path = Path::new("/wayland-0");
374
Zach Reizner55a9e502018-10-03 10:22:32 -0700375 let gpu_box = Box::new(devices::virtio::Gpu::new(
376 _exit_evt.try_clone().map_err(Error::CloneEventFd)?,
Zach Reizneraa575662018-08-15 10:46:32 -0700377 Some(gpu_socket),
Zach Reizner55a9e502018-10-03 10:22:32 -0700378 if cfg.multiprocess {
379 &jailed_wayland_path
380 } else {
381 wayland_socket_path.as_path()
382 },
383 ));
David Rileyb22b6132018-08-20 08:11:42 -0700384
385 let jail = if cfg.multiprocess {
386 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("gpu_device.policy");
387 let mut jail = create_base_minijail(empty_root_path, &policy_path)?;
388
389 // Create a tmpfs in the device's root directory so that we can bind mount the
390 // dri directory into it. The size=67108864 is size=64*1024*1024 or size=64MB.
Zach Reizner55a9e502018-10-03 10:22:32 -0700391 jail.mount_with_data(
392 Path::new("none"),
393 Path::new("/"),
394 "tmpfs",
395 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
396 "size=67108864",
David Tolnay2bac1e72018-12-12 14:33:42 -0800397 )
398 .unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700399
400 // Device nodes required for DRM.
401 let sys_dev_char_path = Path::new("/sys/dev/char");
402 jail.mount_bind(sys_dev_char_path, sys_dev_char_path, false)
403 .unwrap();
404 let sys_devices_path = Path::new("/sys/devices");
405 jail.mount_bind(sys_devices_path, sys_devices_path, false)
406 .unwrap();
407 let drm_dri_path = Path::new("/dev/dri");
Zach Reizner55a9e502018-10-03 10:22:32 -0700408 jail.mount_bind(drm_dri_path, drm_dri_path, false).unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700409
410 // Libraries that are required when mesa drivers are dynamically loaded.
411 let lib_path = Path::new("/lib64");
Zach Reizner55a9e502018-10-03 10:22:32 -0700412 jail.mount_bind(lib_path, lib_path, false).unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700413 let usr_lib_path = Path::new("/usr/lib64");
Zach Reizner55a9e502018-10-03 10:22:32 -0700414 jail.mount_bind(usr_lib_path, usr_lib_path, false).unwrap();
David Rileyb22b6132018-08-20 08:11:42 -0700415
416 // Bind mount the wayland socket into jail's root. This is necessary since each
417 // new wayland context must open() the socket.
418 jail.mount_bind(wayland_socket_path.as_path(), jailed_wayland_path, true)
419 .unwrap();
420
421 // Set the uid/gid for the jailed process, and give a basic id map. This
422 // is required for the above bind mount to work.
423 let crosvm_user_group = CStr::from_bytes_with_nul(b"crosvm\0").unwrap();
424 let crosvm_uid = match get_user_id(&crosvm_user_group) {
425 Ok(u) => u,
426 Err(e) => {
427 warn!("falling back to current user id for gpu: {:?}", e);
428 geteuid()
429 }
430 };
431 let crosvm_gid = match get_group_id(&crosvm_user_group) {
432 Ok(u) => u,
433 Err(e) => {
434 warn!("falling back to current group id for gpu: {:?}", e);
435 getegid()
436 }
437 };
438 jail.change_uid(crosvm_uid);
439 jail.change_gid(crosvm_gid);
440 jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
441 .map_err(Error::SettingUidMap)?;
442 jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
443 .map_err(Error::SettingGidMap)?;
444
445 Some(jail)
446 } else {
447 None
448 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700449 devs.push(VirtioDeviceStub { dev: gpu_box, jail });
David Rileyb22b6132018-08-20 08:11:42 -0700450 }
Zach Reizner3a8100a2017-09-13 19:15:43 -0700451 }
452 }
453
Zach Reizneraa575662018-08-15 10:46:32 -0700454 if let Some(wayland_socket_path) = cfg.wayland_socket_path.as_ref() {
455 let wayland_socket_dir = wayland_socket_path
456 .parent()
457 .ok_or(Error::InvalidWaylandPath)?;
458 let wayland_socket_name = wayland_socket_path
459 .file_name()
460 .ok_or(Error::InvalidWaylandPath)?;
461 let jailed_wayland_dir = Path::new("/wayland");
462 let jailed_wayland_path = jailed_wayland_dir.join(wayland_socket_name);
463
464 #[cfg(not(feature = "gpu"))]
465 let resource_bridge_wl_socket = None;
466
467 let wl_box = Box::new(
468 devices::virtio::Wl::new(
469 if cfg.multiprocess {
470 &jailed_wayland_path
471 } else {
472 wayland_socket_path.as_path()
473 },
474 wayland_device_socket,
475 resource_bridge_wl_socket,
David Tolnay2bac1e72018-12-12 14:33:42 -0800476 )
477 .map_err(Error::WaylandDeviceNew)?,
Zach Reizneraa575662018-08-15 10:46:32 -0700478 );
479
480 let jail = if cfg.multiprocess {
481 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("wl_device.policy");
482 let mut jail = create_base_minijail(empty_root_path, &policy_path)?;
483
484 // Create a tmpfs in the device's root directory so that we can bind mount the wayland
485 // socket directory into it. The size=67108864 is size=64*1024*1024 or size=64MB.
486 jail.mount_with_data(
487 Path::new("none"),
488 Path::new("/"),
489 "tmpfs",
490 (libc::MS_NOSUID | libc::MS_NODEV | libc::MS_NOEXEC) as usize,
491 "size=67108864",
David Tolnay2bac1e72018-12-12 14:33:42 -0800492 )
493 .unwrap();
Zach Reizneraa575662018-08-15 10:46:32 -0700494
495 // Bind mount the wayland socket's directory into jail's root. This is necessary since
496 // each new wayland context must open() the socket. If the wayland socket is ever
497 // destroyed and remade in the same host directory, new connections will be possible
498 // without restarting the wayland device.
499 jail.mount_bind(wayland_socket_dir, jailed_wayland_dir, true)
500 .unwrap();
501
502 // Set the uid/gid for the jailed process, and give a basic id map. This
503 // is required for the above bind mount to work.
504 let crosvm_user_group = CStr::from_bytes_with_nul(b"crosvm\0").unwrap();
505 let crosvm_uid = match get_user_id(&crosvm_user_group) {
506 Ok(u) => u,
507 Err(e) => {
508 warn!("falling back to current user id for Wayland: {:?}", e);
509 geteuid()
510 }
511 };
512 let crosvm_gid = match get_group_id(&crosvm_user_group) {
513 Ok(u) => u,
514 Err(e) => {
515 warn!("falling back to current group id for Wayland: {:?}", e);
516 getegid()
517 }
518 };
519 jail.change_uid(crosvm_uid);
520 jail.change_gid(crosvm_gid);
521 jail.uidmap(&format!("{0} {0} 1", crosvm_uid))
522 .map_err(Error::SettingUidMap)?;
523 jail.gidmap(&format!("{0} {0} 1", crosvm_gid))
524 .map_err(Error::SettingGidMap)?;
525
526 Some(jail)
527 } else {
528 None
529 };
530 devs.push(VirtioDeviceStub { dev: wl_box, jail });
531 }
532
533 if let Some(cid) = cfg.cid {
534 let vsock_box = Box::new(
535 devices::virtio::vhost::Vsock::new(cid, &mem).map_err(Error::VhostVsockDeviceNew)?,
536 );
537
538 let jail = if cfg.multiprocess {
539 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("vhost_vsock_device.policy");
540
541 Some(create_base_minijail(empty_root_path, &policy_path)?)
542 } else {
543 None
544 };
545
546 devs.push(VirtioDeviceStub {
547 dev: vsock_box,
548 jail,
549 });
550 }
551
Chirantan Ekboteebd56812018-04-16 19:32:04 -0700552 let chronos_user_group = CStr::from_bytes_with_nul(b"chronos\0").unwrap();
553 let chronos_uid = match get_user_id(&chronos_user_group) {
554 Ok(u) => u,
555 Err(e) => {
556 warn!("falling back to current user id for 9p: {:?}", e);
557 geteuid()
558 }
559 };
560 let chronos_gid = match get_group_id(&chronos_user_group) {
561 Ok(u) => u,
562 Err(e) => {
563 warn!("falling back to current group id for 9p: {:?}", e);
564 getegid()
565 }
566 };
567
568 for &(ref src, ref tag) in &cfg.shared_dirs {
569 let (jail, root) = if cfg.multiprocess {
570 let policy_path: PathBuf = cfg.seccomp_policy_dir.join("9p_device.policy");
571 let mut jail = create_base_minijail(empty_root_path, &policy_path)?;
572
573 // The shared directory becomes the root of the device's file system.
574 let root = Path::new("/");
575 jail.mount_bind(&src, root, true).unwrap();
576
577 // Set the uid/gid for the jailed process, and give a basic id map. This
578 // is required for the above bind mount to work.
579 jail.change_uid(chronos_uid);
580 jail.change_gid(chronos_gid);
581 jail.uidmap(&format!("{0} {0} 1", chronos_uid))
582 .map_err(Error::SettingUidMap)?;
583 jail.gidmap(&format!("{0} {0} 1", chronos_gid))
584 .map_err(Error::SettingGidMap)?;
585
586 (Some(jail), root)
587 } else {
588 // There's no bind mount so we tell the server to treat the source directory as the
589 // root. The double deref here converts |src| from a &PathBuf into a &Path.
590 (None, &**src)
591 };
592
593 let p9_box = Box::new(devices::virtio::P9::new(root, tag).map_err(Error::P9DeviceNew)?);
594
Zach Reizner55a9e502018-10-03 10:22:32 -0700595 devs.push(VirtioDeviceStub { dev: p9_box, jail });
Chirantan Ekboteebd56812018-04-16 19:32:04 -0700596 }
597
Daniel Verkamp8eceba32018-10-18 16:45:13 -0700598 let mut pci_devices: Vec<(Box<PciDevice + 'static>, Option<Minijail>)> = Vec::new();
Daniel Verkamp56f283b2018-10-05 11:40:59 -0700599 for stub in devs {
600 let pci_dev =
601 Box::new(VirtioPciDevice::new((*mem).clone(), stub.dev).map_err(Error::VirtioPciDev)?);
Daniel Verkamp8eceba32018-10-18 16:45:13 -0700602 pci_devices.push((pci_dev, stub.jail));
Daniel Verkamp56f283b2018-10-05 11:40:59 -0700603 }
604
605 Ok(pci_devices)
Sonny Raobb7da422018-02-13 20:37:48 -0800606}
607
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100608fn setup_vcpu_signal_handler() -> Result<()> {
609 unsafe {
610 extern "C" fn handle_signal() {}
611 // Our signal handler does nothing and is trivially async signal safe.
612 register_signal_handler(SIGRTMIN() + 0, handle_signal)
613 .map_err(Error::RegisterSignalHandler)?;
614 }
615 block_signal(SIGRTMIN() + 0).map_err(Error::BlockSignal)?;
616 Ok(())
617}
618
Zach Reizner55a9e502018-10-03 10:22:32 -0700619fn run_vcpu(
620 vcpu: Vcpu,
621 cpu_id: u32,
622 start_barrier: Arc<Barrier>,
623 io_bus: devices::Bus,
624 mmio_bus: devices::Bus,
625 exit_evt: EventFd,
626 kill_signaled: Arc<AtomicBool>,
627) -> Result<JoinHandle<()>> {
Zach Reizner8fb52112017-12-13 16:04:39 -0800628 thread::Builder::new()
629 .name(format!("crosvm_vcpu{}", cpu_id))
630 .spawn(move || {
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100631 let mut sig_ok = true;
632 match get_blocked_signals() {
633 Ok(mut v) => {
634 v.retain(|&x| x != SIGRTMIN() + 0);
635 if let Err(e) = vcpu.set_signal_mask(&v) {
636 error!(
637 "Failed to set the KVM_SIGNAL_MASK for vcpu {} : {:?}",
638 cpu_id, e
639 );
640 sig_ok = false;
641 }
642 }
643 Err(e) => {
644 error!(
645 "Failed to retrieve signal mask for vcpu {} : {:?}",
646 cpu_id, e
647 );
648 sig_ok = false;
649 }
650 };
Zach Reizner39aa26b2017-12-12 18:03:23 -0800651
Zach Reizner8fb52112017-12-13 16:04:39 -0800652 start_barrier.wait();
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100653
David Tolnay8f3a2322018-11-30 17:11:35 -0800654 if sig_ok {
655 loop {
656 match vcpu.run() {
657 Ok(VcpuExit::IoIn { port, mut size }) => {
658 let mut data = [0; 8];
659 if size > data.len() {
660 error!("unsupported IoIn size of {} bytes", size);
661 size = data.len();
Zach Reizner39aa26b2017-12-12 18:03:23 -0800662 }
David Tolnay8f3a2322018-11-30 17:11:35 -0800663 io_bus.read(port as u64, &mut data[..size]);
664 if let Err(e) = vcpu.set_data(&data[..size]) {
665 error!("failed to set return data for IoIn: {:?}", e);
Zach Reizner39aa26b2017-12-12 18:03:23 -0800666 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800667 }
David Tolnay8f3a2322018-11-30 17:11:35 -0800668 Ok(VcpuExit::IoOut {
669 port,
670 mut size,
671 data,
672 }) => {
673 if size > data.len() {
674 error!("unsupported IoOut size of {} bytes", size);
675 size = data.len();
676 }
677 io_bus.write(port as u64, &data[..size]);
678 }
679 Ok(VcpuExit::MmioRead { address, size }) => {
680 let mut data = [0; 8];
681 mmio_bus.read(address, &mut data[..size]);
682 // Setting data for mmio can not fail.
683 let _ = vcpu.set_data(&data[..size]);
684 }
685 Ok(VcpuExit::MmioWrite {
686 address,
687 size,
688 data,
689 }) => {
690 mmio_bus.write(address, &data[..size]);
691 }
692 Ok(VcpuExit::Hlt) => break,
693 Ok(VcpuExit::Shutdown) => break,
694 Ok(VcpuExit::SystemEvent(_, _)) =>
695 //TODO handle reboot and crash events
696 {
697 kill_signaled.store(true, Ordering::SeqCst)
698 }
699 Ok(r) => warn!("unexpected vcpu exit: {:?}", r),
700 Err(e) => match e.errno() {
701 libc::EAGAIN | libc::EINTR => {}
702 _ => {
703 error!("vcpu hit unknown error: {:?}", e);
704 break;
705 }
706 },
Zach Reizner39aa26b2017-12-12 18:03:23 -0800707 }
David Tolnay8f3a2322018-11-30 17:11:35 -0800708 if kill_signaled.load(Ordering::SeqCst) {
709 break;
710 }
Mark Ryan6ed5aea2018-04-20 13:52:35 +0100711
David Tolnay8f3a2322018-11-30 17:11:35 -0800712 // Try to clear the signal that we use to kick VCPU if it is
713 // pending before attempting to handle pause requests.
714 clear_signal(SIGRTMIN() + 0).expect("failed to clear pending signal");
715 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800716 }
Zach Reizner8fb52112017-12-13 16:04:39 -0800717 exit_evt
Zach Reizner39aa26b2017-12-12 18:03:23 -0800718 .write(1)
719 .expect("failed to signal vcpu exit eventfd");
David Tolnay2bac1e72018-12-12 14:33:42 -0800720 })
721 .map_err(Error::SpawnVcpu)
Zach Reizner39aa26b2017-12-12 18:03:23 -0800722}
723
// Parses the beginning of the file at `path` as a decimal u64.
//
// At most 32 bytes are read, with a single `read` call. Surrounding whitespace is ignored.
// Errors from opening or reading the file are returned unchanged; contents that are not valid
// UTF-8 or do not parse as a u64 are reported as `io::ErrorKind::InvalidData`.
fn file_to_u64<P: AsRef<Path>>(path: P) -> io::Result<u64> {
    let mut raw = [0u8; 32];
    let len = File::open(path)?.read(&mut raw)?;

    let text = match str::from_utf8(&raw[..len]) {
        Ok(text) => text,
        Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidData, e)),
    };

    match text.trim().parse::<u64>() {
        Ok(value) => Ok(value),
        Err(e) => Err(io::Error::new(io::ErrorKind::InvalidData, e)),
    }
}
738
Dylan Reid059a1882018-07-23 17:58:09 -0700739pub fn run_config(cfg: Config) -> Result<()> {
Daniel Verkampaac28132018-10-15 14:58:48 -0700740 if cfg.multiprocess {
Dylan Reid059a1882018-07-23 17:58:09 -0700741 // Printing something to the syslog before entering minijail so that libc's syslogger has a
742 // chance to open files necessary for its operation, like `/etc/localtime`. After jailing,
743 // access to those files will not be possible.
744 info!("crosvm entering multiprocess mode");
745 }
746
Dylan Reid059a1882018-07-23 17:58:09 -0700747 // Masking signals is inherently dangerous, since this can persist across clones/execs. Do this
748 // before any jailed devices have been spawned, so that we can catch any of them that fail very
749 // quickly.
750 let sigchld_fd = SignalFd::new(libc::SIGCHLD).map_err(Error::CreateSignalFd)?;
751
752 let components = VmComponents {
Dylan Reid059a1882018-07-23 17:58:09 -0700753 memory_mb: (cfg.memory.unwrap_or(256) << 20) as u64,
754 vcpu_count: cfg.vcpu_count.unwrap_or(1),
755 kernel_image: File::open(cfg.kernel_path.as_path())
756 .map_err(|e| Error::OpenKernel(cfg.kernel_path.clone(), e))?,
Daniel Verkampaac28132018-10-15 14:58:48 -0700757 extra_kernel_params: cfg.params.clone(),
758 wayland_dmabuf: cfg.wayland_dmabuf,
Dylan Reid059a1882018-07-23 17:58:09 -0700759 };
760
761 let mut control_sockets = Vec::new();
762 if let Some(ref path_string) = cfg.socket_path {
763 let path = Path::new(path_string);
764 let dgram = UnixDatagram::bind(path).map_err(Error::CreateSocket)?;
Jingkui Wange13b1802018-10-03 13:04:47 -0700765 control_sockets.push(UnlinkMsgSocket::<VmResponse, VmRequest>::new(
766 UnlinkUnixDatagram(dgram),
767 ));
Dylan Reid059a1882018-07-23 17:58:09 -0700768 };
Zach Reizner55a9e502018-10-03 10:22:32 -0700769 let (wayland_host_socket, wayland_device_socket) =
770 UnixDatagram::pair().map_err(Error::CreateSocket)?;
Jingkui Wange13b1802018-10-03 13:04:47 -0700771 control_sockets.push(UnlinkMsgSocket::<VmResponse, VmRequest>::new(
772 UnlinkUnixDatagram(wayland_host_socket),
773 ));
Dylan Reid059a1882018-07-23 17:58:09 -0700774 // Balloon gets a special socket so balloon requests can be forwarded from the main process.
Zach Reizner55a9e502018-10-03 10:22:32 -0700775 let (balloon_host_socket, balloon_device_socket) =
776 UnixDatagram::pair().map_err(Error::CreateSocket)?;
Dylan Reid059a1882018-07-23 17:58:09 -0700777
Zach Reizner55a9e502018-10-03 10:22:32 -0700778 let linux = Arch::build_vm(components, |m, e| {
Daniel Verkampaac28132018-10-15 14:58:48 -0700779 create_virtio_devs(cfg, m, e, wayland_device_socket, balloon_device_socket)
David Tolnay2bac1e72018-12-12 14:33:42 -0800780 })
781 .map_err(Error::BuildingVm)?;
Dylan Reid059a1882018-07-23 17:58:09 -0700782 run_control(linux, control_sockets, balloon_host_socket, sigchld_fd)
Dylan Reid0ed91ab2018-05-31 15:42:18 -0700783}
784
Zach Reizner55a9e502018-10-03 10:22:32 -0700785fn run_control(
786 mut linux: RunnableLinuxVm,
Jingkui Wange13b1802018-10-03 13:04:47 -0700787 control_sockets: Vec<UnlinkMsgSocket<VmResponse, VmRequest>>,
Zach Reizner55a9e502018-10-03 10:22:32 -0700788 balloon_host_socket: UnixDatagram,
789 sigchld_fd: SignalFd,
790) -> Result<()> {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700791 // Paths to get the currently available memory and the low memory threshold.
David Tolnay5bbbf612018-12-01 17:49:30 -0800792 const LOWMEM_MARGIN: &str = "/sys/kernel/mm/chromeos-low_mem/margin";
793 const LOWMEM_AVAILABLE: &str = "/sys/kernel/mm/chromeos-low_mem/available";
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700794
795 // The amount of additional memory to claim back from the VM whenever the system is
796 // low on memory.
797 const ONE_GB: u64 = (1 << 30);
798
Dylan Reid0ed91ab2018-05-31 15:42:18 -0700799 let max_balloon_memory = match linux.vm.get_memory().memory_size() {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700800 // If the VM has at least 1.5 GB, the balloon driver can consume all but the last 1 GB.
801 n if n >= (ONE_GB / 2) * 3 => n - ONE_GB,
802 // Otherwise, if the VM has at least 500MB the balloon driver will consume at most
803 // half of it.
804 n if n >= (ONE_GB / 2) => n / 2,
805 // Otherwise, the VM is too small for us to take memory away from it.
806 _ => 0,
807 };
808 let mut current_balloon_memory: u64 = 0;
809 let balloon_memory_increment: u64 = max_balloon_memory / 16;
810
Zach Reizner5bed0d22018-03-28 02:31:11 -0700811 #[derive(PollToken)]
812 enum Token {
813 Exit,
814 Stdin,
815 ChildSignal,
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700816 CheckAvailableMemory,
817 LowMemory,
818 LowmemTimer,
Zach Reizner5bed0d22018-03-28 02:31:11 -0700819 VmControl { index: usize },
820 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800821
822 let stdin_handle = stdin();
823 let stdin_lock = stdin_handle.lock();
824 stdin_lock
825 .set_raw_mode()
826 .expect("failed to set terminal raw mode");
827
Zach Reizner5bed0d22018-03-28 02:31:11 -0700828 let poll_ctx = PollContext::new().map_err(Error::CreatePollContext)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700829 poll_ctx
830 .add(&linux.exit_evt, Token::Exit)
831 .map_err(Error::PollContextAdd)?;
Zach Reizner5bed0d22018-03-28 02:31:11 -0700832 if let Err(e) = poll_ctx.add(&stdin_handle, Token::Stdin) {
833 warn!("failed to add stdin to poll context: {:?}", e);
834 }
Zach Reizner55a9e502018-10-03 10:22:32 -0700835 poll_ctx
836 .add(&sigchld_fd, Token::ChildSignal)
837 .map_err(Error::PollContextAdd)?;
Dylan Reid059a1882018-07-23 17:58:09 -0700838 for (index, socket) in control_sockets.iter().enumerate() {
Zach Reizner55a9e502018-10-03 10:22:32 -0700839 poll_ctx
840 .add(socket.as_ref(), Token::VmControl { index })
841 .map_err(Error::PollContextAdd)?;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800842 }
843
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700844 // Watch for low memory notifications and take memory back from the VM.
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700845 let low_mem = File::open("/dev/chromeos-low-mem").ok();
846 if let Some(ref low_mem) = low_mem {
Zach Reizner55a9e502018-10-03 10:22:32 -0700847 poll_ctx
848 .add(low_mem, Token::LowMemory)
849 .map_err(Error::PollContextAdd)?;
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700850 } else {
851 warn!("Unable to open low mem indicator, maybe not a chrome os kernel");
852 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700853
854 // Used to rate limit balloon requests.
855 let mut lowmem_timer = TimerFd::new().map_err(Error::CreateTimerFd)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700856 poll_ctx
857 .add(&lowmem_timer, Token::LowmemTimer)
858 .map_err(Error::PollContextAdd)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700859
860 // Used to check whether it's ok to start giving memory back to the VM.
861 let mut freemem_timer = TimerFd::new().map_err(Error::CreateTimerFd)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700862 poll_ctx
863 .add(&freemem_timer, Token::CheckAvailableMemory)
864 .map_err(Error::PollContextAdd)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700865
866 // Used to add jitter to timer values so that we don't have a thundering herd problem when
867 // multiple VMs are running.
868 let mut rng = thread_rng();
869 let lowmem_jitter_ms = Range::new(0, 200);
870 let freemem_jitter_secs = Range::new(0, 12);
871 let interval_jitter_secs = Range::new(0, 6);
872
Daniel Verkamp37c4a782019-01-04 10:44:17 -0800873 let mut vcpu_handles = Vec::with_capacity(linux.vcpus.len());
874 let vcpu_thread_barrier = Arc::new(Barrier::new(linux.vcpus.len() + 1));
Dylan Reid059a1882018-07-23 17:58:09 -0700875 let kill_signaled = Arc::new(AtomicBool::new(false));
876 setup_vcpu_signal_handler()?;
877 for (cpu_id, vcpu) in linux.vcpus.into_iter().enumerate() {
Zach Reizner55a9e502018-10-03 10:22:32 -0700878 let handle = run_vcpu(
879 vcpu,
880 cpu_id as u32,
881 vcpu_thread_barrier.clone(),
882 linux.io_bus.clone(),
883 linux.mmio_bus.clone(),
884 linux.exit_evt.try_clone().map_err(Error::CloneEventFd)?,
885 kill_signaled.clone(),
886 )?;
Dylan Reid059a1882018-07-23 17:58:09 -0700887 vcpu_handles.push(handle);
888 }
889 vcpu_thread_barrier.wait();
890
Zach Reizner39aa26b2017-12-12 18:03:23 -0800891 'poll: loop {
Zach Reizner5bed0d22018-03-28 02:31:11 -0700892 let events = {
893 match poll_ctx.wait() {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800894 Ok(v) => v,
895 Err(e) => {
896 error!("failed to poll: {:?}", e);
897 break;
898 }
899 }
900 };
Zach Reizner5bed0d22018-03-28 02:31:11 -0700901 for event in events.iter_readable() {
902 match event.token() {
903 Token::Exit => {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800904 info!("vcpu requested shutdown");
905 break 'poll;
906 }
Zach Reizner5bed0d22018-03-28 02:31:11 -0700907 Token::Stdin => {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800908 let mut out = [0u8; 64];
909 match stdin_lock.read_raw(&mut out[..]) {
910 Ok(0) => {
911 // Zero-length read indicates EOF. Remove from pollables.
Zach Reizner5bed0d22018-03-28 02:31:11 -0700912 let _ = poll_ctx.delete(&stdin_handle);
Zach Reizner55a9e502018-10-03 10:22:32 -0700913 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800914 Err(e) => {
915 warn!("error while reading stdin: {:?}", e);
Zach Reizner5bed0d22018-03-28 02:31:11 -0700916 let _ = poll_ctx.delete(&stdin_handle);
Zach Reizner55a9e502018-10-03 10:22:32 -0700917 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800918 Ok(count) => {
Zach Reizner55a9e502018-10-03 10:22:32 -0700919 linux
920 .stdio_serial
Zach Reizner39aa26b2017-12-12 18:03:23 -0800921 .lock()
Zach Reizner39aa26b2017-12-12 18:03:23 -0800922 .queue_input_bytes(&out[..count])
923 .expect("failed to queue bytes into serial port");
Zach Reizner55a9e502018-10-03 10:22:32 -0700924 }
Zach Reizner39aa26b2017-12-12 18:03:23 -0800925 }
926 }
Zach Reizner5bed0d22018-03-28 02:31:11 -0700927 Token::ChildSignal => {
Zach Reizner39aa26b2017-12-12 18:03:23 -0800928 // Print all available siginfo structs, then exit the loop.
David Tolnayf5032762018-12-03 10:46:45 -0800929 while let Some(siginfo) = sigchld_fd.read().map_err(Error::SignalFd)? {
930 error!(
931 "child {} died: signo {}, status {}, code {}",
932 siginfo.ssi_pid,
933 siginfo.ssi_signo,
934 siginfo.ssi_status,
935 siginfo.ssi_code
936 );
Zach Reizner39aa26b2017-12-12 18:03:23 -0800937 }
David Tolnayf5032762018-12-03 10:46:45 -0800938 break 'poll;
Zach Reizner39aa26b2017-12-12 18:03:23 -0800939 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700940 Token::CheckAvailableMemory => {
941 // Acknowledge the timer.
942 freemem_timer.wait().map_err(Error::TimerFd)?;
943 if current_balloon_memory == 0 {
944 // Nothing to see here.
945 if let Err(e) = freemem_timer.clear() {
946 warn!("unable to clear available memory check timer: {}", e);
947 }
948 continue;
949 }
950
951 // Otherwise see if we can free up some memory.
952 let margin = file_to_u64(LOWMEM_MARGIN).map_err(Error::ReadLowmemMargin)?;
Zach Reizner55a9e502018-10-03 10:22:32 -0700953 let available =
954 file_to_u64(LOWMEM_AVAILABLE).map_err(Error::ReadLowmemAvailable)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700955
956 // `available` and `margin` are specified in MB while `balloon_memory_increment` is in
957 // bytes. So to correctly compare them we need to turn the increment value into MB.
Zach Reizner55a9e502018-10-03 10:22:32 -0700958 if available >= margin + 2 * (balloon_memory_increment >> 20) {
959 current_balloon_memory =
960 if current_balloon_memory >= balloon_memory_increment {
961 current_balloon_memory - balloon_memory_increment
962 } else {
963 0
964 };
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700965 let mut buf = [0u8; mem::size_of::<u64>()];
966 LittleEndian::write_u64(&mut buf, current_balloon_memory);
Dylan Reid059a1882018-07-23 17:58:09 -0700967 if let Err(e) = balloon_host_socket.send(&buf) {
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700968 warn!("failed to send memory value to balloon device: {}", e);
969 }
970 }
971 }
972 Token::LowMemory => {
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700973 if let Some(ref low_mem) = low_mem {
974 let old_balloon_memory = current_balloon_memory;
Zach Reizner55a9e502018-10-03 10:22:32 -0700975 current_balloon_memory = min(
976 current_balloon_memory + balloon_memory_increment,
977 max_balloon_memory,
978 );
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700979 if current_balloon_memory != old_balloon_memory {
980 let mut buf = [0u8; mem::size_of::<u64>()];
981 LittleEndian::write_u64(&mut buf, current_balloon_memory);
Dylan Reid059a1882018-07-23 17:58:09 -0700982 if let Err(e) = balloon_host_socket.send(&buf) {
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700983 warn!("failed to send memory value to balloon device: {}", e);
984 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -0700985 }
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700986
987 // Stop polling the lowmem device until the timer fires.
988 poll_ctx.delete(low_mem).map_err(Error::PollContextDelete)?;
989
990 // Add some jitter to the timer so that if there are multiple VMs running
991 // they don't all start ballooning at exactly the same time.
992 let lowmem_dur =
993 Duration::from_millis(1000 + lowmem_jitter_ms.ind_sample(&mut rng));
Zach Reizner55a9e502018-10-03 10:22:32 -0700994 lowmem_timer
995 .reset(lowmem_dur, None)
996 .map_err(Error::ResetTimerFd)?;
Dylan Reidf11e6ed2018-07-31 10:24:06 -0700997
998 // Also start a timer to check when we can start giving memory back. Do the
999 // first check after a minute (with jitter) and subsequent checks after
1000 // every 30 seconds (with jitter).
1001 let freemem_dur =
1002 Duration::from_secs(60 + freemem_jitter_secs.ind_sample(&mut rng));
1003 let freemem_int =
1004 Duration::from_secs(30 + interval_jitter_secs.ind_sample(&mut rng));
1005 freemem_timer
1006 .reset(freemem_dur, Some(freemem_int))
1007 .map_err(Error::ResetTimerFd)?;
Chirantan Ekbote448516e2018-07-24 16:07:42 -07001008 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -07001009 }
1010 Token::LowmemTimer => {
1011 // Acknowledge the timer.
1012 lowmem_timer.wait().map_err(Error::TimerFd)?;
1013
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001014 if let Some(ref low_mem) = low_mem {
1015 // Start polling the lowmem device again.
Zach Reizner55a9e502018-10-03 10:22:32 -07001016 poll_ctx
1017 .add(low_mem, Token::LowMemory)
1018 .map_err(Error::PollContextAdd)?;
Dylan Reidf11e6ed2018-07-31 10:24:06 -07001019 }
Chirantan Ekbote448516e2018-07-24 16:07:42 -07001020 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001021 Token::VmControl { index } => {
Daniel Verkamp37c4a782019-01-04 10:44:17 -08001022 if let Some(socket) = control_sockets.get(index) {
Jingkui Wange13b1802018-10-03 13:04:47 -07001023 match socket.recv() {
Zach Reizner5bed0d22018-03-28 02:31:11 -07001024 Ok(request) => {
1025 let mut running = true;
Zach Reizner55a9e502018-10-03 10:22:32 -07001026 let response = request.execute(
1027 &mut linux.vm,
1028 &mut linux.resources,
1029 &mut running,
1030 &balloon_host_socket,
1031 );
Jingkui Wange13b1802018-10-03 13:04:47 -07001032 if let Err(e) = socket.send(&response) {
Zach Reizner5bed0d22018-03-28 02:31:11 -07001033 error!("failed to send VmResponse: {:?}", e);
1034 }
1035 if !running {
1036 info!("control socket requested exit");
1037 break 'poll;
1038 }
Zach Reizner39aa26b2017-12-12 18:03:23 -08001039 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001040 Err(e) => error!("failed to recv VmRequest: {:?}", e),
Zach Reizner39aa26b2017-12-12 18:03:23 -08001041 }
Zach Reizner39aa26b2017-12-12 18:03:23 -08001042 }
1043 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001044 }
1045 }
1046 for event in events.iter_hungup() {
1047 // It's possible more data is readable and buffered while the socket is hungup, so
1048 // don't delete the socket from the poll context until we're sure all the data is
1049 // read.
1050 if !event.readable() {
1051 match event.token() {
Zach Reizner55a9e502018-10-03 10:22:32 -07001052 Token::Exit => {}
Zach Reizner5bed0d22018-03-28 02:31:11 -07001053 Token::Stdin => {
1054 let _ = poll_ctx.delete(&stdin_handle);
Zach Reizner55a9e502018-10-03 10:22:32 -07001055 }
1056 Token::ChildSignal => {}
1057 Token::CheckAvailableMemory => {}
1058 Token::LowMemory => {}
1059 Token::LowmemTimer => {}
Zach Reizner5bed0d22018-03-28 02:31:11 -07001060 Token::VmControl { index } => {
Daniel Verkamp37c4a782019-01-04 10:44:17 -08001061 if let Some(socket) = control_sockets.get(index) {
Zach Reizner5bed0d22018-03-28 02:31:11 -07001062 let _ = poll_ctx.delete(socket.as_ref());
1063 }
Zach Reizner55a9e502018-10-03 10:22:32 -07001064 }
Zach Reizner5bed0d22018-03-28 02:31:11 -07001065 }
Zach Reizner39aa26b2017-12-12 18:03:23 -08001066 }
1067 }
1068 }
1069
1070 // vcpu threads MUST see the kill signaled flag, otherwise they may
1071 // re-enter the VM.
Dylan Reid059a1882018-07-23 17:58:09 -07001072 kill_signaled.store(true, Ordering::SeqCst);
1073 for handle in vcpu_handles {
Dmitry Torokhovcd405332018-02-16 16:25:54 -08001074 match handle.kill(SIGRTMIN() + 0) {
Zach Reizner39aa26b2017-12-12 18:03:23 -08001075 Ok(_) => {
1076 if let Err(e) = handle.join() {
1077 error!("failed to join vcpu thread: {:?}", e);
1078 }
1079 }
1080 Err(e) => error!("failed to kill vcpu thread: {:?}", e),
1081 }
1082 }
1083
1084 stdin_lock
1085 .set_canon_mode()
1086 .expect("failed to restore canonical mode for terminal");
1087
1088 Ok(())
1089}