crosvm fix stale balloon stats
Stale balloon stats results can be returned from a stats request for the
following reasons:
* The initial stats buffer from the guest is posted to the
balloon_host_tube without a request.
* Balloon stats requests can fail because the balloon device isn't
completely set up yet; writing a stats request to the tube without
reading the response.
* Balloon stats requests can time out, returning an error. When the
balloon stats are eventually computed, they will be queued to the tube
without a read to consume them.
Possibly other reasons too.
This CL fixes this by adding an id to the balloon stats request. The id
is then returned with the computed stats. When consuming stats results
from the balloon_host_tube, we check that the ID is the one we expect,
if not, we keep reading from the tube until we do.
BUG=b:189282316
TEST=tast run dut multivm.Lifecycle.arc_host
Change-Id: I08e50196a45383b30c9e510b3bacbe32888aef80
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/platform/crosvm/+/3056310
Auto-Submit: Charles William Dick <cwd@google.com>
Tested-by: kokoro <noreply+kokoro@google.com>
Commit-Queue: Charles William Dick <cwd@google.com>
Reviewed-by: Chirantan Ekbote <chirantan@chromium.org>
Reviewed-by: Hikaru Nishida <hikalium@chromium.org>
diff --git a/devices/src/virtio/balloon.rs b/devices/src/virtio/balloon.rs
index 009cd56..ecd9a44 100644
--- a/devices/src/virtio/balloon.rs
+++ b/devices/src/virtio/balloon.rs
@@ -12,10 +12,10 @@
use remain::sorted;
use thiserror::Error as ThisError;
-use base::{self, error, info, warn, AsRawDescriptor, AsyncTube, Event, RawDescriptor, Tube};
+use base::{self, error, warn, AsRawDescriptor, AsyncTube, Event, RawDescriptor, Tube};
use cros_async::{select6, EventAsync, Executor};
use data_model::{DataInit, Le16, Le32, Le64};
-use vm_control::{BalloonControlCommand, BalloonControlResult, BalloonStats};
+use vm_control::{BalloonStats, BalloonTubeCommand, BalloonTubeResult};
use vm_memory::{GuestAddress, GuestMemory};
use super::{
@@ -190,12 +190,34 @@
mem: &GuestMemory,
mut queue: Queue,
mut queue_event: EventAsync,
- mut stats_rx: mpsc::Receiver<()>,
+ mut stats_rx: mpsc::Receiver<u64>,
command_tube: &Tube,
config: Arc<BalloonConfig>,
interrupt: Rc<RefCell<Interrupt>>,
) {
+ // Consume the first stats buffer sent from the guest at startup. It was not
+ // requested by anyone, and the stats are stale.
+ let mut index = match queue.next_async(mem, &mut queue_event).await {
+ Err(e) => {
+ error!("Failed to read descriptor {}", e);
+ return;
+ }
+ Ok(d) => d.index,
+ };
loop {
+ // Wait for a request to read the stats.
+ let id = match stats_rx.next().await {
+ Some(id) => id,
+ None => {
+ error!("stats signal tube was closed");
+ break;
+ }
+ };
+
+ // Request a new stats_desc to the guest.
+ queue.add_used(&mem, index, 0);
+ queue.trigger_interrupt(&mem, &*interrupt.borrow());
+
let stats_desc = match queue.next_async(mem, &mut queue_event).await {
Err(e) => {
error!("Failed to read descriptor {}", e);
@@ -203,7 +225,7 @@
}
Ok(d) => d,
};
- let index = stats_desc.index;
+ index = stats_desc.index;
let mut reader = match Reader::new(mem.clone(), stats_desc) {
Ok(r) => r,
Err(e) => {
@@ -222,23 +244,14 @@
};
}
let actual_pages = config.actual_pages.load(Ordering::Relaxed) as u64;
- let result = BalloonControlResult::Stats {
+ let result = BalloonTubeResult::Stats {
balloon_actual: actual_pages << VIRTIO_BALLOON_PFN_SHIFT,
stats,
+ id,
};
if let Err(e) = command_tube.send(&result) {
error!("failed to send stats result: {}", e);
}
-
- // Wait for a request to read the stats again.
- if stats_rx.next().await.is_none() {
- error!("stats signal tube was closed");
- break;
- }
-
- // Request a new stats_desc to the guest.
- queue.add_used(&mem, index, 0);
- queue.trigger_interrupt(&mem, &*interrupt.borrow());
}
}
@@ -248,20 +261,19 @@
command_tube: &AsyncTube,
interrupt: Rc<RefCell<Interrupt>>,
config: Arc<BalloonConfig>,
- mut stats_tx: mpsc::Sender<()>,
+ mut stats_tx: mpsc::Sender<u64>,
) -> Result<()> {
loop {
match command_tube.next().await {
Ok(command) => match command {
- BalloonControlCommand::Adjust { num_bytes } => {
+ BalloonTubeCommand::Adjust { num_bytes } => {
let num_pages = (num_bytes >> VIRTIO_BALLOON_PFN_SHIFT) as usize;
- info!("balloon config changed to consume {} pages", num_pages);
config.num_pages.store(num_pages, Ordering::Relaxed);
interrupt.borrow_mut().signal_config_changed();
}
- BalloonControlCommand::Stats => {
- if let Err(e) = stats_tx.try_send(()) {
+ BalloonTubeCommand::Stats { id } => {
+ if let Err(e) = stats_tx.try_send(id) {
error!("failed to signal the stat handler: {}", e);
}
}
@@ -347,8 +359,10 @@
);
pin_mut!(deflate);
- // The third queue is used for stats messages
- let (stats_tx, stats_rx) = mpsc::channel::<()>(1);
+ // The third queue is used for stats messages. The message type is the
+ // id of the stats request, so we can detect if there are any stale
+ // stats results that were queued during an error condition.
+ let (stats_tx, stats_rx) = mpsc::channel::<u64>(1);
let stats_event =
EventAsync::new(queue_evts.remove(0).0, &ex).expect("failed to set up the stats event");
let stats = handle_stats_queue(
diff --git a/src/error.rs b/src/error.rs
index 73446d2..2d6b198 100644
--- a/src/error.rs
+++ b/src/error.rs
@@ -35,7 +35,6 @@
AddPmemDeviceMemory(base::Error),
AllocateGpuDeviceAddress,
AllocatePmemDeviceAddress(resources::Error),
- BalloonActualTooLarge,
BalloonDeviceNew(virtio::BalloonError),
BlockDeviceNew(base::Error),
BlockSignal(base::signal::Error),
@@ -82,10 +81,6 @@
GenerateAcpi,
GetMaxOpenFiles(io::Error),
GetSignalMask(base::signal::Error),
- GuestCachedMissing(),
- GuestCachedTooLarge(std::num::TryFromIntError),
- GuestFreeMissing(),
- GuestFreeTooLarge(std::num::TryFromIntError),
GuestMemoryLayout(<Arch as LinuxArch>::Error),
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
HandleDebugCommand(<Arch as LinuxArch>::Error),
@@ -166,7 +161,6 @@
AllocatePmemDeviceAddress(e) => {
write!(f, "failed to allocate memory for pmem device: {}", e)
}
- BalloonActualTooLarge => write!(f, "balloon actual size is too large"),
BalloonDeviceNew(e) => write!(f, "failed to create balloon: {}", e),
BlockDeviceNew(e) => write!(f, "failed to create block device: {}", e),
BlockSignal(e) => write!(f, "failed to block signal: {}", e),
@@ -215,10 +209,6 @@
GenerateAcpi => write!(f, "failed to generate ACPI table"),
GetMaxOpenFiles(e) => write!(f, "failed to get max number of open files: {}", e),
GetSignalMask(e) => write!(f, "failed to retrieve signal mask for vcpu: {}", e),
- GuestCachedMissing() => write!(f, "guest cached is missing from balloon stats"),
- GuestCachedTooLarge(e) => write!(f, "guest cached is too large: {}", e),
- GuestFreeMissing() => write!(f, "guest free is missing from balloon stats"),
- GuestFreeTooLarge(e) => write!(f, "guest free is too large: {}", e),
GuestMemoryLayout(e) => write!(f, "failed to create guest memory layout: {}", e),
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
HandleDebugCommand(e) => write!(f, "failed to handle a gdb command: {}", e),
diff --git a/src/linux.rs b/src/linux.rs
index 1853833..f8f6d3c 100644
--- a/src/linux.rs
+++ b/src/linux.rs
@@ -2647,6 +2647,8 @@
vcpu_thread_barrier.wait();
+ let mut balloon_stats_id: u64 = 0;
+
'wait: loop {
let events = {
match wait_ctx.wait() {
@@ -2725,6 +2727,7 @@
let response = request.execute(
&mut run_mode_opt,
&balloon_host_tube,
+ &mut balloon_stats_id,
disk_host_tubes,
#[cfg(feature = "usb")]
Some(&usb_control_tube),
diff --git a/vm_control/src/lib.rs b/vm_control/src/lib.rs
index d52ce7c..0ecb659 100644
--- a/vm_control/src/lib.rs
+++ b/vm_control/src/lib.rs
@@ -103,6 +103,7 @@
/// require adding a big dependency for a single const.
pub const USB_CONTROL_MAX_PORTS: usize = 16;
+// Balloon commands that are sent on the crosvm control socket.
#[derive(Serialize, Deserialize, Debug)]
pub enum BalloonControlCommand {
/// Set the size of the VM's balloon.
@@ -112,6 +113,17 @@
Stats,
}
+// Balloon commands that are send on the balloon command tube.
+//
+// This is the same as BalloonControlCommand above, but includes an ID for the
+// Stats request so that we can discard stale stats if any previous stats
+// request failed or timed out.
+#[derive(Serialize, Deserialize, Debug)]
+pub enum BalloonTubeCommand {
+ Adjust { num_bytes: u64 },
+ Stats { id: u64 },
+}
+
// BalloonStats holds stats returned from the stats_queue.
#[derive(Default, Serialize, Deserialize, Debug)]
pub struct BalloonStats {
@@ -127,6 +139,7 @@
pub hugetlb_failures: Option<u64>,
}
+// BalloonControlResult holds results for BalloonControlCommand defined above.
#[derive(Serialize, Deserialize, Debug)]
pub enum BalloonControlResult {
Stats {
@@ -135,6 +148,18 @@
},
}
+// BalloonTubeResult are results to BalloonTubeCommand defined above. This is
+// the same as BalloonControlResult, but with an added ID so that we can detect
+// stale balloon stats values queued to the balloon command tube.
+#[derive(Serialize, Deserialize, Debug)]
+pub enum BalloonTubeResult {
+ Stats {
+ stats: BalloonStats,
+ balloon_actual: u64,
+ id: u64,
+ },
+}
+
#[derive(Serialize, Deserialize, Debug)]
pub enum DiskControlCommand {
/// Resize a disk to `new_size` in bytes.
@@ -944,6 +969,7 @@
&self,
run_mode: &mut Option<VmRunMode>,
balloon_host_tube: &Tube,
+ balloon_stats_id: &mut u64,
disk_host_tubes: &[Tube],
usb_control_tube: Option<&Tube>,
bat_control: &mut Option<BatControl>,
@@ -962,26 +988,50 @@
VmResponse::Ok
}
VmRequest::BalloonCommand(BalloonControlCommand::Adjust { num_bytes }) => {
- match balloon_host_tube.send(&BalloonControlCommand::Adjust { num_bytes }) {
+ match balloon_host_tube.send(&BalloonTubeCommand::Adjust { num_bytes }) {
Ok(_) => VmResponse::Ok,
Err(_) => VmResponse::Err(SysError::last()),
}
}
VmRequest::BalloonCommand(BalloonControlCommand::Stats) => {
- match balloon_host_tube.send(&BalloonControlCommand::Stats {}) {
- Ok(_) => match balloon_host_tube.recv() {
- Ok(BalloonControlResult::Stats {
- stats,
- balloon_actual,
- }) => VmResponse::BalloonStats {
- stats,
- balloon_actual,
- },
- Err(e) => {
- error!("balloon socket recv failed: {}", e);
- VmResponse::Err(SysError::last())
+ // NB: There are a few reasons stale balloon stats could be left
+ // in balloon_host_tube:
+ // - the send succeeds, but the recv fails because the device
+ // is not ready yet. So when the device is ready, there are
+ // extra stats requests queued.
+ // - the send succeed, but the recv times out. When the device
+ // does return the stats, there will be no consumer.
+ //
+ // To guard against this, add an `id` to the stats request. If
+ // the id returned to us doesn't match, we keep trying to read
+ // until it does.
+ *balloon_stats_id = (*balloon_stats_id).wrapping_add(1);
+ let sent_id = *balloon_stats_id;
+ match balloon_host_tube.send(&BalloonTubeCommand::Stats { id: sent_id }) {
+ Ok(_) => {
+ loop {
+ match balloon_host_tube.recv() {
+ Ok(BalloonTubeResult::Stats {
+ stats,
+ balloon_actual,
+ id,
+ }) => {
+ if sent_id != id {
+ // Keep trying to get the fresh stats.
+ continue;
+ }
+ break VmResponse::BalloonStats {
+ stats,
+ balloon_actual,
+ };
+ }
+ Err(e) => {
+ error!("balloon socket recv failed: {}", e);
+ break VmResponse::Err(SysError::last());
+ }
+ }
}
- },
+ }
Err(_) => VmResponse::Err(SysError::last()),
}
}