Skip to content

Commit 5e0efad

Browse files
committed
fix: Retry KVM_CREATE_VM on EINTR
It is known that KVM_CREATE_VM fails with EINTR on heavily loaded machines with many VMs. It might be a kernel bug but apparently has not been fixed. To mitigate it, QEMU retries indefinitely on EINTR. Similarly, retry up to 5 times. Signed-off-by: Takahiro Itazuri <[email protected]>
1 parent 8757b06 commit 5e0efad

File tree

3 files changed

+40
-2
lines changed

3 files changed

+40
-2
lines changed

src/vmm/src/vstate/vm/aarch64.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ pub enum ArchVmError {
3030
impl ArchVm {
3131
/// Create a new `Vm` struct.
3232
pub fn new(kvm: &Kvm) -> Result<ArchVm, VmError> {
33-
let fd = kvm.fd.create_vm().map_err(VmError::CreateVm)?;
33+
let fd = Self::create_vm(kvm)?;
3434
Ok(ArchVm {
3535
fd,
3636
irqchip_handle: None,

src/vmm/src/vstate/vm/mod.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ use kvm_bindings::{kvm_userspace_memory_region, KVM_MEM_LOG_DIRTY_PAGES};
99
use kvm_ioctls::VmFd;
1010
use vmm_sys_util::eventfd::EventFd;
1111

12+
use crate::logger::info;
1213
use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRegion};
1314

1415
#[cfg(target_arch = "x86_64")]
@@ -42,6 +43,42 @@ pub enum VmError {
4243

4344
/// Contains Vm functions that are usable across CPU architectures
4445
impl Vm {
46+
fn create_vm(kvm: &crate::vstate::kvm::Kvm) -> Result<VmFd, VmError> {
47+
// It is known that KVM_CREATE_VM occasionally fails with EINTR on heavily loaded machines
48+
// with many VMs.
49+
//
50+
// The fact that KVM_CREATE_VM can return EINTR is itself intentional. This is because
51+
// the KVM_CREATE_VM path includes mm_take_all_locks() that is CPU intensive and all CPU
52+
// intensive syscalls should check for pending signals and return EINTR immediately to allow
53+
// userland to remain interactive.
54+
// https://lists.nongnu.org/archive/html/qemu-devel/2014-01/msg01740.html
55+
//
56+
// However, it is empirically confirmed that, even though there is no pending signal,
57+
// KVM_CREATE_VM returns EINTR.
58+
// https://lore.kernel.org/qemu-devel/[email protected]/
59+
//
60+
// To mitigate it, QEMU does an infinite retry on EINTR that greatly improves reliability:
61+
// - https://github.com/qemu/qemu/commit/94ccff133820552a859c0fb95e33a539e0b90a75
62+
// - https://github.com/qemu/qemu/commit/bbde13cd14ad4eec18529ce0bf5876058464e124
63+
//
64+
// Similarly, we do retries up to 5 times. Although Firecracker clients are also able to
65+
// retry, they have to start Firecracker from scratch. Doing retries in Firecracker makes
66+
// recovery faster and improves reliability.
67+
const MAX_ATTEMPTS: u32 = 5;
68+
for attempt in 1..=MAX_ATTEMPTS {
69+
match kvm.fd.create_vm() {
70+
Ok(fd) => return Ok(fd),
71+
Err(e) if e.errno() == libc::EINTR && attempt < MAX_ATTEMPTS => {
72+
info!("Attempt #{attempt} of KVM_CREATE_VM returned EINTR");
73+
// Exponential backoff (1us, 2us, 4us, and 8us => 15us in total)
74+
std::thread::sleep(std::time::Duration::from_micros(2u64.pow(attempt - 1)));
75+
}
76+
Err(e) => return Err(VmError::CreateVm(e)),
77+
}
78+
}
79+
unreachable!();
80+
}
81+
4582
/// Creates the specified number of [`Vcpu`]s.
4683
///
4784
/// The returned [`EventFd`] is written to whenever any of the vcpus exit.

src/vmm/src/vstate/vm/x86_64.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ pub struct ArchVm {
5353
impl ArchVm {
5454
/// Create a new `Vm` struct.
5555
pub fn new(kvm: &crate::vstate::kvm::Kvm) -> Result<ArchVm, VmError> {
56-
let fd = kvm.fd.create_vm().map_err(VmError::CreateVm)?;
56+
let fd = Self::create_vm(kvm)?;
57+
5758
let msrs_to_save = kvm.msrs_to_save().map_err(ArchVmError::GetMsrsToSave)?;
5859

5960
fd.set_tss_address(u64_to_usize(crate::arch::x86_64::layout::KVM_TSS_ADDRESS))

0 commit comments

Comments
 (0)