@@ -9,6 +9,7 @@ use kvm_bindings::{kvm_userspace_memory_region, KVM_MEM_LOG_DIRTY_PAGES};
9
9
use kvm_ioctls:: VmFd ;
10
10
use vmm_sys_util:: eventfd:: EventFd ;
11
11
12
+ use crate :: logger:: info;
12
13
use crate :: vstate:: memory:: { Address , GuestMemory , GuestMemoryMmap , GuestMemoryRegion } ;
13
14
14
15
#[ cfg( target_arch = "x86_64" ) ]
@@ -42,6 +43,42 @@ pub enum VmError {
42
43
43
44
/// Contains Vm functions that are usable across CPU architectures
44
45
impl Vm {
46
+ fn create_vm ( kvm : & crate :: vstate:: kvm:: Kvm ) -> Result < VmFd , VmError > {
47
+ // It is known that KVM_CREATE_VM occasionally fails with EINTR on heavily loaded machines
48
+ // with many VMs.
49
+ //
50
+ // The behavior itself that KVM_CREATE_VM can return EINTR is intentional. This is because
51
+ // the KVM_CREATE_VM path includes mm_take_all_locks() that is CPU intensive and all CPU
52
+ // intensive syscalls should check for pending signals and return EINTR immediately to allow
53
+ // userland to remain interactive.
54
+ // https://lists.nongnu.org/archive/html/qemu-devel/2014-01/msg01740.html
55
+ //
56
+ // However, it is empirically confirmed that, even though there is no pending signal,
57
+ // KVM_CREATE_VM returns EINTR.
58
+ // https://lore.kernel.org/qemu-devel/[email protected] /
59
+ //
60
+ // To mitigate it, QEMU does an inifinite retry on EINTR that greatly improves reliabiliy:
61
+ // - https://github.com/qemu/qemu/commit/94ccff133820552a859c0fb95e33a539e0b90a75
62
+ // - https://github.com/qemu/qemu/commit/bbde13cd14ad4eec18529ce0bf5876058464e124
63
+ //
64
+ // Similarly, we do retries up to 5 times. Although Firecracker clients are also able to
65
+ // retry, they have to start Firecracker from scratch. Doing retries in Firecracker makes
66
+ // recovery faster and improves reliability.
67
+ const MAX_ATTEMPTS : u32 = 5 ;
68
+ for attempt in 1 ..=MAX_ATTEMPTS {
69
+ match kvm. fd . create_vm ( ) {
70
+ Ok ( fd) => return Ok ( fd) ,
71
+ Err ( e) if e. errno ( ) == libc:: EINTR && attempt < MAX_ATTEMPTS => {
72
+ info ! ( "Attempt #{attempt} of KVM_CREATE_VM returned EINTR" ) ;
73
+ // Exponential backoff (1us, 2us, 4us, and 8us => 15us in total)
74
+ std:: thread:: sleep ( std:: time:: Duration :: from_micros ( 2u64 . pow ( attempt - 1 ) ) ) ;
75
+ }
76
+ Err ( e) => return Err ( VmError :: CreateVm ( e) ) ,
77
+ }
78
+ }
79
+ unreachable ! ( ) ;
80
+ }
81
+
45
82
/// Creates the specified number of [`Vcpu`]s.
46
83
///
47
84
/// The returned [`EventFd`] is written to whenever any of the vcpus exit.
0 commit comments