diff --git a/CHANGELOG.md b/CHANGELOG.md index 138ccd20079..00e82ee4b19 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,13 @@ and this project adheres to ### Added +- [#4987](https://github.com/firecracker-microvm/firecracker/pull/4987): Reset + physical counter register (`CNTPCT_EL0`) on VM startup. This avoids the VM reading + the host physical counter value. This is only possible on 6.4 and newer + kernels. For older kernels the physical counter will still be passed to the guest + unmodified. See more info + [here](https://github.com/firecracker-microvm/firecracker/blob/main/docs/prod-host-setup.md#arm-only-vm-physical-counter-behaviour) + ### Changed - [#4913](https://github.com/firecracker-microvm/firecracker/pull/4913): Removed diff --git a/docs/prod-host-setup.md b/docs/prod-host-setup.md index f046e31c735..cf0ac86cf1f 100644 --- a/docs/prod-host-setup.md +++ b/docs/prod-host-setup.md @@ -328,13 +328,16 @@ For vendor-specific recommendations, please consult the resources below: - ARM: [Speculative Processor Vulnerability](https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability) -##### [ARM only] Physical counter directly passed through to the guest +##### [ARM only] VM Physical counter behaviour -On ARM, the physical counter (i.e `CNTPCT`) it is returning the -[actual EL1 physical counter value of the host][1]. From the discussions before -merging this change [upstream][2], this seems like a conscious design decision -of the ARM code contributors, giving precedence to performance over the ability -to trap and control this in the hypervisor. +On ARM, Firecracker tries to reset the `CNTPCT` physical counter on VM boot. +This is done in order to prevent the VM from reading the host physical counter value. +Firecracker will only try to reset the counter if the host KVM contains +the `KVM_CAP_COUNTER_OFFSET` capability.
This capability is only present in kernels +containing +[this](https://lore.kernel.org/all/20230330174800.2677007-1-maz@kernel.org/) +patch series (starting from 6.4 and newer). For older kernels the counter value +will be passed through from the host. ##### Verification @@ -428,6 +431,3 @@ To validate that the change took effect, the file [^1]: Look for `GRUB_CMDLINE_LINUX` in file `/etc/default/grub` in RPM-based systems, and [this doc for Ubuntu](https://wiki.ubuntu.com/Kernel/KernelBootParameters). - -[1]: https://elixir.free-electrons.com/linux/v4.14.203/source/virt/kvm/arm/hyp/timer-sr.c#L63 -[2]: https://lists.cs.columbia.edu/pipermail/kvmarm/2017-January/023323.html diff --git a/src/vmm/src/arch/aarch64/regs.rs b/src/vmm/src/arch/aarch64/regs.rs index 5238f58ba70..d844fbfb56b 100644 --- a/src/vmm/src/arch/aarch64/regs.rs +++ b/src/vmm/src/arch/aarch64/regs.rs @@ -99,6 +99,12 @@ arm64_sys_reg!(SYS_CNTV_CVAL_EL0, 3, 3, 14, 3, 2); // https://elixir.bootlin.com/linux/v6.8/source/arch/arm64/include/asm/sysreg.h#L459 arm64_sys_reg!(SYS_CNTPCT_EL0, 3, 3, 14, 0, 1); +// Physical Timer EL0 count Register +// The id of this register is the same as SYS_CNTPCT_EL0, but KVM defines it +// separately, so we do as well.
+// https://elixir.bootlin.com/linux/v6.12.6/source/arch/arm64/include/uapi/asm/kvm.h#L259 +arm64_sys_reg!(KVM_REG_ARM_PTIMER_CNT, 3, 3, 14, 0, 1); + // Translation Table Base Register // https://developer.arm.com/documentation/ddi0595/2021-03/AArch64-Registers/TTBR1-EL1--Translation-Table-Base-Register-1--EL1- arm64_sys_reg!(TTBR1_EL1, 3, 0, 2, 0, 1); diff --git a/src/vmm/src/arch/aarch64/vcpu.rs b/src/vmm/src/arch/aarch64/vcpu.rs index 80fc5a339df..7b34ae91896 100644 --- a/src/vmm/src/arch/aarch64/vcpu.rs +++ b/src/vmm/src/arch/aarch64/vcpu.rs @@ -13,6 +13,7 @@ use kvm_ioctls::VcpuFd; use super::get_fdt_addr; use super::regs::*; +use crate::vstate::kvm::OptionalCapabilities; use crate::vstate::memory::GuestMemoryMmap; /// Errors thrown while setting aarch64 registers. @@ -78,6 +79,7 @@ pub fn setup_boot_regs( cpu_id: u8, boot_ip: u64, mem: &GuestMemoryMmap, + optional_capabilities: &OptionalCapabilities, ) -> Result<(), VcpuError> { let kreg_off = offset_of!(kvm_regs, regs); @@ -106,6 +108,23 @@ vcpufd .set_one_reg(id, &get_fdt_addr(mem).to_le_bytes()) .map_err(|err| VcpuError::SetOneReg(id, err))?; + + // Reset the physical counter for the guest. This way we avoid guest reading + // host physical counter. + // Resetting KVM_REG_ARM_PTIMER_CNT for single vcpu is enough because there is only + // one timer struct with offsets per VM. + // Because the access to KVM_REG_ARM_PTIMER_CNT is only present starting from the 6.4 kernel, + // we only do the reset if KVM_CAP_COUNTER_OFFSET is present as it was added + // in the same patch series as the ability to set the KVM_REG_ARM_PTIMER_CNT register. + // Patch series which introduced the needed changes: + // https://lore.kernel.org/all/20230330174800.2677007-1-maz@kernel.org/ + // Note: the value observed by the guest will still be above 0, because there is a delta + // time between this resetting and first call to KVM_RUN.
+ if optional_capabilities.counter_offset { + vcpufd + .set_one_reg(KVM_REG_ARM_PTIMER_CNT, &[0; 8]) + .map_err(|err| VcpuError::SetOneReg(id, err))?; + } } Ok(()) } @@ -214,20 +233,21 @@ pub fn set_mpstate(vcpufd: &VcpuFd, state: kvm_mp_state) -> Result<(), VcpuError #[cfg(test)] mod tests { #![allow(clippy::undocumented_unsafe_blocks)] - use kvm_ioctls::Kvm; use super::*; use crate::arch::aarch64::layout; use crate::test_utils::arch_mem; + use crate::vstate::kvm::Kvm; #[test] fn test_setup_regs() { - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mem = arch_mem(layout::FDT_MAX_SIZE + 0x1000); + let optional_capabilities = kvm.optional_capabilities(); - let res = setup_boot_regs(&vcpu, 0, 0x0, &mem); + let res = setup_boot_regs(&vcpu, 0, 0x0, &mem, &optional_capabilities); assert!(matches!( res.unwrap_err(), VcpuError::SetOneReg(0x6030000000100042, _) @@ -237,13 +257,31 @@ mod tests { vm.get_preferred_target(&mut kvi).unwrap(); vcpu.vcpu_init(&kvi).unwrap(); - setup_boot_regs(&vcpu, 0, 0x0, &mem).unwrap(); + setup_boot_regs(&vcpu, 0, 0x0, &mem, &optional_capabilities).unwrap(); + + // Check that the register is reset on compatible kernels. + // Because there is a delta in time between when we reset the register and when we + // read it, we cannot compare with 0. Instead we compare it with a meaningfully + // small value. + if optional_capabilities.counter_offset { + let mut reg_bytes = [0_u8; 8]; + vcpu.get_one_reg(SYS_CNTPCT_EL0, &mut reg_bytes).unwrap(); + let counter_value = u64::from_le_bytes(reg_bytes); + + // We are reading the SYS_CNTPCT_EL0 right after resetting it. + // If the reset did happen successfully, the value should be quite small when we read it. + // If the reset did not happen, the value will be the same as on the host and it surely + // will be more than MAX_VALUE.
+ let max_value = 1000; + + assert!(counter_value < max_value); + } } #[test] fn test_read_mpidr() { - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); vm.get_preferred_target(&mut kvi).unwrap(); @@ -261,8 +299,8 @@ mod tests { #[test] fn test_get_set_regs() { - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); vm.get_preferred_target(&mut kvi).unwrap(); @@ -283,8 +321,8 @@ mod tests { fn test_mpstate() { use std::os::unix::io::AsRawFd; - let kvm = Kvm::new().unwrap(); - let vm = kvm.create_vm().unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let vm = kvm.fd.create_vm().unwrap(); let vcpu = vm.create_vcpu(0).unwrap(); let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default(); vm.get_preferred_target(&mut kvi).unwrap(); diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 999e27415fc..4b43e67541f 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -68,6 +68,7 @@ use crate::utils::u64_to_usize; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::machine_config::{MachineConfig, MachineConfigError}; +use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemory, GuestMemoryMmap}; use crate::vstate::vcpu::{Vcpu, VcpuConfig, VcpuError}; use crate::vstate::vm::Vm; @@ -160,11 +161,17 @@ fn create_vmm_and_vcpus( ) -> Result<(Vmm, Vec), StartMicrovmError> { use self::StartMicrovmError::*; + let kvm = Kvm::new(kvm_capabilities) + .map_err(VmmError::Kvm) + .map_err(StartMicrovmError::Internal)?; // 
Set up Kvm Vm and register memory regions. // Build custom CPU config if a custom template is provided. - let mut vm = Vm::new(kvm_capabilities) + let mut vm = Vm::new(&kvm) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; + kvm.check_memory(&guest_memory) + .map_err(VmmError::Kvm) + .map_err(StartMicrovmError::Internal)?; vm.memory_init(&guest_memory, track_dirty_pages) .map_err(VmmError::Vm) .map_err(StartMicrovmError::Internal)?; @@ -186,7 +193,7 @@ fn create_vmm_and_vcpus( #[cfg(target_arch = "x86_64")] let (vcpus, pio_device_manager) = { setup_interrupt_controller(&mut vm)?; - let vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; + let vcpus = create_vcpus(&kvm, &vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; // Make stdout non blocking. set_stdout_nonblocking(); @@ -218,7 +225,7 @@ fn create_vmm_and_vcpus( // Search for `kvm_arch_vcpu_create` in arch/arm/kvm/arm.c. #[cfg(target_arch = "aarch64")] let vcpus = { - let vcpus = create_vcpus(&vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; + let vcpus = create_vcpus(&kvm, &vm, vcpu_count, &vcpus_exit_evt).map_err(Internal)?; setup_interrupt_controller(&mut vm, vcpu_count)?; vcpus }; @@ -227,6 +234,7 @@ fn create_vmm_and_vcpus( events_observer: Some(std::io::stdin()), instance_info: instance_info.clone(), shutdown_exit_code: None, + kvm, vm, guest_memory, uffd, @@ -476,7 +484,7 @@ pub fn build_microvm_from_snapshot( uffd, vm_resources.machine_config.track_dirty_pages, vm_resources.machine_config.vcpu_count, - microvm_state.vm_state.kvm_cap_modifiers.clone(), + microvm_state.kvm_state.kvm_cap_modifiers.clone(), )?; #[cfg(target_arch = "x86_64")] @@ -738,11 +746,16 @@ fn attach_legacy_devices_aarch64( .map_err(VmmError::RegisterMMIODevice) } -fn create_vcpus(vm: &Vm, vcpu_count: u8, exit_evt: &EventFd) -> Result, VmmError> { +fn create_vcpus( + kvm: &Kvm, + vm: &Vm, + vcpu_count: u8, + exit_evt: &EventFd, +) -> Result, VmmError> { let mut vcpus = 
Vec::with_capacity(vcpu_count as usize); for cpu_idx in 0..vcpu_count { let exit_evt = exit_evt.try_clone().map_err(VmmError::EventFd)?; - let vcpu = Vcpu::new(cpu_idx, vm, exit_evt).map_err(VmmError::VcpuCreate)?; + let vcpu = Vcpu::new(cpu_idx, vm, kvm, exit_evt).map_err(VmmError::VcpuCreate)?; vcpus.push(vcpu); } Ok(vcpus) @@ -765,7 +778,7 @@ pub fn configure_system_for_boot( #[cfg(target_arch = "x86_64")] let cpu_config = { use crate::cpu_config::x86_64::cpuid; - let cpuid = cpuid::Cpuid::try_from(vmm.vm.supported_cpuid().clone()) + let cpuid = cpuid::Cpuid::try_from(vmm.kvm.supported_cpuid.clone()) .map_err(GuestConfigError::CpuidFromKvmCpuid)?; let msrs = vcpus[0] .kvm_vcpu @@ -801,16 +814,16 @@ pub fn configure_system_for_boot( cpu_config, }; - // Configure vCPUs with normalizing and setting the generated CPU configuration. - for vcpu in vcpus.iter_mut() { - vcpu.kvm_vcpu - .configure(vmm.guest_memory(), entry_addr, &vcpu_config) - .map_err(VmmError::VcpuConfigure) - .map_err(Internal)?; - } - #[cfg(target_arch = "x86_64")] { + // Configure vCPUs with normalizing and setting the generated CPU configuration. + for vcpu in vcpus.iter_mut() { + vcpu.kvm_vcpu + .configure(vmm.guest_memory(), entry_addr, &vcpu_config) + .map_err(VmmError::VcpuConfigure) + .map_err(Internal)?; + } + // Write the kernel command line to guest memory. This is x86_64 specific, since on // aarch64 the command line will be specified through the FDT. let cmdline_size = boot_cmdline @@ -845,6 +858,19 @@ pub fn configure_system_for_boot( } #[cfg(target_arch = "aarch64")] { + let optional_capabilities = vmm.kvm.optional_capabilities(); + // Configure vCPUs with normalizing and setting the generated CPU configuration. 
+ for vcpu in vcpus.iter_mut() { + vcpu.kvm_vcpu + .configure( + vmm.guest_memory(), + entry_addr, + &vcpu_config, + &optional_capabilities, + ) + .map_err(VmmError::VcpuConfigure) + .map_err(Internal)?; + } let vcpu_mpidr = vcpus .iter_mut() .map(|cpu| cpu.kvm_vcpu.get_mpidr()) @@ -1111,7 +1137,8 @@ pub(crate) mod tests { .map_err(StartMicrovmError::Internal) .unwrap(); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).unwrap(); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); let mmio_device_manager = MMIODeviceManager::new(); let acpi_device_manager = ACPIDeviceManager::new(); @@ -1137,7 +1164,7 @@ pub(crate) mod tests { #[cfg(target_arch = "aarch64")] { let exit_evt = EventFd::new(libc::EFD_NONBLOCK).unwrap(); - let _vcpu = Vcpu::new(1, &vm, exit_evt).unwrap(); + let _vcpu = Vcpu::new(1, &vm, &kvm, exit_evt).unwrap(); setup_interrupt_controller(&mut vm, 1).unwrap(); } @@ -1145,6 +1172,7 @@ pub(crate) mod tests { events_observer: Some(std::io::stdin()), instance_info: InstanceInfo::default(), shutdown_exit_code: None, + kvm, vm, guest_memory, uffd: None, @@ -1362,15 +1390,16 @@ pub(crate) mod tests { let vcpu_count = 2; let guest_memory = arch_mem(128 << 20); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); #[allow(unused_mut)] - let mut vm = Vm::new(vec![]).unwrap(); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_memory, false).unwrap(); let evfd = EventFd::new(libc::EFD_NONBLOCK).unwrap(); #[cfg(target_arch = "x86_64")] setup_interrupt_controller(&mut vm).unwrap(); - let vcpu_vec = create_vcpus(&vm, vcpu_count, &evfd).unwrap(); + let vcpu_vec = create_vcpus(&kvm, &vm, vcpu_count, &evfd).unwrap(); assert_eq!(vcpu_vec.len(), vcpu_count as usize); } diff --git a/src/vmm/src/device_manager/legacy.rs b/src/vmm/src/device_manager/legacy.rs index 45842d933b2..8526d3c2901 100644 --- a/src/vmm/src/device_manager/legacy.rs +++ b/src/vmm/src/device_manager/legacy.rs @@ -244,14 +244,11 @@ impl 
PortIODeviceManager { #[cfg(test)] mod tests { use super::*; - use crate::test_utils::single_region_mem; - use crate::Vm; + use crate::vstate::vm::tests::setup_vm_with_memory; #[test] fn test_register_legacy_devices() { - let guest_mem = single_region_mem(0x1000); - let mut vm = Vm::new(vec![]).unwrap(); - vm.memory_init(&guest_mem, false).unwrap(); + let (_, mut vm, _) = setup_vm_with_memory(0x1000); crate::builder::setup_interrupt_controller(&mut vm).unwrap(); let mut ldm = PortIODeviceManager::new( Arc::new(Mutex::new(BusDevice::Serial(SerialDevice { diff --git a/src/vmm/src/device_manager/mmio.rs b/src/vmm/src/device_manager/mmio.rs index 00c155abcfd..635bc1bc6e0 100644 --- a/src/vmm/src/device_manager/mmio.rs +++ b/src/vmm/src/device_manager/mmio.rs @@ -544,6 +544,7 @@ mod tests { use crate::devices::virtio::queue::Queue; use crate::devices::virtio::ActivateError; use crate::test_utils::multi_region_mem; + use crate::vstate::kvm::Kvm; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; use crate::{builder, Vm}; @@ -661,7 +662,8 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = ResourceAllocator::new().unwrap(); @@ -690,7 +692,8 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mut device_manager = MMIODeviceManager::new(); let mut resource_allocator = 
ResourceAllocator::new().unwrap(); @@ -744,7 +747,8 @@ mod tests { let start_addr1 = GuestAddress(0x0); let start_addr2 = GuestAddress(0x1000); let guest_mem = multi_region_mem(&[(start_addr1, 0x1000), (start_addr2, 0x1000)]); - let mut vm = Vm::new(vec![]).unwrap(); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let mut vm = Vm::new(&kvm).unwrap(); vm.memory_init(&guest_mem, false).unwrap(); let mem_clone = guest_mem.clone(); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 77c0018c55a..6833a3a12d2 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -127,6 +127,7 @@ use userfaultfd::Uffd; use vmm_sys_util::epoll::EventSet; use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::terminal::Terminal; +use vstate::kvm::Kvm; use vstate::vcpu::{self, KvmVcpuConfigureError, StartThreadedError, VcpuSendEventError}; use crate::arch::DeviceType; @@ -255,6 +256,8 @@ pub enum VmmError { VcpuSpawn(io::Error), /// Vm error: {0} Vm(vstate::vm::VmError), + /// Kvm error: {0} + Kvm(vstate::kvm::KvmError), /// Error thrown by observer object on Vmm initialization: {0} VmmObserverInit(vmm_sys_util::errno::Error), /// Error thrown by observer object on Vmm teardown: {0} @@ -307,6 +310,7 @@ pub struct Vmm { shutdown_exit_code: Option, // Guest VM core resources. + kvm: Kvm, vm: Vm, guest_memory: GuestMemoryMmap, // Save UFFD in order to keep it open in the Firecracker process, as well. 
@@ -511,6 +515,7 @@ impl Vmm { pub fn save_state(&mut self, vm_info: &VmInfo) -> Result { use self::MicrovmStateError::SaveVmState; let vcpu_states = self.save_vcpu_states()?; + let kvm_state = self.kvm.save_state(); let vm_state = { #[cfg(target_arch = "x86_64")] { @@ -531,6 +536,7 @@ impl Vmm { Ok(MicrovmState { vm_info: vm_info.clone(), memory_state, + kvm_state, vm_state, vcpu_states, device_states, diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 621d95d1e87..c9aadad10a9 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -36,6 +36,7 @@ use crate::vmm_config::machine_config::{HugePageConfig, MachineConfigError, Mach use crate::vmm_config::snapshot::{ CreateSnapshotParams, LoadSnapshotParams, MemBackendType, SnapshotType, }; +use crate::vstate::kvm::KvmState; use crate::vstate::memory::{ GuestMemory, GuestMemoryExtension, GuestMemoryMmap, GuestMemoryState, MemoryError, }; @@ -77,6 +78,8 @@ pub struct MicrovmState { pub vm_info: VmInfo, /// Memory state. pub memory_state: GuestMemoryState, + /// KVM state. + pub kvm_state: KvmState, /// VM KVM state. pub vm_state: VmState, /// Vcpu states. @@ -736,6 +739,7 @@ mod tests { device_states: states, memory_state, vcpu_states, + kvm_state: Default::default(), vm_info: VmInfo { mem_size_mib: 1u64, ..Default::default() diff --git a/src/vmm/src/vstate/kvm.rs b/src/vmm/src/vstate/kvm.rs new file mode 100644 index 00000000000..59b192dbe09 --- /dev/null +++ b/src/vmm/src/vstate/kvm.rs @@ -0,0 +1,221 @@ +// Copyright 2025 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use kvm_bindings::KVM_API_VERSION; +#[cfg(target_arch = "x86_64")] +use kvm_bindings::{CpuId, MsrList, KVM_MAX_CPUID_ENTRIES}; +use kvm_ioctls::Kvm as KvmFd; +use serde::{Deserialize, Serialize}; + +use crate::cpu_config::templates::KvmCapability; +use crate::vstate::memory::{GuestMemory, GuestMemoryMmap}; + +/// Errors associated with the wrappers over KVM ioctls.
+/// Needs `rustfmt::skip` to make multiline comments work +#[rustfmt::skip] +#[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)] +pub enum KvmError { + /// The host kernel reports an invalid KVM API version: {0} + ApiVersion(i32), + /// Missing KVM capabilities: {0:#x?} + Capabilities(u32), + /** Error creating KVM object: {0} Make sure the user launching the firecracker process is \ configured on the /dev/kvm file's ACL. */ + Kvm(kvm_ioctls::Error), + #[cfg(target_arch = "x86_64")] + /// Failed to get MSR index list to save into snapshots: {0} + GetMsrsToSave(crate::arch::x86_64::msr::MsrError), + #[cfg(target_arch = "x86_64")] + /// Failed to get supported cpuid: {0} + GetSupportedCpuId(kvm_ioctls::Error), + /// The number of configured slots is bigger than the maximum reported by KVM + NotEnoughMemorySlots, +} + +/// Struct with kvm fd and kvm associated parameters. +#[derive(Debug)] +pub struct Kvm { + /// KVM fd. + pub fd: KvmFd, + /// Maximum number of memory slots allowed by KVM. + pub max_memslots: usize, + /// Additional capabilities that were specified in cpu template. + pub kvm_cap_modifiers: Vec<KvmCapability>, + + #[cfg(target_arch = "x86_64")] + /// Supported CpuIds. + pub supported_cpuid: CpuId, + #[cfg(target_arch = "x86_64")] + /// Msrs needed to be saved on snapshot creation. + pub msrs_to_save: MsrList, +} + +impl Kvm { + /// Create `Kvm` struct. + pub fn new(kvm_cap_modifiers: Vec<KvmCapability>) -> Result<Self, KvmError> { + let kvm_fd = KvmFd::new().map_err(KvmError::Kvm)?; + + // Check that KVM has the correct version. + // Safe to cast because this is a constant. + #[allow(clippy::cast_possible_wrap)] + if kvm_fd.get_api_version() != KVM_API_VERSION as i32 { + return Err(KvmError::ApiVersion(kvm_fd.get_api_version())); + } + + let total_caps = Self::combine_capabilities(&kvm_cap_modifiers); + // Check that all desired capabilities are supported.
+ Self::check_capabilities(&kvm_fd, &total_caps).map_err(KvmError::Capabilities)?; + + let max_memslots = kvm_fd.get_nr_memslots(); + + #[cfg(target_arch = "aarch64")] + { + Ok(Self { + fd: kvm_fd, + max_memslots, + kvm_cap_modifiers, + }) + } + + #[cfg(target_arch = "x86_64")] + { + let supported_cpuid = kvm_fd + .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES) + .map_err(KvmError::GetSupportedCpuId)?; + let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm_fd) + .map_err(KvmError::GetMsrsToSave)?; + + Ok(Kvm { + fd: kvm_fd, + max_memslots, + kvm_cap_modifiers, + supported_cpuid, + msrs_to_save, + }) + } + } + + /// Check guest memory does not have more regions than kvm allows. + pub fn check_memory(&self, guest_mem: &GuestMemoryMmap) -> Result<(), KvmError> { + if guest_mem.num_regions() > self.max_memslots { + Err(KvmError::NotEnoughMemorySlots) + } else { + Ok(()) + } + } + + fn combine_capabilities(kvm_cap_modifiers: &[KvmCapability]) -> Vec { + let mut total_caps = Self::DEFAULT_CAPABILITIES.to_vec(); + for modifier in kvm_cap_modifiers.iter() { + match modifier { + KvmCapability::Add(cap) => { + if !total_caps.contains(cap) { + total_caps.push(*cap); + } + } + KvmCapability::Remove(cap) => { + if let Some(pos) = total_caps.iter().position(|c| c == cap) { + total_caps.swap_remove(pos); + } + } + } + } + total_caps + } + + fn check_capabilities(kvm_fd: &KvmFd, capabilities: &[u32]) -> Result<(), u32> { + for cap in capabilities { + // If capability is not supported kernel will return 0. + if kvm_fd.check_extension_raw(u64::from(*cap)) == 0 { + return Err(*cap); + } + } + Ok(()) + } + + /// Saves and returns the Kvm state. + pub fn save_state(&self) -> KvmState { + KvmState { + kvm_cap_modifiers: self.kvm_cap_modifiers.clone(), + } + } +} +#[cfg(target_arch = "aarch64")] +/// Optional capabilities. 
+#[derive(Debug, Default)] +pub struct OptionalCapabilities { + /// KVM_CAP_COUNTER_OFFSET + pub counter_offset: bool, +} +#[cfg(target_arch = "aarch64")] +impl Kvm { + const DEFAULT_CAPABILITIES: [u32; 7] = [ + kvm_bindings::KVM_CAP_IOEVENTFD, + kvm_bindings::KVM_CAP_IRQFD, + kvm_bindings::KVM_CAP_USER_MEMORY, + kvm_bindings::KVM_CAP_ARM_PSCI_0_2, + kvm_bindings::KVM_CAP_DEVICE_CTRL, + kvm_bindings::KVM_CAP_MP_STATE, + kvm_bindings::KVM_CAP_ONE_REG, + ]; + + /// Returns struct with optional capabilities statuses. + pub fn optional_capabilities(&self) -> OptionalCapabilities { + OptionalCapabilities { + counter_offset: self + .fd + .check_extension_raw(kvm_bindings::KVM_CAP_COUNTER_OFFSET.into()) + != 0, + } + } +} + +#[cfg(target_arch = "x86_64")] +impl Kvm { + const DEFAULT_CAPABILITIES: [u32; 14] = [ + kvm_bindings::KVM_CAP_IRQCHIP, + kvm_bindings::KVM_CAP_IOEVENTFD, + kvm_bindings::KVM_CAP_IRQFD, + kvm_bindings::KVM_CAP_USER_MEMORY, + kvm_bindings::KVM_CAP_SET_TSS_ADDR, + kvm_bindings::KVM_CAP_PIT2, + kvm_bindings::KVM_CAP_PIT_STATE2, + kvm_bindings::KVM_CAP_ADJUST_CLOCK, + kvm_bindings::KVM_CAP_DEBUGREGS, + kvm_bindings::KVM_CAP_MP_STATE, + kvm_bindings::KVM_CAP_VCPU_EVENTS, + kvm_bindings::KVM_CAP_XCRS, + kvm_bindings::KVM_CAP_XSAVE, + kvm_bindings::KVM_CAP_EXT_CPUID, + ]; +} + +/// Structure holding an general specific VM state. +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct KvmState { + /// Additional capabilities that were specified in cpu template. + pub kvm_cap_modifiers: Vec, +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + + #[test] + fn test_combine_capabilities() { + // Default caps for x86_64 and aarch64 both have KVM_CAP_IOEVENTFD and don't have + // KVM_CAP_IOMMU caps. 
+ let additional_capabilities = vec![ + KvmCapability::Add(kvm_bindings::KVM_CAP_IOMMU), + KvmCapability::Remove(kvm_bindings::KVM_CAP_IOEVENTFD), + ]; + + let combined_caps = Kvm::combine_capabilities(&additional_capabilities); + assert!(combined_caps + .iter() + .any(|c| *c == kvm_bindings::KVM_CAP_IOMMU)); + assert!(!combined_caps + .iter() + .any(|c| *c == kvm_bindings::KVM_CAP_IOEVENTFD)); + } +} diff --git a/src/vmm/src/vstate/mod.rs b/src/vmm/src/vstate/mod.rs index 32d7bd7ea7f..47458835e04 100644 --- a/src/vmm/src/vstate/mod.rs +++ b/src/vmm/src/vstate/mod.rs @@ -1,6 +1,8 @@ // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 +/// Module with Kvm implementation. +pub mod kvm; /// Module with GuestMemory implementation. pub mod memory; /// Module with Vcpu implementation. diff --git a/src/vmm/src/vstate/vcpu/aarch64.rs b/src/vmm/src/vstate/vcpu/aarch64.rs index 4e006f196d0..9db1173eeb8 100644 --- a/src/vmm/src/vstate/vcpu/aarch64.rs +++ b/src/vmm/src/vstate/vcpu/aarch64.rs @@ -22,6 +22,7 @@ use crate::cpu_config::aarch64::custom_cpu_template::VcpuFeatures; use crate::cpu_config::templates::CpuConfiguration; use crate::logger::{error, IncMetric, METRICS}; use crate::vcpu::{VcpuConfig, VcpuError}; +use crate::vstate::kvm::{Kvm, OptionalCapabilities}; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; use crate::vstate::vcpu::VcpuEmulation; use crate::vstate::vm::Vm; @@ -77,7 +78,7 @@ impl KvmVcpu { /// /// * `index` - Represents the 0-based CPU index between [0, max vcpus). /// * `vm` - The vm to which this vcpu will get attached. 
- pub fn new(index: u8, vm: &Vm) -> Result { + pub fn new(index: u8, vm: &Vm, _: &Kvm) -> Result { let kvm_vcpu = vm .fd() .create_vcpu(index.into()) @@ -115,6 +116,7 @@ impl KvmVcpu { guest_mem: &GuestMemoryMmap, kernel_load_addr: GuestAddress, vcpu_config: &VcpuConfig, + optional_capabilities: &OptionalCapabilities, ) -> Result<(), KvmVcpuError> { for reg in vcpu_config.cpu_config.regs.iter() { self.fd @@ -127,6 +129,7 @@ impl KvmVcpu { self.index, kernel_load_addr.raw_value(), guest_mem, + optional_capabilities, ) .map_err(KvmVcpuError::ConfigureRegisters)?; @@ -305,26 +308,27 @@ mod tests { use crate::cpu_config::aarch64::CpuConfiguration; use crate::cpu_config::templates::RegisterValueFilter; use crate::vcpu::VcpuConfig; + use crate::vstate::kvm::Kvm; use crate::vstate::memory::GuestMemoryMmap; - use crate::vstate::vm::tests::setup_vm; + use crate::vstate::vm::tests::setup_vm_with_memory; use crate::vstate::vm::Vm; - fn setup_vcpu(mem_size: usize) -> (Vm, KvmVcpu, GuestMemoryMmap) { - let (mut vm, vm_mem) = setup_vm(mem_size); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu, GuestMemoryMmap) { + let (kvm, mut vm, vm_mem) = setup_vm_with_memory(mem_size); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vcpu.init(&[]).unwrap(); vm.setup_irqchip(1).unwrap(); - (vm, vcpu, vm_mem) + (kvm, vm, vcpu, vm_mem) } #[test] fn test_create_vcpu() { - let (vm, _) = setup_vm(0x1000); + let (kvm, vm, _) = setup_vm_with_memory(0x1000); unsafe { libc::close(vm.fd().as_raw_fd()) }; - let err = KvmVcpu::new(0, &vm); + let err = KvmVcpu::new(0, &vm, &kvm); assert_eq!( err.err().unwrap().to_string(), "Error creating vcpu: Bad file descriptor (os error 9)".to_string() @@ -336,7 +340,8 @@ mod tests { #[test] fn test_configure_vcpu() { - let (_vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let optional_capabilities = kvm.optional_capabilities(); let vcpu_config = 
VcpuConfig { vcpu_count: 1, @@ -347,6 +352,7 @@ mod tests { &vm_mem, GuestAddress(crate::arch::get_kernel_start()), &vcpu_config, + &optional_capabilities, ) .unwrap(); @@ -356,6 +362,7 @@ mod tests { &vm_mem, GuestAddress(crate::arch::get_kernel_start()), &vcpu_config, + &optional_capabilities, ); assert_eq!( err.unwrap_err(), @@ -371,8 +378,8 @@ mod tests { #[test] fn test_init_vcpu() { - let (mut vm, _vm_mem) = setup_vm(0x1000); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); // KVM_ARM_VCPU_PSCI_0_2 is set by default. @@ -390,8 +397,8 @@ mod tests { #[test] fn test_vcpu_save_restore_state() { - let (mut vm, _vm_mem) = setup_vm(0x1000); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); // Calling KVM_GET_REGLIST before KVM_VCPU_INIT will result in error. @@ -434,8 +441,8 @@ mod tests { // // This should fail with ENOEXEC. // https://elixir.bootlin.com/linux/v5.10.176/source/arch/arm64/kvm/arm.c#L1165 - let (mut vm, _vm_mem) = setup_vm(0x1000); - let vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); vcpu.dump_cpu_config().unwrap_err(); @@ -444,8 +451,8 @@ mod tests { #[test] fn test_dump_cpu_config_after_init() { // Test `dump_cpu_config()` after `KVM_VCPU_INIT`. 
- let (mut vm, _vm_mem) = setup_vm(0x1000); - let mut vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, mut vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); vm.setup_irqchip(1).unwrap(); vcpu.init(&[]).unwrap(); @@ -454,10 +461,10 @@ mod tests { #[test] fn test_setup_non_boot_vcpu() { - let (vm, _) = setup_vm(0x1000); - let mut vcpu1 = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, vm, _) = setup_vm_with_memory(0x1000); + let mut vcpu1 = KvmVcpu::new(0, &vm, &kvm).unwrap(); vcpu1.init(&[]).unwrap(); - let mut vcpu2 = KvmVcpu::new(1, &vm).unwrap(); + let mut vcpu2 = KvmVcpu::new(1, &vm, &kvm).unwrap(); vcpu2.init(&[]).unwrap(); } @@ -466,7 +473,7 @@ mod tests { // Test `get_regs()` with valid register IDs. // - X0: 0x6030 0000 0010 0000 // - X1: 0x6030 0000 0010 0002 - let (_, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu, _) = setup_vcpu(0x10000); let reg_list = Vec::::from([0x6030000000100000, 0x6030000000100002]); get_registers(&vcpu.fd, ®_list, &mut Aarch64RegisterVec::default()).unwrap(); } @@ -474,7 +481,7 @@ mod tests { #[test] fn test_get_invalid_regs() { // Test `get_regs()` with invalid register IDs. - let (_, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu, _) = setup_vcpu(0x10000); let reg_list = Vec::::from([0x6030000000100001, 0x6030000000100003]); get_registers(&vcpu.fd, ®_list, &mut Aarch64RegisterVec::default()).unwrap_err(); } diff --git a/src/vmm/src/vstate/vcpu/mod.rs b/src/vmm/src/vstate/vcpu/mod.rs index ddfeda21b4c..6a6471193dc 100644 --- a/src/vmm/src/vstate/vcpu/mod.rs +++ b/src/vmm/src/vstate/vcpu/mod.rs @@ -44,6 +44,8 @@ pub use aarch64::{KvmVcpuError, *}; #[cfg(target_arch = "x86_64")] pub use x86_64::{KvmVcpuError, *}; +use super::kvm::Kvm; + /// Signal number (SIGRTMIN) used to kick Vcpus. pub const VCPU_RTSIG_OFFSET: i32 = 0; @@ -212,10 +214,10 @@ impl Vcpu { /// * `index` - Represents the 0-based CPU index between [0, max vcpus). /// * `vm` - The vm to which this vcpu will get attached. 
/// * `exit_evt` - An `EventFd` that will be written into when this vcpu exits. - pub fn new(index: u8, vm: &Vm, exit_evt: EventFd) -> Result { + pub fn new(index: u8, vm: &Vm, kvm: &Kvm, exit_evt: EventFd) -> Result { let (event_sender, event_receiver) = channel(); let (response_sender, response_receiver) = channel(); - let kvm_vcpu = KvmVcpu::new(index, vm).unwrap(); + let kvm_vcpu = KvmVcpu::new(index, vm, kvm).unwrap(); Ok(Vcpu { exit_evt, @@ -777,13 +779,13 @@ pub(crate) mod tests { use crate::utils::signal::validate_signal_num; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; use crate::vstate::vcpu::VcpuError as EmulationError; - use crate::vstate::vm::tests::setup_vm; + use crate::vstate::vm::tests::setup_vm_with_memory; use crate::vstate::vm::Vm; use crate::RECV_TIMEOUT_SEC; #[test] fn test_handle_kvm_exit() { - let (_vm, mut vcpu, _vm_mem) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _vm_mem) = setup_vcpu(0x1000); let res = handle_kvm_exit(&mut vcpu.kvm_vcpu.peripherals, Ok(VcpuExit::Hlt)); assert_eq!(res.unwrap(), VcpuEmulation::Stopped); @@ -918,14 +920,14 @@ pub(crate) mod tests { // Auxiliary function being used throughout the tests. 
#[allow(unused_mut)] - pub(crate) fn setup_vcpu(mem_size: usize) -> (Vm, Vcpu, GuestMemoryMmap) { - let (mut vm, gm) = setup_vm(mem_size); + pub(crate) fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, Vcpu, GuestMemoryMmap) { + let (kvm, mut vm, gm) = setup_vm_with_memory(mem_size); let exit_evt = EventFd::new(libc::EFD_NONBLOCK).unwrap(); #[cfg(target_arch = "aarch64")] let vcpu = { - let mut vcpu = Vcpu::new(1, &vm, exit_evt).unwrap(); + let mut vcpu = Vcpu::new(1, &vm, &kvm, exit_evt).unwrap(); vcpu.kvm_vcpu.init(&[]).unwrap(); vm.setup_irqchip(1).unwrap(); vcpu @@ -933,9 +935,9 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] let vcpu = { vm.setup_irqchip().unwrap(); - Vcpu::new(1, &vm, exit_evt).unwrap() + Vcpu::new(1, &vm, &kvm, exit_evt).unwrap() }; - (vm, vcpu, gm) + (kvm, vm, vcpu, gm) } fn load_good_kernel(vm_memory: &GuestMemoryMmap) -> GuestAddress { @@ -970,7 +972,7 @@ pub(crate) mod tests { Vcpu::register_kick_signal_handler(); // Need enough mem to boot linux. let mem_size = 64 << 20; - let (_vm, mut vcpu, vm_mem) = setup_vcpu(mem_size); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(mem_size); let vcpu_exit_evt = vcpu.exit_evt.try_clone().unwrap(); @@ -988,7 +990,7 @@ pub(crate) mod tests { vcpu_count: 1, smt: false, cpu_config: CpuConfiguration { - cpuid: Cpuid::try_from(_vm.supported_cpuid().clone()).unwrap(), + cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(), msrs: BTreeMap::new(), }, }, @@ -1006,6 +1008,7 @@ pub(crate) mod tests { smt: false, cpu_config: crate::cpu_config::aarch64::CpuConfiguration::default(), }, + &kvm.optional_capabilities(), ) .expect("failed to configure vcpu"); @@ -1022,7 +1025,7 @@ pub(crate) mod tests { #[test] fn test_set_mmio_bus() { - let (_, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_none()); vcpu.set_mmio_bus(crate::devices::Bus::new()); assert!(vcpu.kvm_vcpu.peripherals.mmio_bus.is_some()); @@ -1030,7 +1033,7 @@ 
pub(crate) mod tests { #[test] fn test_vcpu_tls() { - let (_, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); // Running on the TLS vcpu should fail before we actually initialize it. unsafe { @@ -1061,7 +1064,7 @@ pub(crate) mod tests { #[test] fn test_invalid_tls() { - let (_, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); // Initialize vcpu TLS. vcpu.init_thread_local_data().unwrap(); // Trying to initialize non-empty TLS should error. @@ -1071,7 +1074,7 @@ pub(crate) mod tests { #[test] fn test_vcpu_kick() { Vcpu::register_kick_signal_handler(); - let (vm, mut vcpu, _) = setup_vcpu(0x1000); + let (_, vm, mut vcpu, _) = setup_vcpu(0x1000); let mut kvm_run = kvm_ioctls::KvmRunWrapper::mmap_from_fd(&vcpu.kvm_vcpu.fd, vm.fd().run_size()) @@ -1126,7 +1129,7 @@ pub(crate) mod tests { #[test] fn test_immediate_exit_shortcircuits_execution() { - let (_vm, mut vcpu, _) = setup_vcpu(0x1000); + let (_, _, mut vcpu, _) = setup_vcpu(0x1000); vcpu.kvm_vcpu.fd.set_kvm_immediate_exit(1); // Set a dummy value to be returned by the emulate call diff --git a/src/vmm/src/vstate/vcpu/x86_64.rs b/src/vmm/src/vstate/vcpu/x86_64.rs index 4043691130d..39ff0879ee8 100644 --- a/src/vmm/src/vstate/vcpu/x86_64.rs +++ b/src/vmm/src/vstate/vcpu/x86_64.rs @@ -23,6 +23,7 @@ use crate::arch::x86_64::msr::{create_boot_msr_entries, MsrError}; use crate::arch::x86_64::regs::{SetupFpuError, SetupRegistersError, SetupSpecialRegistersError}; use crate::cpu_config::x86_64::{cpuid, CpuConfiguration}; use crate::logger::{IncMetric, METRICS}; +use crate::vstate::kvm::Kvm; use crate::vstate::memory::{Address, GuestAddress, GuestMemoryMmap}; use crate::vstate::vcpu::{VcpuConfig, VcpuEmulation}; use crate::vstate::vm::Vm; @@ -164,7 +165,7 @@ impl KvmVcpu { /// /// * `index` - Represents the 0-based CPU index between [0, max vcpus). /// * `vm` - The vm to which this vcpu will get attached. 
- pub fn new(index: u8, vm: &Vm) -> Result { + pub fn new(index: u8, vm: &Vm, kvm: &Kvm) -> Result { let kvm_vcpu = vm .fd() .create_vcpu(index.into()) @@ -174,7 +175,7 @@ impl KvmVcpu { index, fd: kvm_vcpu, peripherals: Default::default(), - msrs_to_save: vm.msrs_to_save().as_slice().to_vec(), + msrs_to_save: kvm.msrs_to_save.as_slice().to_vec(), }) } @@ -716,7 +717,7 @@ mod tests { #![allow(clippy::undocumented_unsafe_blocks)] use kvm_bindings::kvm_msr_entry; - use kvm_ioctls::{Cap, Kvm}; + use kvm_ioctls::Cap; use super::*; use crate::arch::x86_64::cpu_model::CpuModel; @@ -725,7 +726,7 @@ mod tests { StaticCpuTemplate, }; use crate::cpu_config::x86_64::cpuid::{Cpuid, CpuidEntry, CpuidKey}; - use crate::vstate::vm::tests::setup_vm; + use crate::vstate::vm::tests::{setup_vm, setup_vm_with_memory}; use crate::vstate::vm::Vm; impl Default for VcpuState { @@ -746,11 +747,11 @@ mod tests { } } - fn setup_vcpu(mem_size: usize) -> (Vm, KvmVcpu, GuestMemoryMmap) { - let (vm, vm_mem) = setup_vm(mem_size); + fn setup_vcpu(mem_size: usize) -> (Kvm, Vm, KvmVcpu, GuestMemoryMmap) { + let (kvm, vm, vm_mem) = setup_vm_with_memory(mem_size); vm.setup_irqchip().unwrap(); - let vcpu = KvmVcpu::new(0, &vm).unwrap(); - (vm, vcpu, vm_mem) + let vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); + (kvm, vm, vcpu, vm_mem) } fn is_at_least_cascade_lake() -> bool { @@ -765,11 +766,11 @@ mod tests { } fn create_vcpu_config( - vm: &Vm, + kvm: &Kvm, vcpu: &KvmVcpu, template: &CustomCpuTemplate, ) -> Result { - let cpuid = Cpuid::try_from(vm.supported_cpuid().clone()) + let cpuid = Cpuid::try_from(kvm.supported_cpuid.clone()) .map_err(GuestConfigError::CpuidFromKvmCpuid)?; let msrs = vcpu .get_msrs(template.msr_index_iter()) @@ -785,19 +786,19 @@ mod tests { #[test] fn test_configure_vcpu() { - let (vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); - let vcpu_config = create_vcpu_config(&vm, &vcpu, &CustomCpuTemplate::default()).unwrap(); + let 
vcpu_config = create_vcpu_config(&kvm, &vcpu, &CustomCpuTemplate::default()).unwrap(); assert_eq!( vcpu.configure(&vm_mem, GuestAddress(0), &vcpu_config,), Ok(()) ); - let try_configure = |vm: &Vm, vcpu: &mut KvmVcpu, template| -> bool { + let try_configure = |kvm: &Kvm, vcpu: &mut KvmVcpu, template| -> bool { let cpu_template = Some(CpuTemplateType::Static(template)); let template = cpu_template.get_cpu_template(); match template { - Ok(template) => match create_vcpu_config(vm, vcpu, &template) { + Ok(template) => match create_vcpu_config(kvm, vcpu, &template) { Ok(config) => vcpu .configure( &vm_mem, @@ -812,19 +813,19 @@ mod tests { }; // Test configure while using the T2 template. - let t2_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2); + let t2_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2); // Test configure while using the C3 template. - let c3_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::C3); + let c3_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::C3); // Test configure while using the T2S template. - let t2s_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2S); + let t2s_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2S); // Test configure while using the T2CL template. - let t2cl_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2CL); + let t2cl_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2CL); // Test configure while using the T2S template. - let t2a_res = try_configure(&vm, &mut vcpu, StaticCpuTemplate::T2A); + let t2a_res = try_configure(&kvm, &mut vcpu, StaticCpuTemplate::T2A); match &cpuid::common::get_vendor_id_from_host().unwrap() { cpuid::VENDOR_ID_INTEL => { @@ -857,8 +858,8 @@ mod tests { #[test] fn test_vcpu_cpuid_restore() { - let (vm, vcpu, _mem) = setup_vcpu(0x10000); - vcpu.fd.set_cpuid2(vm.supported_cpuid()).unwrap(); + let (kvm, _, vcpu, _mem) = setup_vcpu(0x10000); + vcpu.fd.set_cpuid2(&kvm.supported_cpuid).unwrap(); // Mutate the CPUID. 
// Leaf 0x3 / EAX that is an unused (reserved to be accurate) register, so it's harmless. @@ -875,7 +876,7 @@ mod tests { drop(vcpu); // Restore the state into a new vcpu. - let (_vm, vcpu, _mem) = setup_vcpu(0x10000); + let (_, _vm, vcpu, _mem) = setup_vcpu(0x10000); let result2 = vcpu.restore_state(&state); assert!(result2.is_ok(), "{}", result2.unwrap_err()); @@ -895,12 +896,12 @@ mod tests { #[test] fn test_empty_cpuid_entries_removed() { // Test that `get_cpuid()` removes zeroed empty entries from the `KVM_GET_CPUID2` result. - let (vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); let vcpu_config = VcpuConfig { vcpu_count: 1, smt: false, cpu_config: CpuConfiguration { - cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(), + cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(), msrs: BTreeMap::new(), }, }; @@ -946,7 +947,7 @@ mod tests { // Since `KVM_SET_CPUID2` has not been called before vcpu configuration, all leaves should // be filled with zero. Therefore, `KvmVcpu::dump_cpu_config()` should fail with CPUID type // conversion error due to the lack of brand string info in leaf 0x0. - let (_, vcpu, _) = setup_vcpu(0x10000); + let (_, _, vcpu, _) = setup_vcpu(0x10000); match vcpu.dump_cpu_config() { Err(KvmVcpuError::ConvertCpuidType(_)) => (), Err(err) => panic!("Unexpected error: {err}"), @@ -957,12 +958,12 @@ mod tests { #[test] fn test_dump_cpu_config_with_configured_vcpu() { // Test `dump_cpu_config()` after vcpu configuration. 
- let (vm, mut vcpu, vm_mem) = setup_vcpu(0x10000); + let (kvm, _, mut vcpu, vm_mem) = setup_vcpu(0x10000); let vcpu_config = VcpuConfig { vcpu_count: 1, smt: false, cpu_config: CpuConfiguration { - cpuid: Cpuid::try_from(vm.supported_cpuid().clone()).unwrap(), + cpuid: Cpuid::try_from(kvm.supported_cpuid.clone()).unwrap(), msrs: BTreeMap::new(), }, }; @@ -976,7 +977,7 @@ mod tests { fn test_is_tsc_scaling_required() { // Test `is_tsc_scaling_required` as if it were on the same // CPU model as the one in the snapshot state. - let (_vm, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); { // The frequency difference is within tolerance. @@ -1015,7 +1016,7 @@ mod tests { #[test] fn test_set_tsc() { - let (vm, vcpu, _) = setup_vcpu(0x1000); + let (kvm, _, vcpu, _) = setup_vcpu(0x1000); let mut state = vcpu.save_state().unwrap(); state.tsc_khz = Some( state.tsc_khz.unwrap() @@ -1024,9 +1025,9 @@ mod tests { * 2, ); - if vm.fd().check_extension(Cap::TscControl) { + if kvm.fd.check_extension(Cap::TscControl) { vcpu.set_tsc_khz(state.tsc_khz.unwrap()).unwrap(); - if vm.fd().check_extension(Cap::GetTscKhz) { + if kvm.fd.check_extension(Cap::GetTscKhz) { assert_eq!(vcpu.get_tsc_khz().ok(), state.tsc_khz); } else { vcpu.get_tsc_khz().unwrap_err(); @@ -1040,7 +1041,7 @@ mod tests { fn test_get_msrs_with_msrs_to_save() { // Test `get_msrs()` with the MSR indices that should be serialized into snapshots. // The MSR indices should be valid and this test should succeed. - let (_, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); vcpu.get_msrs(vcpu.msrs_to_save.iter().copied()).unwrap(); } @@ -1048,7 +1049,7 @@ mod tests { fn test_get_msrs_with_msrs_to_dump() { // Test `get_msrs()` with the MSR indices that should be dumped. // All the MSR indices should be valid and the call should succeed. 
- let (_, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); let kvm = kvm_ioctls::Kvm::new().unwrap(); let msrs_to_dump = crate::arch::x86_64::msr::get_msrs_to_dump(&kvm).unwrap(); @@ -1061,7 +1062,7 @@ mod tests { // Test `get_msrs()` with unsupported MSR indices. This should return `VcpuGetMsr` error // that happens when `KVM_GET_MSRS` fails to populate MSR values in the middle and exits. // Currently, MSR indices 2..=4 are not listed as supported MSRs. - let (_, vcpu, _) = setup_vcpu(0x1000); + let (_, _, vcpu, _) = setup_vcpu(0x1000); let msr_index_list: Vec = vec![2, 3, 4]; match vcpu.get_msrs(msr_index_list.iter().copied()) { Err(KvmVcpuError::VcpuGetMsr(_)) => (), @@ -1167,13 +1168,11 @@ mod tests { #[test] fn test_get_msr_chunks_preserved_order() { // Regression test for #4666 - - let kvm = Kvm::new().unwrap(); - let vm = Vm::new(Vec::new()).unwrap(); - let vcpu = KvmVcpu::new(0, &vm).unwrap(); + let (kvm, vm) = setup_vm(); + let vcpu = KvmVcpu::new(0, &vm, &kvm).unwrap(); // The list of supported MSR indices, in the order they were returned by KVM - let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm).unwrap(); + let msrs_to_save = kvm.msrs_to_save; // The MSRs after processing. The order should be identical to the one returned by KVM, with // the exception of deferred MSRs, which should be moved to the end (but show up in the same // order as they are listed in [`DEFERRED_MSRS`]. 
diff --git a/src/vmm/src/vstate/vm.rs b/src/vmm/src/vstate/vm.rs index d213d4d7bb6..1bcf191b8b9 100644 --- a/src/vmm/src/vstate/vm.rs +++ b/src/vmm/src/vstate/vm.rs @@ -10,21 +10,21 @@ use std::fmt; #[cfg(target_arch = "x86_64")] use kvm_bindings::{ - kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, CpuId, MsrList, - KVM_CLOCK_TSC_STABLE, KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, - KVM_MAX_CPUID_ENTRIES, KVM_PIT_SPEAKER_DUMMY, + kvm_clock_data, kvm_irqchip, kvm_pit_config, kvm_pit_state2, KVM_CLOCK_TSC_STABLE, + KVM_IRQCHIP_IOAPIC, KVM_IRQCHIP_PIC_MASTER, KVM_IRQCHIP_PIC_SLAVE, KVM_PIT_SPEAKER_DUMMY, }; -use kvm_bindings::{kvm_userspace_memory_region, KVM_API_VERSION, KVM_MEM_LOG_DIRTY_PAGES}; -use kvm_ioctls::{Kvm, VmFd}; +use kvm_bindings::{kvm_userspace_memory_region, KVM_MEM_LOG_DIRTY_PAGES}; +// use kvm_ioctls::{Kvm, VmFd}; +use kvm_ioctls::VmFd; use serde::{Deserialize, Serialize}; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::GICDevice; #[cfg(target_arch = "aarch64")] use crate::arch::aarch64::gic::GicState; -use crate::cpu_config::templates::KvmCapability; #[cfg(target_arch = "x86_64")] use crate::utils::u64_to_usize; +use crate::vstate::kvm::Kvm; use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; /// Errors associated with the wrappers over KVM ioctls. @@ -32,18 +32,6 @@ use crate::vstate::memory::{Address, GuestMemory, GuestMemoryMmap, GuestMemoryRe #[rustfmt::skip] #[derive(Debug, PartialEq, Eq, thiserror::Error, displaydoc::Display)] pub enum VmError { - /// The host kernel reports an invalid KVM API version: {0} - ApiVersion(i32), - /// Missing KVM capabilities: {0:#x?} - Capabilities(u32), - /** Error creating KVM object: {0} Make sure the user launching the firecracker process is \ - configured on the /dev/kvm file's ACL. 
*/ - Kvm(kvm_ioctls::Error), - #[cfg(target_arch = "x86_64")] - /// Failed to get MSR index list to save into snapshots: {0} - GetMsrsToSave(#[from] crate::arch::x86_64::msr::MsrError), - /// The number of configured slots is bigger than the maximum reported by KVM - NotEnoughMemorySlots, /// Cannot set the memory regions: {0} SetUserMemoryRegion(kvm_ioctls::Error), #[cfg(target_arch = "aarch64")] @@ -112,16 +100,6 @@ pub enum RestoreStateError { #[derive(Debug)] pub struct Vm { fd: VmFd, - max_memslots: usize, - - /// Additional capabilities that were specified in cpu template. - pub kvm_cap_modifiers: Vec, - - // X86 specific fields. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - supported_cpuid: CpuId, - #[cfg(target_arch = "x86_64")] - msrs_to_save: MsrList, // Arm specific fields. // On aarch64 we need to keep around the fd obtained by creating the VGIC device. @@ -131,79 +109,23 @@ pub struct Vm { /// Contains Vm functions that are usable across CPU architectures impl Vm { - /// Constructs a new `Vm` using the given `Kvm` instance. - pub fn new(kvm_cap_modifiers: Vec) -> Result { - let kvm = Kvm::new().map_err(VmError::Kvm)?; - - // Check that KVM has the correct version. - // Safe to cast because this is a constant. - #[allow(clippy::cast_possible_wrap)] - if kvm.get_api_version() != KVM_API_VERSION as i32 { - return Err(VmError::ApiVersion(kvm.get_api_version())); - } - - let total_caps = Self::combine_capabilities(&kvm_cap_modifiers); - // Check that all desired capabilities are supported. - Self::check_capabilities(&kvm, &total_caps).map_err(VmError::Capabilities)?; - - let max_memslots = kvm.get_nr_memslots(); + /// Create a new `Vm` struct. + pub fn new(kvm: &Kvm) -> Result { // Create fd for interacting with kvm-vm specific functions. 
- let vm_fd = kvm.create_vm().map_err(VmError::VmFd)?; + let vm_fd = kvm.fd.create_vm().map_err(VmError::VmFd)?; #[cfg(target_arch = "aarch64")] { Ok(Vm { fd: vm_fd, - max_memslots, - kvm_cap_modifiers, irqchip_handle: None, }) } #[cfg(target_arch = "x86_64")] { - let supported_cpuid = kvm - .get_supported_cpuid(KVM_MAX_CPUID_ENTRIES) - .map_err(VmError::VmFd)?; - let msrs_to_save = crate::arch::x86_64::msr::get_msrs_to_save(&kvm)?; - - Ok(Vm { - fd: vm_fd, - max_memslots, - kvm_cap_modifiers, - supported_cpuid, - msrs_to_save, - }) - } - } - - fn combine_capabilities(kvm_cap_modifiers: &[KvmCapability]) -> Vec { - let mut total_caps = Self::DEFAULT_CAPABILITIES.to_vec(); - for modifier in kvm_cap_modifiers.iter() { - match modifier { - KvmCapability::Add(cap) => { - if !total_caps.iter().any(|c| c == cap) { - total_caps.push(*cap); - } - } - KvmCapability::Remove(cap) => { - if let Some(pos) = total_caps.iter().position(|c| c == cap) { - total_caps.remove(pos); - } - } - } + Ok(Vm { fd: vm_fd }) } - total_caps - } - - fn check_capabilities(kvm: &Kvm, capabilities: &[u32]) -> Result<(), u32> { - for cap in capabilities { - // If capability is not supported kernel will return 0. - if kvm.check_extension_raw(u64::from(*cap)) == 0 { - return Err(*cap); - } - } - Ok(()) } /// Initializes the guest memory. 
@@ -212,9 +134,6 @@ impl Vm { guest_mem: &GuestMemoryMmap, track_dirty_pages: bool, ) -> Result<(), VmError> { - if guest_mem.num_regions() > self.max_memslots { - return Err(VmError::NotEnoughMemorySlots); - } self.set_kvm_memory_regions(guest_mem, track_dirty_pages)?; #[cfg(target_arch = "x86_64")] self.fd @@ -261,16 +180,6 @@ impl Vm { #[cfg(target_arch = "aarch64")] impl Vm { - const DEFAULT_CAPABILITIES: [u32; 7] = [ - kvm_bindings::KVM_CAP_IOEVENTFD, - kvm_bindings::KVM_CAP_IRQFD, - kvm_bindings::KVM_CAP_USER_MEMORY, - kvm_bindings::KVM_CAP_ARM_PSCI_0_2, - kvm_bindings::KVM_CAP_DEVICE_CTRL, - kvm_bindings::KVM_CAP_MP_STATE, - kvm_bindings::KVM_CAP_ONE_REG, - ]; - /// Creates the GIC (Global Interrupt Controller). pub fn setup_irqchip(&mut self, vcpu_count: u8) -> Result<(), VmError> { self.irqchip_handle = Some( @@ -292,7 +201,6 @@ impl Vm { .get_irqchip() .save_device(mpidrs) .map_err(VmError::SaveGic)?, - kvm_cap_modifiers: self.kvm_cap_modifiers.clone(), }) } @@ -319,39 +227,10 @@ impl Vm { pub struct VmState { /// GIC state. pub gic: GicState, - /// Additional capabilities that were specified in cpu template. - pub kvm_cap_modifiers: Vec, } #[cfg(target_arch = "x86_64")] impl Vm { - const DEFAULT_CAPABILITIES: [u32; 14] = [ - kvm_bindings::KVM_CAP_IRQCHIP, - kvm_bindings::KVM_CAP_IOEVENTFD, - kvm_bindings::KVM_CAP_IRQFD, - kvm_bindings::KVM_CAP_USER_MEMORY, - kvm_bindings::KVM_CAP_SET_TSS_ADDR, - kvm_bindings::KVM_CAP_PIT2, - kvm_bindings::KVM_CAP_PIT_STATE2, - kvm_bindings::KVM_CAP_ADJUST_CLOCK, - kvm_bindings::KVM_CAP_DEBUGREGS, - kvm_bindings::KVM_CAP_MP_STATE, - kvm_bindings::KVM_CAP_VCPU_EVENTS, - kvm_bindings::KVM_CAP_XCRS, - kvm_bindings::KVM_CAP_XSAVE, - kvm_bindings::KVM_CAP_EXT_CPUID, - ]; - - /// Returns a ref to the supported `CpuId` for this Vm. - pub fn supported_cpuid(&self) -> &CpuId { - &self.supported_cpuid - } - - /// Returns a ref to the list of serializable MSR indices. 
- pub fn msrs_to_save(&self) -> &MsrList { - &self.msrs_to_save - } - /// Restores the KVM VM state. /// /// # Errors @@ -431,7 +310,6 @@ impl Vm { pic_master, pic_slave, ioapic, - kvm_cap_modifiers: self.kvm_cap_modifiers.clone(), }) } } @@ -447,9 +325,6 @@ pub struct VmState { // TODO: rename this field to adopt inclusive language once Linux updates it, too. pic_slave: kvm_irqchip, ioapic: kvm_irqchip, - - /// Additional capabilities that were specified in cpu template. - pub kvm_cap_modifiers: Vec, } #[cfg(target_arch = "x86_64")] @@ -474,43 +349,30 @@ pub(crate) mod tests { use crate::vstate::memory::GuestMemoryMmap; // Auxiliary function being used throughout the tests. - pub(crate) fn setup_vm(mem_size: usize) -> (Vm, GuestMemoryMmap) { - let gm = single_region_mem(mem_size); + pub(crate) fn setup_vm() -> (Kvm, Vm) { + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + let vm = Vm::new(&kvm).expect("Cannot create new vm"); + (kvm, vm) + } - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + // Auxiliary function being used throughout the tests. + pub(crate) fn setup_vm_with_memory(mem_size: usize) -> (Kvm, Vm, GuestMemoryMmap) { + let (kvm, vm) = setup_vm(); + let gm = single_region_mem(mem_size); vm.memory_init(&gm, false).unwrap(); - - (vm, gm) + (kvm, vm, gm) } #[test] fn test_new() { // Testing with a valid /dev/kvm descriptor. - Vm::new(vec![]).unwrap(); - } - - #[test] - fn test_combine_capabilities() { - // Default caps for x86_64 and aarch64 both have KVM_CAP_IOEVENTFD and don't have - // KVM_CAP_IOMMU caps. 
- let additional_capabilities = vec![ - KvmCapability::Add(kvm_bindings::KVM_CAP_IOMMU), - KvmCapability::Remove(kvm_bindings::KVM_CAP_IOEVENTFD), - ]; - - let combined_caps = Vm::combine_capabilities(&additional_capabilities); - assert!(combined_caps - .iter() - .any(|c| *c == kvm_bindings::KVM_CAP_IOMMU)); - assert!(!combined_caps - .iter() - .any(|c| *c == kvm_bindings::KVM_CAP_IOEVENTFD)); + let kvm = Kvm::new(vec![]).expect("Cannot create Kvm"); + Vm::new(&kvm).unwrap(); } #[test] fn test_vm_memory_init() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); - + let (_, vm) = setup_vm(); // Create valid memory region and test that the initialization is successful. let gm = single_region_mem(0x1000); vm.memory_init(&gm, true).unwrap(); @@ -519,11 +381,11 @@ pub(crate) mod tests { #[cfg(target_arch = "x86_64")] #[test] fn test_vm_save_restore_state() { - let vm = Vm::new(vec![]).expect("new vm failed"); + let (_, vm) = setup_vm(); // Irqchips, clock and pitstate are not configured so trying to save state should fail. 
vm.save_state().unwrap_err(); - let (vm, _mem) = setup_vm(0x1000); + let (_, vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let vm_state = vm.save_state().unwrap(); @@ -536,7 +398,7 @@ pub(crate) mod tests { assert_eq!(vm_state.pic_slave.chip_id, KVM_IRQCHIP_PIC_SLAVE); assert_eq!(vm_state.ioapic.chip_id, KVM_IRQCHIP_IOAPIC); - let (mut vm, _mem) = setup_vm(0x1000); + let (_, mut vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); vm.restore_state(&vm_state).unwrap(); @@ -547,11 +409,11 @@ pub(crate) mod tests { fn test_vm_save_restore_state_bad_irqchip() { use kvm_bindings::KVM_NR_IRQCHIPS; - let (vm, _mem) = setup_vm(0x1000); + let (_, vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let mut vm_state = vm.save_state().unwrap(); - let (mut vm, _mem) = setup_vm(0x1000); + let (_, mut vm, _mem) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); // Try to restore an invalid PIC Master chip ID @@ -576,7 +438,7 @@ pub(crate) mod tests { fn test_vmstate_serde() { let mut snapshot_data = vec![0u8; 10000]; - let (mut vm, _) = setup_vm(0x1000); + let (_, mut vm, _) = setup_vm_with_memory(0x1000); vm.setup_irqchip().unwrap(); let state = vm.save_state().unwrap(); Snapshot::serialize(&mut snapshot_data.as_mut_slice(), &state).unwrap(); @@ -587,7 +449,7 @@ pub(crate) mod tests { #[test] fn test_set_kvm_memory_regions() { - let vm = Vm::new(vec![]).expect("Cannot create new vm"); + let (_, vm) = setup_vm(); let gm = single_region_mem(0x1000); let res = vm.set_kvm_memory_regions(&gm, false); diff --git a/tests/integration_tests/functional/test_snapshot_basic.py b/tests/integration_tests/functional/test_snapshot_basic.py index 4030bb4e981..875ef77dbaf 100644 --- a/tests/integration_tests/functional/test_snapshot_basic.py +++ b/tests/integration_tests/functional/test_snapshot_basic.py @@ -5,6 +5,7 @@ import filecmp import logging import os +import platform import re import shutil import time @@ -12,8 
+13,11 @@ import pytest +import host_tools.cargo_build as host import host_tools.drive as drive_tools +from framework import utils from framework.microvm import SnapshotType +from framework.properties import global_props from framework.utils import check_filesystem, check_output from framework.utils_vsock import ( ECHO_SERVER_PORT, @@ -540,3 +544,56 @@ def test_vmgenid(guest_kernel_linux_6_1, rootfs, microvm_factory, snapshot_type) # Update the base for next iteration base_snapshot = snapshot + + +# TODO add `global_props.host_os == "amzn2"` condition +# once amazon linux kernels have patches. +@pytest.mark.skipif( + platform.machine() != "aarch64" or global_props.host_linux_version_tpl < (6, 4), + reason="This is aarch64 specific test and should only be run on 6.4 and later kernels", +) +def test_physical_counter_reset_aarch64(uvm_nano): + """ + Test that the CNTPCT_EL0 register is reset on VM boot. + We assume the smallest VM will not consume more than + some MAX_VALUE cycles to be created and snapshotted. + The MAX_VALUE is selected by doing a manual run of this test and + seeing what the actual counter value is. The assumption here is that + if resetting does not occur, the guest counter value will be huge as it + will be a copy of the host value. The host value, in turn, will be huge because + it will include host OS boot + CI prep + other CI tests ... + """ + vm = uvm_nano + vm.add_net_iface() + vm.start() + + snapshot = vm.snapshot_full() + vm.kill() + snap_editor = host.get_binary("snapshot-editor") + + cntpct_el0 = hex(0x603000000013DF01) + # If a CPU runs at 3GHz, it will have a counter value of 1_000_000_000 + # in 1/3 of a second. The host surely will run for more than 1/3 second before + # executing this test. 
+ max_value = 800_000_000 + + cmd = [ + str(snap_editor), + "info-vmstate", + "vcpu-states", + "--vmstate-path", + str(snapshot.vmstate), + ] + _, stdout, _ = utils.check_output(cmd) + + # The output will look like this: + # kvm_mp_state: 0x0 + # mpidr: 0x80000000 + # 0x6030000000100000 0x0000000e0 + # 0x6030000000100002 0xffff00fe33c0 + for line in stdout.splitlines(): + parts = line.split() + if len(parts) == 2: + reg_id, reg_value = parts + if reg_id == cntpct_el0: + assert int(reg_value, 16) < max_value