From 13f287d577d7fa5c23daa0c50898a49314e6baff Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 18 Mar 2025 22:47:31 +0800 Subject: [PATCH 01/20] [feat] bring host VM concept into x86_vcpu, do compile --- src/context.rs | 232 ++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 4 + src/msr.rs | 5 + src/regs.rs | 13 +++ src/segmentation.rs | 120 +++++++++++++++++++++++ src/vmx/vcpu.rs | 107 ++++++++++++++++++-- 6 files changed, 471 insertions(+), 10 deletions(-) create mode 100644 src/context.rs create mode 100644 src/segmentation.rs diff --git a/src/context.rs b/src/context.rs new file mode 100644 index 0000000..3d20593 --- /dev/null +++ b/src/context.rs @@ -0,0 +1,232 @@ +use x86::{segmentation, task}; +use x86_64::VirtAddr; +use x86_64::instructions::tables::{lgdt, lidt, sidt}; +use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr3Flags, Cr4, Cr4Flags}; +use x86_64::structures::DescriptorTablePointer; +use x86_64::{addr::PhysAddr, structures::paging::PhysFrame}; + +use crate::msr::Msr; +use crate::regs::GeneralRegisters; +use crate::segmentation::{Segment, SegmentAccessRights}; + +const SAVED_LINUX_REGS: usize = 8; + +#[derive(Debug, Clone, Copy)] +pub struct LinuxContext { + pub rsp: u64, + pub rip: u64, + + pub r15: u64, + pub r14: u64, + pub r13: u64, + pub r12: u64, + pub rbx: u64, + pub rbp: u64, + + pub es: Segment, + pub cs: Segment, + pub ss: Segment, + pub ds: Segment, + pub fs: Segment, + pub gs: Segment, + pub tss: Segment, + pub gdt: DescriptorTablePointer, + pub idt: DescriptorTablePointer, + + pub cr0: Cr0Flags, + pub cr3: u64, + pub cr4: Cr4Flags, + + pub efer: u64, + pub star: u64, + pub lstar: u64, + pub cstar: u64, + pub fmask: u64, + pub kernel_gsbase: u64, + pub pat: u64, + pub mtrr_def_type: u64, +} + +unsafe impl Send for LinuxContext {} +unsafe impl Sync for LinuxContext {} + +impl Default for LinuxContext { + fn default() -> Self { + Self { + rsp: 0, + rip: 0, + r15: 0, + r14: 0, + r13: 0, + r12: 0, + rbx: 0, + rbp: 0, + es: Segment::invalid(), + cs: Segment::invalid(), + ss: Segment::invalid(), + ds: Segment::invalid(), + fs: Segment::invalid(), + gs: Segment::invalid(), + tss: Segment::invalid(), + gdt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + idt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + cr0: Cr0Flags::empty(), + cr3: 0, + cr4: Cr4Flags::empty(), + efer: 0, + star: 0, + lstar: 0, + cstar: 0, + fmask: 0, + kernel_gsbase: 0, + pat: 0, + mtrr_def_type: 0, + } + } +} + +fn sgdt() -> DescriptorTablePointer { + let mut gdt = DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }; + unsafe { + core::arch::asm!("sgdt [{0}]", in(reg) &mut gdt, options(nostack, preserves_flags)); + } + gdt +} + +impl LinuxContext { + /// Load linux callee-saved registers from the stack, and other system registers. + pub fn load_from(linux_sp: usize) -> Self { + let regs = unsafe { core::slice::from_raw_parts(linux_sp as *const u64, SAVED_LINUX_REGS) }; + let gdt = sgdt(); + let idt = sidt(); + + let mut fs = Segment::from_selector(x86::segmentation::fs(), &gdt); + let mut gs = Segment::from_selector(x86::segmentation::gs(), &idt); + fs.base = Msr::IA32_FS_BASE.read(); + gs.base = regs[0]; + + Self { + rsp: regs.as_ptr_range().end as _, + r15: regs[1], + r14: regs[2], + r13: regs[3], + r12: regs[4], + rbx: regs[5], + rbp: regs[6], + rip: regs[7], + es: Segment::from_selector(segmentation::es(), &gdt), + cs: Segment::from_selector(segmentation::cs(), &gdt), + ss: Segment::from_selector(segmentation::ss(), &gdt), + ds: Segment::from_selector(segmentation::ds(), &gdt), + fs, + gs, + tss: Segment::from_selector(unsafe { task::tr() }, &gdt), + gdt, + idt, + cr0: Cr0::read(), + cr3: Cr3::read().0.start_address().as_u64(), + cr4: Cr4::read(), + efer: Msr::IA32_EFER.read(), + star: Msr::IA32_STAR.read(), + lstar: Msr::IA32_LSTAR.read(), + cstar: Msr::IA32_CSTAR.read(), + fmask: Msr::IA32_FMASK.read(), + kernel_gsbase: Msr::IA32_KERNEL_GSBASE.read(), + pat: Msr::IA32_PAT.read(), + mtrr_def_type: Msr::IA32_MTRR_DEF_TYPE.read(), + } + } + + /// Restore system registers. + pub fn restore(&self) { + unsafe { + Msr::IA32_EFER.write(self.efer); + Msr::IA32_STAR.write(self.star); + Msr::IA32_LSTAR.write(self.lstar); + Msr::IA32_CSTAR.write(self.cstar); + Msr::IA32_FMASK.write(self.fmask); + Msr::IA32_KERNEL_GSBASE.write(self.kernel_gsbase); + Msr::IA32_PAT.write(self.pat); + + Cr0::write(self.cr0); + Cr4::write(self.cr4); + // cr3 must be last in case cr4 enables PCID + Cr3::write( + PhysFrame::containing_address(PhysAddr::new(self.cr3)), + Cr3Flags::empty(), // clear PCID + ); + } + + // Copy Linux TSS descriptor into our GDT, clearing the busy flag, + // then reload TR from it. We can't use Linux' GDT as it is r/o. + + let hv_gdt = sgdt(); + let entry_count = (hv_gdt.limit as usize + 1) / size_of::(); + + let hv_gdt_table: &mut [u64] = + unsafe { core::slice::from_raw_parts_mut(hv_gdt.base.as_mut_ptr(), entry_count) }; + + // let mut hv_gdt = GdtStruct::from_pointer(&GdtStruct::sgdt()); + + let linux_gdt = &self.gdt; + let entry_count = (linux_gdt.limit as usize + 1) / size_of::(); + let linux_gdt_table = + unsafe { core::slice::from_raw_parts(linux_gdt.base.as_mut_ptr(), entry_count) }; + + // let liunx_gdt = GdtStruct::from_pointer(&self.gdt); + let tss_idx = self.tss.selector.index() as usize; + hv_gdt_table[tss_idx] = linux_gdt_table[tss_idx]; + hv_gdt_table[tss_idx + 1] = linux_gdt_table[tss_idx + 1]; + // hv_gdt.load_tss(self.tss.selector); + + SegmentAccessRights::set_descriptor_type( + &mut hv_gdt_table[self.tss.selector.index() as usize], + SegmentAccessRights::TSS_AVAIL, + ); + unsafe { + task::load_tr(self.tss.selector); + lgdt(&self.gdt); + lidt(&self.idt); + + segmentation::load_es(self.es.selector); + segmentation::load_cs(self.cs.selector); + segmentation::load_ss(self.ss.selector); + segmentation::load_ds(self.ds.selector); + segmentation::load_fs(self.fs.selector); + segmentation::load_gs(self.gs.selector); + + Msr::IA32_FS_BASE.write(self.fs.base); + } + } + + /// Restore linux general-purpose registers and stack, then return back to linux. + pub fn return_to_linux(&self, guest_regs: &GeneralRegisters) -> ! { + unsafe { + Msr::IA32_GS_BASE.write(self.gs.base); + core::arch::asm!( + "mov rsp, {linux_rsp}", + "push {linux_rip}", + "mov rcx, rsp", + "mov rsp, {guest_regs}", + "mov [rsp + {guest_regs_size}], rcx", + restore_regs_from_stack!(), + "pop rsp", + "ret", + linux_rsp = in(reg) self.rsp, + linux_rip = in(reg) self.rip, + guest_regs = in(reg) guest_regs, + guest_regs_size = const core::mem::size_of::(), + options(noreturn), + ); + } + } +} diff --git a/src/lib.rs b/src/lib.rs index faab4b7..d715285 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,9 @@ pub(crate) mod regs; mod ept; mod frame; +mod context; +mod segmentation; + cfg_if::cfg_if! { if #[cfg(feature = "vmx")] { mod vmx; @@ -26,6 +29,7 @@ cfg_if::cfg_if! { } } +pub use context::LinuxContext; pub use ept::GuestPageWalkInfo; pub use regs::GeneralRegisters; pub use vender::has_hardware_support; diff --git a/src/msr.rs b/src/msr.rs index 09e4463..ac105e4 100644 --- a/src/msr.rs +++ b/src/msr.rs @@ -7,7 +7,12 @@ use x86::msr::{rdmsr, wrmsr}; pub enum Msr { IA32_FEATURE_CONTROL = 0x3a, + IA32_SYSENTER_CS = 0x174, + IA32_SYSENTER_ESP = 0x175, + IA32_SYSENTER_EIP = 0x176, + IA32_PAT = 0x277, + IA32_MTRR_DEF_TYPE = 0x2ff, IA32_VMX_BASIC = 0x480, IA32_VMX_PINBASED_CTLS = 0x481, diff --git a/src/regs.rs b/src/regs.rs index 2039c77..19bc9ee 100644 --- a/src/regs.rs +++ b/src/regs.rs @@ -40,6 +40,19 @@ pub struct GeneralRegisters { } impl GeneralRegisters { + pub fn from_context(context: &crate::context::LinuxContext) -> Self { + Self { + rax: 0, + rbx: context.rbx, + rbp: context.rbp, + r12: context.r12, + r13: context.r13, + r14: context.r14, + r15: context.r15, + ..Default::default() + } + } + /// Returns the value of the general-purpose register corresponding to the given index. /// /// The mapping of indices to registers is as follows: diff --git a/src/segmentation.rs b/src/segmentation.rs new file mode 100644 index 0000000..c068d86 --- /dev/null +++ b/src/segmentation.rs @@ -0,0 +1,120 @@ +use bit_field::BitField; +use bitflags::bitflags; +// use x86::dtables::DescriptorTablePointer; +use x86::segmentation::SegmentSelector; +use x86_64::structures::DescriptorTablePointer; +use x86_64::structures::gdt::DescriptorFlags; + +bitflags! { + /// Access rights for VMCS guest register states. + /// + /// The low 16 bits correspond to bits 23:8 of the upper 32 bits of a 64-bit + /// segment descriptor. See Volume 3, Section 24.4.1 for access rights format, + /// Volume 3, Section 3.4.5.1 for valid non-system selector types, Volume 3, + /// Section 3.5 for valid system selectors types. + #[derive(Debug, Clone, Copy)] + pub struct SegmentAccessRights: u32 { + /// Accessed flag. + const ACCESSED = 1 << 0; + /// For data segments, this flag sets the segment as writable. For code + /// segments, this flag sets the segment as readable. + const WRITABLE = 1 << 1; + /// For data segments, this flag marks a data segment as “expansion-direction”. + /// For code segments, this flag marks a code segment as “conforming”. + const CONFORMING = 1 << 2; + /// This flag must be set for code segments. + const EXECUTABLE = 1 << 3; + /// S — Descriptor type (0 = system; 1 = code or data) + const CODE_DATA = 1 << 4; + /// P — Segment present + const PRESENT = 1 << 7; + /// L - Reserved (except for CS) or 64-bit mode active (for CS only) + const LONG_MODE = 1 << 13; + /// D/B — Default operation size (0 = 16-bit segment; 1 = 32-bit segment) + const DB = 1 << 14; + /// G — Granularity + const GRANULARITY = 1 << 15; + /// Segment unusable (0 = usable; 1 = unusable) + const UNUSABLE = 1 << 16; + + /// TSS (Available) for 32/64-bit + const TSS_AVAIL = 0b1001; + /// TSS (Busy) for 32/64-bit + const TSS_BUSY = 0b1011; + + /// Descriptor privilege level (User) + const DPL_USER = 3 << 5; + } +} + +impl SegmentAccessRights { + #[allow(dead_code)] + pub fn dpl(&self) -> u8 { + self.bits().get_bits(5..=6) as _ + } + + pub fn from_descriptor(desc: u64) -> Self { + Self::from_bits_truncate(desc.get_bits(40..56) as u32 & 0xf0ff) + } + + pub fn _type_field(&self) -> Self { + Self::from_bits_truncate(self.bits() & 0xf) + } + + pub fn set_descriptor_type(desc: &mut u64, type_field: Self) { + desc.set_bits(40..44, type_field.bits() as u64); + } + + #[cfg(feature = "amd")] + pub fn as_svm_segment_attributes(&self) -> u16 { + let bits = self.bits() as u16; + (bits & 0xff) | ((bits & 0xf000) >> 4) + } +} + +#[derive(Debug, Clone, Copy)] +pub struct Segment { + pub selector: SegmentSelector, + pub base: u64, + pub limit: u32, + pub access_rights: SegmentAccessRights, +} + +impl Segment { + pub const fn invalid() -> Self { + Self { + selector: SegmentSelector::empty(), + base: 0, + limit: 0, + access_rights: SegmentAccessRights::UNUSABLE, + } + } + + pub fn from_selector(selector: SegmentSelector, dt: &DescriptorTablePointer) -> Self { + let index = selector.index() as usize; + let entry_count = (dt.limit as usize + 1) / size_of::(); + let table = unsafe { core::slice::from_raw_parts(dt.base.as_mut_ptr(), entry_count) }; + + let entry_value = table[index]; + let entry = DescriptorFlags::from_bits_truncate(entry_value); + if entry.contains(DescriptorFlags::PRESENT) { + let mut base = entry_value.get_bits(16..40) | entry_value.get_bits(56..64) << 24; + let mut limit = entry_value.get_bits(0..16) | entry_value.get_bits(48..52) << 16; + if !entry.contains(DescriptorFlags::USER_SEGMENT) { + let high = table[index + 1]; + base += high << 32; + } + if entry.contains(DescriptorFlags::GRANULARITY) { + limit = (limit << 12) | 0xfff; + } + Self { + selector, + base, + limit: limit as _, + access_rights: SegmentAccessRights::from_descriptor(entry_value), + } + } else { + Self::invalid() + } + } +} diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index dfb1b3e..7051eef 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -16,11 +16,14 @@ use axvcpu::{AccessWidth, AxArchVCpu, AxVCpuExitReason, AxVCpuHal}; use super::VmxExitInfo; use super::as_axerr; use super::definitions::VmxExitReason; +use super::read_vmcs_revision_id; use super::structs::{IOBitmap, MsrBitmap, VmxRegion}; use super::vmcs::{ self, VmcsControl32, VmcsControl64, VmcsControlNW, VmcsGuest16, VmcsGuest32, VmcsGuest64, VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, }; +use crate::LinuxContext; +use crate::segmentation::Segment; use crate::{ept::GuestPageWalkInfo, msr::Msr, regs::GeneralRegisters}; const VMX_PREEMPTION_TIMER_SET_VALUE: u32 = 1_000_000; @@ -155,27 +158,34 @@ pub struct VmxVcpu { xstate: XState, entry: Option, ept_root: Option, - // is_host: bool, temporary removed because we don't care about type 1.5 now + host_ctx: Option, } impl VmxVcpu { /// Create a new [`VmxVcpu`]. - pub fn new() -> AxResult { - let vmcs_revision_id = super::read_vmcs_revision_id(); + pub fn new(ctx: Option) -> AxResult { let vcpu = Self { - guest_regs: GeneralRegisters::default(), + guest_regs: if let Some(ctx) = ctx { + GeneralRegisters::from_context(&ctx) + } else { + GeneralRegisters::default() + }, host_stack_top: 0, launched: false, - vmcs: VmxRegion::new(vmcs_revision_id, false)?, + vmcs: VmxRegion::new(read_vmcs_revision_id(), false)?, io_bitmap: IOBitmap::passthrough_all()?, msr_bitmap: MsrBitmap::passthrough_all()?, pending_events: VecDeque::with_capacity(8), xstate: XState::new(), entry: None, ept_root: None, - // is_host: false, + host_ctx: ctx, }; - info!("[HV] created VmxVcpu(vmcs: {:#x})", vcpu.vmcs.phys_addr()); + info!( + "[HV] created {} VmxVcpu(vmcs: {:#x})", + if ctx.is_some() { "Host" } else { "Guest" }, + vcpu.vmcs.phys_addr() + ); Ok(vcpu) } @@ -501,13 +511,20 @@ impl VmxVcpu { } fn setup_vmcs(&mut self, entry: GuestPhysAddr, ept_root: HostPhysAddr) -> AxResult { + // If self.host_ctx.is_none(), it means this is a vcpu for guest VM. + let is_guest = self.host_ctx.is_none(); let paddr = self.vmcs.phys_addr().as_usize() as u64; unsafe { vmx::vmclear(paddr).map_err(as_axerr)?; } self.bind_to_current_processor()?; - self.setup_vmcs_guest(entry)?; - self.setup_vmcs_control(ept_root, true)?; + if is_guest { + self.setup_vmcs_guest(entry)?; + } else { + self.setup_vmcs_guest_from_ctx()?; + } + + self.setup_vmcs_control(ept_root, is_guest)?; self.unbind_from_current_processor()?; Ok(()) } @@ -549,6 +566,70 @@ impl VmxVcpu { Ok(()) } + /// Indeed, this function can be combined with `setup_vmcs_guest`, + /// to avoid complexity and minimize the modification, + /// we just keep them separated. + fn setup_vmcs_guest_from_ctx(&mut self) -> AxResult { + let linux = self.host_ctx.expect("Host context is not set"); + + self.set_cr(0, linux.cr0.bits()); + self.set_cr(4, linux.cr4.bits()); + self.set_cr(3, linux.cr3); + + macro_rules! set_guest_segment { + ($seg: expr, $reg: ident) => {{ + use VmcsGuest16::*; + use VmcsGuest32::*; + use VmcsGuestNW::*; + concat_idents!($reg, _SELECTOR).write($seg.selector.bits())?; + concat_idents!($reg, _BASE).write($seg.base as _)?; + concat_idents!($reg, _LIMIT).write($seg.limit)?; + concat_idents!($reg, _ACCESS_RIGHTS).write($seg.access_rights.bits())?; + }}; + } + + set_guest_segment!(linux.es, ES); + set_guest_segment!(linux.cs, CS); + set_guest_segment!(linux.ss, SS); + set_guest_segment!(linux.ds, DS); + set_guest_segment!(linux.fs, FS); + set_guest_segment!(linux.gs, GS); + set_guest_segment!(linux.tss, TR); + set_guest_segment!(Segment::invalid(), LDTR); + + VmcsGuestNW::GDTR_BASE.write(linux.gdt.base.as_u64() as _)?; + VmcsGuest32::GDTR_LIMIT.write(linux.gdt.limit as _)?; + VmcsGuestNW::IDTR_BASE.write(linux.idt.base.as_u64() as _)?; + VmcsGuest32::IDTR_LIMIT.write(linux.idt.limit as _)?; + + debug!( + "this is the linux rip: {:#x} rsp:{:#x}", + linux.rip, linux.rsp + ); + VmcsGuestNW::RSP.write(linux.rsp as _)?; + VmcsGuestNW::RIP.write(linux.rip as _)?; + VmcsGuestNW::RFLAGS.write(0x2)?; + + VmcsGuest32::IA32_SYSENTER_CS.write(Msr::IA32_SYSENTER_CS.read() as _)?; + VmcsGuestNW::IA32_SYSENTER_ESP.write(Msr::IA32_SYSENTER_ESP.read() as _)?; + VmcsGuestNW::IA32_SYSENTER_EIP.write(Msr::IA32_SYSENTER_EIP.read() as _)?; + + VmcsGuestNW::DR7.write(0x400)?; + VmcsGuest64::IA32_DEBUGCTL.write(0)?; + + VmcsGuest32::ACTIVITY_STATE.write(0)?; + VmcsGuest32::INTERRUPTIBILITY_STATE.write(0)?; + VmcsGuestNW::PENDING_DBG_EXCEPTIONS.write(0)?; + + VmcsGuest64::LINK_PTR.write(u64::MAX)?; + VmcsGuest32::VMX_PREEMPTION_TIMER_VALUE.write(0)?; + + VmcsGuest64::IA32_PAT.write(linux.pat)?; + VmcsGuest64::IA32_EFER.write(linux.efer)?; + + Ok(()) + } + fn setup_vmcs_guest(&mut self, entry: GuestPhysAddr) -> AxResult { let cr0_val: Cr0Flags = Cr0Flags::NOT_WRITE_THROUGH | Cr0Flags::CACHE_DISABLE | Cr0Flags::EXTENSION_TYPE; @@ -1146,8 +1227,14 @@ impl AxArchVCpu for VmxVcpu { type SetupConfig = (); + type HostConfig = crate::context::LinuxContext; + fn new(_config: Self::CreateConfig) -> AxResult { - Self::new() + Self::new(None) + } + + fn new_host(config: Self::HostConfig) -> AxResult { + Self::new(Some(config)) } fn set_entry(&mut self, entry: GuestPhysAddr) -> AxResult { From 919173ea5c7d60123e4b1a62ae9ad4ce7bd212ab Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 20 Mar 2025 22:19:43 +0800 Subject: [PATCH 02/20] [wip] support type15 in axvisor, problem in xstate --- src/context.rs | 14 +++++------ src/lib.rs | 2 ++ src/segmentation.rs | 6 ++--- src/vmx/percpu.rs | 3 ++- src/vmx/vcpu.rs | 61 +++++++++++++++++++++++++++++++++++++-------- src/xstate.rs | 32 ++++++++++++++++++++++++ 6 files changed, 97 insertions(+), 21 deletions(-) create mode 100644 src/xstate.rs diff --git a/src/context.rs b/src/context.rs index 3d20593..78243f9 100644 --- a/src/context.rs +++ b/src/context.rs @@ -8,6 +8,7 @@ use x86_64::{addr::PhysAddr, structures::paging::PhysFrame}; use crate::msr::Msr; use crate::regs::GeneralRegisters; use crate::segmentation::{Segment, SegmentAccessRights}; +use crate::xstate::XState; const SAVED_LINUX_REGS: usize = 8; @@ -45,6 +46,8 @@ pub struct LinuxContext { pub kernel_gsbase: u64, pub pat: u64, pub mtrr_def_type: u64, + + pub xstate: XState, } unsafe impl Send for LinuxContext {} @@ -87,6 +90,7 @@ impl Default for LinuxContext { kernel_gsbase: 0, pat: 0, mtrr_def_type: 0, + xstate: XState::default(), } } } @@ -107,10 +111,9 @@ impl LinuxContext { pub fn load_from(linux_sp: usize) -> Self { let regs = unsafe { core::slice::from_raw_parts(linux_sp as *const u64, SAVED_LINUX_REGS) }; let gdt = sgdt(); - let idt = sidt(); let mut fs = Segment::from_selector(x86::segmentation::fs(), &gdt); - let mut gs = Segment::from_selector(x86::segmentation::gs(), &idt); + let mut gs = Segment::from_selector(x86::segmentation::gs(), &gdt); fs.base = Msr::IA32_FS_BASE.read(); gs.base = regs[0]; @@ -131,7 +134,7 @@ impl LinuxContext { gs, tss: Segment::from_selector(unsafe { task::tr() }, &gdt), gdt, - idt, + idt: sidt(), cr0: Cr0::read(), cr3: Cr3::read().0.start_address().as_u64(), cr4: Cr4::read(), @@ -143,6 +146,7 @@ impl LinuxContext { kernel_gsbase: Msr::IA32_KERNEL_GSBASE.read(), pat: Msr::IA32_PAT.read(), mtrr_def_type: Msr::IA32_MTRR_DEF_TYPE.read(), + xstate: XState::new(), } } @@ -175,18 +179,14 @@ impl LinuxContext { let hv_gdt_table: &mut [u64] = unsafe { core::slice::from_raw_parts_mut(hv_gdt.base.as_mut_ptr(), entry_count) }; - // let mut hv_gdt = GdtStruct::from_pointer(&GdtStruct::sgdt()); - let linux_gdt = &self.gdt; let entry_count = (linux_gdt.limit as usize + 1) / size_of::(); let linux_gdt_table = unsafe { core::slice::from_raw_parts(linux_gdt.base.as_mut_ptr(), entry_count) }; - // let liunx_gdt = GdtStruct::from_pointer(&self.gdt); let tss_idx = self.tss.selector.index() as usize; hv_gdt_table[tss_idx] = linux_gdt_table[tss_idx]; hv_gdt_table[tss_idx + 1] = linux_gdt_table[tss_idx + 1]; - // hv_gdt.load_tss(self.tss.selector); SegmentAccessRights::set_descriptor_type( &mut hv_gdt_table[self.tss.selector.index() as usize], diff --git a/src/lib.rs b/src/lib.rs index d715285..e7e92dd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,6 +17,8 @@ mod frame; mod context; mod segmentation; +// mod tables; +mod xstate; cfg_if::cfg_if! { if #[cfg(feature = "vmx")] { diff --git a/src/segmentation.rs b/src/segmentation.rs index c068d86..4853cf2 100644 --- a/src/segmentation.rs +++ b/src/segmentation.rs @@ -90,10 +90,10 @@ impl Segment { } } - pub fn from_selector(selector: SegmentSelector, dt: &DescriptorTablePointer) -> Self { + pub fn from_selector(selector: SegmentSelector, gdt: &DescriptorTablePointer) -> Self { let index = selector.index() as usize; - let entry_count = (dt.limit as usize + 1) / size_of::(); - let table = unsafe { core::slice::from_raw_parts(dt.base.as_mut_ptr(), entry_count) }; + let entry_count = (gdt.limit as usize + 1) / size_of::(); + let table = unsafe { core::slice::from_raw_parts(gdt.base.as_mut_ptr(), entry_count) }; let entry_value = table[index]; let entry = DescriptorFlags::from_bits_truncate(entry_value); diff --git a/src/vmx/percpu.rs b/src/vmx/percpu.rs index 6e1e743..04b6334 100644 --- a/src/vmx/percpu.rs +++ b/src/vmx/percpu.rs @@ -8,6 +8,7 @@ use memory_addr::PAGE_SIZE_4K as PAGE_SIZE; use crate::msr::Msr; use crate::vmx::has_hardware_support; use crate::vmx::structs::{FeatureControl, FeatureControlFlags, VmxBasic, VmxRegion}; +use crate::xstate::XState; /// Represents the per-CPU state for Virtual Machine Extensions (VMX). /// @@ -49,7 +50,7 @@ impl AxArchPerCpu for VmxPerCpuState { } // Enable XSAVE/XRSTOR. - super::vcpu::XState::enable_xsave(); + XState::enable_xsave(); // Enable VMXON, if required. let ctrl = FeatureControl::read(); diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 7051eef..8bf06a6 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -20,10 +20,11 @@ use super::read_vmcs_revision_id; use super::structs::{IOBitmap, MsrBitmap, VmxRegion}; use super::vmcs::{ self, VmcsControl32, VmcsControl64, VmcsControlNW, VmcsGuest16, VmcsGuest32, VmcsGuest64, - VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, + VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, interrupt_exit_info, }; use crate::LinuxContext; use crate::segmentation::Segment; +use crate::xstate::XState; use crate::{ept::GuestPageWalkInfo, msr::Msr, regs::GeneralRegisters}; const VMX_PREEMPTION_TIMER_SET_VALUE: u32 = 1_000_000; @@ -176,15 +177,20 @@ impl VmxVcpu { io_bitmap: IOBitmap::passthrough_all()?, msr_bitmap: MsrBitmap::passthrough_all()?, pending_events: VecDeque::with_capacity(8), - xstate: XState::new(), + xstate: if let Some(ctx) = ctx { + ctx.xstate + } else { + XState::new() + }, entry: None, ept_root: None, host_ctx: ctx, }; - info!( - "[HV] created {} VmxVcpu(vmcs: {:#x})", + debug!( + "[HV] created {} VmxVcpu(vmcs: {:#x}) xstate {:#x?}", if ctx.is_some() { "Host" } else { "Guest" }, - vcpu.vmcs.phys_addr() + vcpu.vmcs.phys_addr(), + vcpu.xstate ); Ok(vcpu) } @@ -202,7 +208,7 @@ impl VmxVcpu { /// Bind this [`VmxVcpu`] to current logical processor. pub fn bind_to_current_processor(&self) -> AxResult { - debug!( + trace!( "VmxVcpu bind to current processor vmcs @ {:#x}", self.vmcs.phys_addr() ); @@ -215,7 +221,7 @@ impl VmxVcpu { /// Unbind this [`VmxVcpu`] from current logical processor. pub fn unbind_from_current_processor(&self) -> AxResult { - debug!( + trace!( "VmxVcpu unbind from current processor vmcs @ {:#x}", self.vmcs.phys_addr() ); @@ -270,7 +276,7 @@ impl VmxVcpu { // Handle vm-exits let exit_info = self.exit_info().unwrap(); - // debug!("VM exit: {:#x?}", exit_info); + debug!("VM exit: {:#x?}", exit_info); match self.builtin_vmexit_handler(&exit_info) { Some(result) => { @@ -572,6 +578,12 @@ impl VmxVcpu { fn setup_vmcs_guest_from_ctx(&mut self) -> AxResult { let linux = self.host_ctx.expect("Host context is not set"); + warn!("Linux context: {:#x?}", linux); + + warn!("self xstate {:#x?}", self.xstate); + + warn!("current xstate {:#x?}", XState::new()); + self.set_cr(0, linux.cr0.bits()); self.set_cr(4, linux.cr4.bits()); self.set_cr(3, linux.cr3); @@ -718,7 +730,10 @@ impl VmxVcpu { // Enable EPT, RDTSCP, INVPCID, and unrestricted guest. use SecondaryControls as CpuCtrl2; - let mut val = CpuCtrl2::ENABLE_EPT | CpuCtrl2::UNRESTRICTED_GUEST; + let mut val = CpuCtrl2::ENABLE_EPT + | CpuCtrl2::UNRESTRICTED_GUEST + | CpuCtrl2::ENABLE_USER_WAIT_PAUSE + | CpuCtrl2::ENABLE_VM_FUNCTIONS; if let Some(features) = raw_cpuid.get_extended_processor_and_feature_identifiers() { if features.has_rdtscp() { val |= CpuCtrl2::ENABLE_RDTSCP; @@ -784,9 +799,15 @@ impl VmxVcpu { VmcsControl32::VMEXIT_MSR_LOAD_COUNT.write(0)?; VmcsControl32::VMENTRY_MSR_LOAD_COUNT.write(0)?; - // VmcsControlNW::CR4_GUEST_HOST_MASK.write(0)?; + // TODO: figure out why we mask it. + VmcsControlNW::CR4_GUEST_HOST_MASK.write(0)?; VmcsControl32::CR3_TARGET_COUNT.write(0)?; + // 25.6.14 VM-Function Controls + // Table 25-10. Definitions of VM-Function Controls + // Bit 0: EPTP switching + VmcsControl64::VM_FUNCTION_CONTROLS.write(0b1)?; + // Pass-through exceptions (except #UD(6)), don't use I/O bitmap, set MSR bitmaps. let exception_bitmap: u32 = 1 << 6; @@ -982,6 +1003,7 @@ impl VmxVcpu { VmxExitReason::XSETBV => Some(self.handle_xsetbv()), VmxExitReason::CR_ACCESS => Some(self.handle_cr()), VmxExitReason::CPUID => Some(self.handle_cpuid()), + VmxExitReason::EXCEPTION_NMI => Some(self.handle_exception_nmi(exit_info)), _ => None, } } @@ -996,6 +1018,25 @@ impl VmxVcpu { Ok(()) } + fn handle_exception_nmi(&mut self, exit_info: &VmxExitInfo) -> AxResult { + let intr_info = interrupt_exit_info()?; + info!( + "VM exit: Exception or NMI @ RIP({:#x}, {}): {:#x?}", + exit_info.guest_rip, exit_info.exit_instruction_length, intr_info + ); + + const NON_MASKABLE_INTERRUPT: u8 = 2; + + match intr_info.vector { + // ExceptionType::NonMaskableInterrupt + NON_MASKABLE_INTERRUPT => unsafe { + core::arch::asm!("int {}", const NON_MASKABLE_INTERRUPT) + }, + v => panic!("Unhandled Guest Exception: #{:#x}", v), + } + Ok(()) + } + #[allow(clippy::single_match)] fn handle_cr(&mut self) -> AxResult { const VM_EXIT_INSTR_LEN_MV_TO_CR: u8 = 3; diff --git a/src/xstate.rs b/src/xstate.rs new file mode 100644 index 0000000..3536442 --- /dev/null +++ b/src/xstate.rs @@ -0,0 +1,32 @@ +use x86::controlregs::{Xcr0, xcr0 as xcr0_read, xcr0_write}; +use x86_64::registers::control::{Cr4, Cr4Flags}; + +use crate::msr::Msr; + +#[derive(Debug, Clone, Copy, Default)] +pub struct XState { + pub host_xcr0: u64, + pub guest_xcr0: u64, + pub host_xss: u64, + pub guest_xss: u64, +} + +impl XState { + /// Create a new [`XState`] instance with current host state + pub fn new() -> Self { + let xcr0 = unsafe { xcr0_read().bits() }; + let xss = Msr::IA32_XSS.read(); + + Self { + host_xcr0: xcr0, + guest_xcr0: xcr0, + host_xss: xss, + guest_xss: xss, + } + } + + /// Enables extended processor state management instructions, including XGETBV and XSAVE. + pub fn enable_xsave() { + unsafe { Cr4::write(Cr4::read() | Cr4Flags::OSXSAVE) }; + } +} From 1464dd942cba2ed0d215c4630d2d45ac0bb36938 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sat, 22 Mar 2025 12:18:32 +0800 Subject: [PATCH 03/20] [feat] support instruction decode --- Cargo.toml | 9 ++ src/ept.rs | 26 ------ src/frame.rs | 8 +- src/lib.rs | 2 +- src/page_table.rs | 205 ++++++++++++++++++++++++++++++++++++++++++++++ src/vmx/vcpu.rs | 155 +++++++++++++++++++++++++++++------ src/vmx/vmcs.rs | 2 +- src/xstate.rs | 43 +++++++--- 8 files changed, 381 insertions(+), 69 deletions(-) create mode 100644 src/page_table.rs diff --git a/Cargo.toml b/Cargo.toml index a13a65f..aa9c9d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,9 +12,18 @@ x86 = "0.52" x86_64 = "0.15" raw-cpuid = "11.0" numeric-enum-macro = "0.2" +iced-x86 = { version = "1.12.0", features = [ + "decoder", + "no_std", + "intel", + "op_code_info", + "encoder", + "masm" +], default-features = false } axerrno = "0.1.0" page_table_entry = "0.5" +page_table_multiarch = "0.5" memory_addr = "0.3.1" crate_interface = "0.1" diff --git a/src/ept.rs b/src/ept.rs index 74549b3..8b13789 100644 --- a/src/ept.rs +++ b/src/ept.rs @@ -1,27 +1 @@ -#[derive(Debug)] -/// The information of guest page walk. -pub struct GuestPageWalkInfo { - /// The guest page table physical address. - pub top_entry: usize, // Top level paging structure entry - /// Guest page table level. - pub level: usize, - /// Guest page table width - pub width: u32, - /// Guest page table user mode - pub is_user_mode_access: bool, - /// Guest page table write access - pub is_write_access: bool, - /// Guest page table instruction fetch - pub is_inst_fetch: bool, - /// CR4.PSE for 32bit paging, true for PAE/4-level paging - pub pse: bool, - /// CR0.WP - pub wp: bool, // CR0.WP - /// MSR_IA32_EFER_NXE_BIT - pub nxe: bool, - /// Guest page table Supervisor mode access prevention - pub is_smap_on: bool, - /// Guest page table Supervisor mode execution protection - pub is_smep_on: bool, -} diff --git a/src/frame.rs b/src/frame.rs index 8bb37c2..05525cf 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -1,5 +1,7 @@ use core::marker::PhantomData; +use page_table_multiarch::PagingHandler; + use axaddrspace::HostPhysAddr; use axerrno::{AxResult, ax_err_type}; @@ -17,7 +19,7 @@ pub struct PhysFrame { impl PhysFrame { pub fn alloc() -> AxResult { - let start_paddr = H::alloc_frame() + let start_paddr = H::PagingHandler::alloc_frame() .ok_or_else(|| ax_err_type!(NoMemory, "allocate physical frame failed"))?; assert_ne!(start_paddr.as_usize(), 0); Ok(Self { @@ -44,7 +46,7 @@ impl PhysFrame { } pub fn as_mut_ptr(&self) -> *mut u8 { - H::phys_to_virt(self.start_paddr()).as_mut_ptr() + H::PagingHandler::phys_to_virt(self.start_paddr()).as_mut_ptr() } pub fn fill(&mut self, byte: u8) { @@ -55,7 +57,7 @@ impl PhysFrame { impl Drop for PhysFrame { fn drop(&mut self) { if let Some(start_paddr) = self.start_paddr { - H::dealloc_frame(start_paddr); + H::PagingHandler::dealloc_frame(start_paddr); debug!("[AxVM] deallocated PhysFrame({:#x})", start_paddr); } } diff --git a/src/lib.rs b/src/lib.rs index e7e92dd..d7f9945 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,7 @@ pub(crate) mod msr; pub(crate) mod regs; mod ept; mod frame; +mod page_table; mod context; mod segmentation; @@ -32,6 +33,5 @@ cfg_if::cfg_if! { } pub use context::LinuxContext; -pub use ept::GuestPageWalkInfo; pub use regs::GeneralRegisters; pub use vender::has_hardware_support; diff --git a/src/page_table.rs b/src/page_table.rs new file mode 100644 index 0000000..55a9ac8 --- /dev/null +++ b/src/page_table.rs @@ -0,0 +1,205 @@ +//! Used to query and manipulate the page tables of a guest. +use core::marker::PhantomData; + +use memory_addr::MemoryAddr; +use page_table_entry::{GenericPTE, MappingFlags}; +use page_table_multiarch::{PageSize, PagingError, PagingHandler, PagingResult}; + +use axaddrspace::{EPTTranslator, GuestPhysAddr, GuestVirtAddr, HostPhysAddr}; +use axerrno::{AxError, AxResult}; + +const fn p5_index(vaddr: usize) -> usize { + (vaddr >> (12 + 36)) & (ENTRY_COUNT - 1) +} + +const fn p4_index(vaddr: usize) -> usize { + (vaddr >> (12 + 27)) & (ENTRY_COUNT - 1) +} + +const fn p3_index(vaddr: usize) -> usize { + (vaddr >> (12 + 18)) & (ENTRY_COUNT - 1) +} + +const fn p2_index(vaddr: usize) -> usize { + (vaddr >> (12 + 9)) & (ENTRY_COUNT - 1) +} + +const fn p1_index(vaddr: usize) -> usize { + (vaddr >> 12) & (ENTRY_COUNT - 1) +} + +#[derive(Debug)] +/// The information of guest page walk. +pub struct GuestPageWalkInfo { + /// Guest VM cr3 value. + pub cr3: usize, + /// Guest page table level. + pub level: usize, + /// Guest page table width + pub width: u32, + /// Guest page table user mode + pub is_user_mode_access: bool, + /// Guest page table write access + pub is_write_access: bool, + /// Guest page table instruction fetch + pub is_inst_fetch: bool, + /// CR4.PSE for 32bit paging, true for PAE/4-level paging + pub pse: bool, + /// CR0.WP + pub wp: bool, // CR0.WP + /// MSR_IA32_EFER_NXE_BIT + pub nxe: bool, + + /// Guest page table Supervisor mode access prevention + pub is_smap_on: bool, + /// Guest page table Supervisor mode execution protection + pub is_smep_on: bool, +} + +// /// Metadata of guest page tables. +// pub struct GuestPageTableMetadata; + +// impl PagingMetaData for GuestPageTableMetadata { +// const LEVELS: usize = 4; +// const PA_MAX_BITS: usize = 52; +// const VA_MAX_BITS: usize = 48; + +// type VirtAddr = GuestVirtAddr; +// type PhysAddr = GuestPhysAddr; + +// fn to_actual_paddr(paddr: Self::PhysAddr) -> HostPhysAddr { +// EPT::guest_phys_to_host_phys(paddr).unwrap() +// } + +// fn flush_tlb(_vaddr: Option) { +// warn!("flush_tlb is not implemented for guest page tables"); +// } +// } + +const ENTRY_COUNT: usize = 512; + +// pub type GuestPageTable = PageTable64, X64PTE, H>; + +/// A generic page table struct for 64-bit platform. +/// +/// It also tracks all intermediate level tables. They will be deallocated +/// When the [`GuestPageTable64`] itself is dropped. +pub struct GuestPageTable64 { + root_paddr: GuestPhysAddr, + levels: usize, + _phantom: PhantomData<(PTE, H, EPT)>, +} + +impl GuestPageTable64 { + /// Create a new page table. + pub fn construct(guest_ptw_info: &GuestPageWalkInfo) -> Self { + debug!( + "GuestPageTable64::construct CR3: {:?} level {}", + guest_ptw_info.cr3, guest_ptw_info.level + ); + const PHYS_ADDR_MASK: usize = 0x000f_ffff_ffff_f000; // bits 12..52 + + Self { + root_paddr: GuestPhysAddr::from(guest_ptw_info.cr3 & &PHYS_ADDR_MASK), + levels: guest_ptw_info.level, + _phantom: PhantomData, + } + } + + /// Get the root page table physical address. + pub fn root_paddr(&self) -> GuestPhysAddr { + self.root_paddr + } + + /// Queries the result of the mapping starts with `vaddr`. + /// + /// Returns the physical address of the target frame, mapping flags, and + /// the page size. + /// + /// Returns [`Err(PagingError::NotMapped)`](PagingError::NotMapped) if the + /// mapping is not present. + pub fn query( + &self, + vaddr: GuestVirtAddr, + ) -> PagingResult<(GuestPhysAddr, MappingFlags, PageSize)> { + let (entry, size) = self.get_entry(vaddr)?; + if entry.is_unused() { + return Err(PagingError::NotMapped); + } + let off = size.align_offset(vaddr.into()); + Ok((entry.paddr().add(off).into(), entry.flags(), size)) + } +} + +// private implements +impl GuestPageTable64 { + fn table_of<'a>(&self, gpa: GuestPhysAddr) -> &'a [PTE] { + let hpa = EPT::guest_phys_to_host_phys(gpa).unwrap(); + let ptr = H::phys_to_virt(hpa).as_ptr() as _; + + debug!( + "GuestPageTable64::table_of gpa: {:?} hpa: {:?} ptr: {:p}", + gpa, hpa, ptr + ); + + unsafe { core::slice::from_raw_parts(ptr, ENTRY_COUNT) } + } + + fn table_of_mut<'a>(&mut self, gpa: GuestPhysAddr) -> &'a mut [PTE] { + let hpa = EPT::guest_phys_to_host_phys(gpa).unwrap(); + let ptr = H::phys_to_virt(hpa).as_ptr() as _; + unsafe { core::slice::from_raw_parts_mut(ptr, ENTRY_COUNT) } + } + + fn next_table<'a>(&self, entry: &PTE) -> PagingResult<&'a [PTE]> { + if !entry.is_present() { + Err(PagingError::NotMapped) + } else if entry.is_huge() { + Err(PagingError::MappedToHugePage) + } else { + Ok(self.table_of(entry.paddr().into())) + } + } + + fn get_entry(&self, gva: GuestVirtAddr) -> PagingResult<(&PTE, PageSize)> { + let vaddr: usize = gva.into(); + + let p3 = if self.levels == 3 { + self.table_of(self.root_paddr()) + } else if self.levels == 4 { + let p4 = self.table_of(self.root_paddr()); + let p4e = &p4[p4_index(vaddr)]; + self.next_table(p4e)? + } else { + // 5-level paging + let p5 = self.table_of(self.root_paddr()); + let p5e = &p5[p5_index(vaddr)]; + if p5e.is_huge() { + return Err(PagingError::MappedToHugePage); + } + let p4 = self.next_table(p5e)?; + let p4e = &p4[p4_index(vaddr)]; + + if p4e.is_huge() { + return Err(PagingError::MappedToHugePage); + } + + self.next_table(p4e)? + }; + + let p3e = &p3[p3_index(vaddr)]; + if p3e.is_huge() { + return Ok((p3e, PageSize::Size1G)); + } + + let p2 = self.next_table(p3e)?; + let p2e = &p2[p2_index(vaddr)]; + if p2e.is_huge() { + return Ok((p2e, PageSize::Size2M)); + } + + let p1 = self.next_table(p2e)?; + let p1e = &p1[p1_index(vaddr)]; + Ok((p1e, PageSize::Size4K)) + } +} diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 8bf06a6..f7c8bb1 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -1,7 +1,9 @@ use alloc::collections::VecDeque; -use bit_field::BitField; +use alloc::vec::Vec; use core::fmt::{Debug, Formatter, Result}; use core::{arch::naked_asm, mem::size_of}; + +use bit_field::BitField; use raw_cpuid::CpuId; use x86::bits64::vmx; use x86::controlregs::{Xcr0, xcr0 as xcr0_read, xcr0_write}; @@ -9,7 +11,11 @@ use x86::dtables::{self, DescriptorTablePointer}; use x86::segmentation::SegmentSelector; use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr4, Cr4Flags, EferFlags}; -use axaddrspace::{GuestPhysAddr, GuestVirtAddr, HostPhysAddr, NestedPageFaultInfo}; +use page_table_entry::x86_64::X64PTE; +use page_table_multiarch::{PageSize, PagingHandler, PagingResult}; + +use axaddrspace::EPTTranslator; +use axaddrspace::{GuestPhysAddr, GuestVirtAddr, HostPhysAddr, MappingFlags, NestedPageFaultInfo}; use axerrno::{AxResult, ax_err, ax_err_type}; use axvcpu::{AccessWidth, AxArchVCpu, AxVCpuExitReason, AxVCpuHal}; @@ -23,9 +29,11 @@ use super::vmcs::{ VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, interrupt_exit_info, }; use crate::LinuxContext; +use crate::page_table::GuestPageTable64; +use crate::page_table::GuestPageWalkInfo; use crate::segmentation::Segment; use crate::xstate::XState; -use crate::{ept::GuestPageWalkInfo, msr::Msr, regs::GeneralRegisters}; +use crate::{msr::Msr, regs::GeneralRegisters}; const VMX_PREEMPTION_TIMER_SET_VALUE: u32 = 1_000_000; @@ -156,7 +164,11 @@ pub struct VmxVcpu { io_bitmap: IOBitmap, msr_bitmap: MsrBitmap, pending_events: VecDeque<(u8, Option)>, - xstate: XState, + // xstate: XState, + /// XState used by the guest OS, loaded before running the guest. + guest_xstate: XState, + /// XState used by the hypervisor itself, stored before running the guest. + cur_xstate: XState, entry: Option, ept_root: Option, host_ctx: Option, @@ -177,11 +189,12 @@ impl VmxVcpu { io_bitmap: IOBitmap::passthrough_all()?, msr_bitmap: MsrBitmap::passthrough_all()?, pending_events: VecDeque::with_capacity(8), - xstate: if let Some(ctx) = ctx { + guest_xstate: if let Some(ctx) = ctx { ctx.xstate } else { XState::new() }, + cur_xstate: XState::new(), entry: None, ept_root: None, host_ctx: ctx, @@ -190,7 +203,7 @@ impl VmxVcpu { "[HV] created {} VmxVcpu(vmcs: {:#x}) xstate {:#x?}", if ctx.is_some() { "Host" } else { "Guest" }, vcpu.vmcs.phys_addr(), - vcpu.xstate + vcpu.guest_xstate ); Ok(vcpu) } @@ -340,24 +353,9 @@ impl VmxVcpu { VmcsGuestNW::RSP.write(rsp).unwrap() } - /// Translate guest virtual addr to linear addr - pub fn gla2gva(&self, guest_rip: GuestVirtAddr) -> GuestVirtAddr { - let cpu_mode = self.get_cpu_mode(); - let seg_base = if cpu_mode == VmCpuMode::Mode64 { - 0 - } else { - VmcsGuestNW::CS_BASE.read().unwrap() - }; - // debug!( - // "seg_base: {:#x}, guest_rip: {:#x} cpu mode:{:?}", - // seg_base, guest_rip, cpu_mode - // ); - guest_rip + seg_base - } - /// Get Translate guest page table info - pub fn get_ptw_info(&self) -> GuestPageWalkInfo { - let top_entry = VmcsGuestNW::CR3.read().unwrap(); + pub fn get_pagetable_walk_info(&self) -> GuestPageWalkInfo { + let cr3 = VmcsGuestNW::CR3.read().unwrap(); let level = self.get_paging_level(); let is_write_access = false; let is_inst_fetch = false; @@ -384,7 +382,7 @@ impl VmxVcpu { width = 0; } GuestPageWalkInfo { - top_entry, + cr3, level, width, is_user_mode_access, @@ -446,6 +444,67 @@ impl VmxVcpu { self.msr_bitmap.set_read_intercept(msr, intercept); self.msr_bitmap.set_write_intercept(msr, intercept); } + + pub fn read_guest_memory(&self, gva: GuestVirtAddr, len: usize) -> AxResult> { + debug!("read_guest_memory @{:?} len: {}", gva, len); + + let mut content = Vec::with_capacity(len as usize); + + let mut remained_size = len; + let mut addr = gva; + + while remained_size > 0 { + let (gpa, _flags, page_size) = self.guest_page_table_query(gva).map_err(|e| { + warn!("Failed to query guest page table: {:?}", e); + ax_err_type!(BadAddress) + })?; + let pgoff = page_size.align_offset(addr.into()); + let read_size = (page_size as usize - pgoff).min(remained_size); + addr += read_size; + remained_size -= read_size; + + if let Some(hpa) = H::EPTTranslator::guest_phys_to_host_phys(gpa) { + let hva_ptr = H::PagingHandler::phys_to_virt(hpa).as_ptr(); + for i in 0..read_size { + content.push(unsafe { hva_ptr.add(pgoff + i).read() }); + } + } else { + return ax_err!(BadAddress); + } + } + debug!("read_guest_memory @{:?} content: {:x?}", gva, content); + Ok(content) + } + + pub fn decode_instruction(&self, rip: GuestVirtAddr, instr_len: usize) -> AxResult { + use alloc::string::String; + use iced_x86::{Decoder, DecoderOptions, Formatter, IntelFormatter, MasmFormatter}; + + let bytes = self.read_guest_memory(rip, instr_len)?; + let mut decoder = Decoder::with_ip( + 64, + bytes.as_slice(), + rip.as_usize() as _, + DecoderOptions::NONE, + ); + let instr = decoder.decode(); + + debug!("Decoded instruction: {:#x?}", instr); + + let mut output = String::new(); + let mut formattor = IntelFormatter::new(); + formattor.format(&instr, &mut output); + + debug!("Decoded instruction Intel formatter: {}", output); + + let mut output = String::new(); + let mut formattor = MasmFormatter::new(); + formattor.format(&instr, &mut output); + + debug!("Decoded instruction MasmFormatter: {}", output); + + Ok(()) + } } // Implementation of private methods @@ -580,7 +639,10 @@ impl VmxVcpu { warn!("Linux context: {:#x?}", linux); - warn!("self xstate {:#x?}", self.xstate); + warn!( + "self xstate cur {:#x?} guest {:#x?}", + self.cur_xstate, self.guest_xstate + ); warn!("current xstate {:#x?}", XState::new()); @@ -840,6 +902,32 @@ impl VmxVcpu { } level as usize } + + /// Translate guest virtual addr to linear addr + fn gva_to_linear_addr(&self, vaddr: GuestVirtAddr) -> GuestVirtAddr { + let cpu_mode = self.get_cpu_mode(); + let seg_base = if cpu_mode == VmCpuMode::Mode64 { + 0 + } else { + VmcsGuestNW::CS_BASE.read().unwrap() + }; + vaddr + seg_base + } + + fn guest_page_table_query( + &self, + gva: GuestVirtAddr, + ) -> PagingResult<(GuestPhysAddr, MappingFlags, PageSize)> { + let addr = self.gva_to_linear_addr(gva); + + debug!("guest_page_table_query: gva {:?} linear {:?}", gva, addr); + + let guest_ptw_info = self.get_pagetable_walk_info(); + let guest_page_table: GuestPageTable64 = + GuestPageTable64::construct(&guest_ptw_info); + + guest_page_table.query(addr) + } } // Implementaton for type1.5 hypervisor @@ -1025,6 +1113,11 @@ impl VmxVcpu { exit_info.guest_rip, exit_info.exit_instruction_length, intr_info ); + self.decode_instruction( + GuestVirtAddr::from_usize(exit_info.guest_rip), + exit_info.exit_instruction_length as _, + )?; + const NON_MASKABLE_INTERRUPT: u8 = 2; match intr_info.vector { @@ -1113,6 +1206,8 @@ impl VmxVcpu { res } LEAF_PROCESSOR_EXTENDED_STATE_ENUMERATION => { + warn!("handle_cpuid: LEAF_PROCESSOR_EXTENDED_STATE_ENUMERATION"); + self.load_guest_xstate(); let res = cpuid!(regs_clone.rax, regs_clone.rcx); self.load_host_xstate(); @@ -1201,7 +1296,7 @@ impl VmxVcpu { }) .ok_or(ax_err_type!(InvalidInput)) .and_then(|x| { - self.xstate.guest_xcr0 = x.bits(); + self.guest_xstate.xcr0 = x; self.advance_rip(VM_EXIT_INSTR_LEN_XSETBV) }) } else { @@ -1210,10 +1305,18 @@ impl VmxVcpu { } } + /// Save the current host state to the vcpu, + /// restore the guest state from the vcpu into registers. + /// + /// This function is generally called before VM-entry. fn load_guest_xstate(&mut self) { self.xstate.switch_to_guest(); } + /// Save the current guest state to the vcpu, + /// restore the host state from the vcpu into registers. + /// + /// This function is generally called after VM-exit. fn load_host_xstate(&mut self) { self.xstate.switch_to_host(); } diff --git a/src/vmx/vmcs.rs b/src/vmx/vmcs.rs index 2b8ba99..aa3616e 100644 --- a/src/vmx/vmcs.rs +++ b/src/vmx/vmcs.rs @@ -660,7 +660,7 @@ pub fn raw_interrupt_exit_info() -> AxResult { } pub fn interrupt_exit_info() -> AxResult { - // SDM Vol. 3C, Section 24.9.2 + // SDM Vol. 3C, Section 25.9.2 let info = VmcsReadOnly32::VMEXIT_INTERRUPTION_INFO.read()?; Ok(VmxInterruptInfo { vector: info.get_bits(0..8) as u8, diff --git a/src/xstate.rs b/src/xstate.rs index 3536442..a3e3ef4 100644 --- a/src/xstate.rs +++ b/src/xstate.rs @@ -3,25 +3,44 @@ use x86_64::registers::control::{Cr4, Cr4Flags}; use crate::msr::Msr; -#[derive(Debug, Clone, Copy, Default)] +#[derive(Debug, Clone, Copy)] pub struct XState { - pub host_xcr0: u64, - pub guest_xcr0: u64, - pub host_xss: u64, - pub guest_xss: u64, + pub xcr0: Xcr0, + pub xss: u64, +} + +impl Default for XState { + fn default() -> Self { + Self { + xcr0: Xcr0::empty(), + xss: 0, + } + } } impl XState { /// Create a new [`XState`] instance with current host state pub fn new() -> Self { - let xcr0 = unsafe { xcr0_read().bits() }; - let xss = Msr::IA32_XSS.read(); - Self { - host_xcr0: xcr0, - guest_xcr0: xcr0, - host_xss: xss, - guest_xss: xss, + xcr0: unsafe { xcr0_read() }, + xss: Msr::IA32_XSS.read(), + } + } + + pub fn save(&mut self) { + self.xcr0 = unsafe { xcr0_read() }; + self.xss = Msr::IA32_XSS.read(); + warn!("XState::save: xcr0: {:?}, xss: {:#x}", self.xcr0, self.xss); + } + + pub fn restore(&self) { + warn!( + "XState::restore: xcr0: {:?}, xss: {:#x}", + self.xcr0, self.xss + ); + unsafe { + xcr0_write(self.xcr0); + Msr::IA32_XSS.write(self.xss); } } From 189852edb2a0050e4a6888d75de6f518febb9623 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Sat, 22 Mar 2025 18:14:09 +0800 Subject: [PATCH 04/20] [bug] remove xstate saving to bypass xstate related unsolved problems --- src/page_table.rs | 9 +-------- src/vmx/percpu.rs | 19 ++++++++++++++++++- src/vmx/vcpu.rs | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/page_table.rs b/src/page_table.rs index 55a9ac8..6f592fa 100644 --- a/src/page_table.rs +++ b/src/page_table.rs @@ -5,8 +5,7 @@ use memory_addr::MemoryAddr; use page_table_entry::{GenericPTE, MappingFlags}; use page_table_multiarch::{PageSize, PagingError, PagingHandler, PagingResult}; -use axaddrspace::{EPTTranslator, GuestPhysAddr, GuestVirtAddr, HostPhysAddr}; -use axerrno::{AxError, AxResult}; +use axaddrspace::{EPTTranslator, GuestPhysAddr, GuestVirtAddr}; const fn p5_index(vaddr: usize) -> usize { (vaddr >> (12 + 36)) & (ENTRY_COUNT - 1) @@ -145,12 +144,6 @@ impl GuestPageTable64(&mut self, gpa: GuestPhysAddr) -> &'a mut [PTE] { - let hpa = EPT::guest_phys_to_host_phys(gpa).unwrap(); - let ptr = H::phys_to_virt(hpa).as_ptr() as _; - unsafe { core::slice::from_raw_parts_mut(ptr, ENTRY_COUNT) } - } - fn next_table<'a>(&self, entry: &PTE) -> PagingResult<&'a [PTE]> { if !entry.is_present() { Err(PagingError::NotMapped) diff --git a/src/vmx/percpu.rs b/src/vmx/percpu.rs index 04b6334..065eb03 100644 --- a/src/vmx/percpu.rs +++ b/src/vmx/percpu.rs @@ -101,9 +101,26 @@ impl AxArchPerCpu for VmxPerCpuState { self.vmcs_revision_id = vmx_basic.revision_id; self.vmx_region = VmxRegion::new(self.vmcs_revision_id, false)?; + use x86_64::registers::control::{Cr0Flags, Cr4Flags}; + const HOST_CR0: Cr0Flags = Cr0Flags::from_bits_truncate( + Cr0Flags::PAGING.bits() + | Cr0Flags::WRITE_PROTECT.bits() + | Cr0Flags::NUMERIC_ERROR.bits() + | Cr0Flags::TASK_SWITCHED.bits() + | Cr0Flags::MONITOR_COPROCESSOR.bits() + | Cr0Flags::PROTECTED_MODE_ENABLE.bits(), + ); + const HOST_CR4: Cr4Flags = Cr4Flags::from_bits_truncate( + Cr4Flags::PHYSICAL_ADDRESS_EXTENSION.bits() + | Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS.bits() + | Cr4Flags::OSXSAVE.bits(), + ); + unsafe { // Enable VMX using the VMXE bit. - Cr4::write(Cr4::read() | Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS); + // Cr4::write(Cr4::read() | Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS); + Cr0::write(HOST_CR0); + Cr4::write(HOST_CR4); // Execute VMXON. vmx::vmxon(self.vmx_region.phys_addr().as_usize() as _).map_err(|err| { ax_err_type!( diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index f7c8bb1..aafe554 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -6,7 +6,7 @@ use core::{arch::naked_asm, mem::size_of}; use bit_field::BitField; use raw_cpuid::CpuId; use x86::bits64::vmx; -use x86::controlregs::{Xcr0, xcr0 as xcr0_read, xcr0_write}; +use x86::controlregs::Xcr0; use x86::dtables::{self, DescriptorTablePointer}; use x86::segmentation::SegmentSelector; use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr4, Cr4Flags, EferFlags}; From bdd32877bb0cc61b1d31d4a8d5e93b310b670366 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Wed, 26 Mar 2025 15:17:08 +0800 Subject: [PATCH 05/20] [fix] delete redundant warn msg --- src/vmx/vcpu.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index aafe554..44a946c 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -289,7 +289,7 @@ impl VmxVcpu { // Handle vm-exits let exit_info = self.exit_info().unwrap(); - debug!("VM exit: {:#x?}", exit_info); + trace!("VM exit: {:#x?}", exit_info); match self.builtin_vmexit_handler(&exit_info) { Some(result) => { @@ -1206,12 +1206,9 @@ impl VmxVcpu { res } LEAF_PROCESSOR_EXTENDED_STATE_ENUMERATION => { - warn!("handle_cpuid: LEAF_PROCESSOR_EXTENDED_STATE_ENUMERATION"); - self.load_guest_xstate(); let res = cpuid!(regs_clone.rax, regs_clone.rcx); self.load_host_xstate(); - res } LEAF_HYPERVISOR_INFO => CpuIdResult { From 6304f18e52b1fcbf93d1bcc2d9b6a25fd3017f8c Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 27 Mar 2025 10:01:37 +0800 Subject: [PATCH 06/20] [fix] delete redundant outputs --- src/vmx/vcpu.rs | 16 +--------------- src/vmx/vmcs.rs | 2 +- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 44a946c..48126c6 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -200,10 +200,9 @@ impl VmxVcpu { host_ctx: ctx, }; debug!( - "[HV] created {} VmxVcpu(vmcs: {:#x}) xstate {:#x?}", + "[HV] created {} VmxVcpu(vmcs: {:#x})", if ctx.is_some() { "Host" } else { "Guest" }, vcpu.vmcs.phys_addr(), - vcpu.guest_xstate ); Ok(vcpu) } @@ -637,15 +636,6 @@ impl VmxVcpu { fn setup_vmcs_guest_from_ctx(&mut self) -> AxResult { let linux = self.host_ctx.expect("Host context is not set"); - warn!("Linux context: {:#x?}", linux); - - warn!( - "self xstate cur {:#x?} guest {:#x?}", - self.cur_xstate, self.guest_xstate - ); - - warn!("current xstate {:#x?}", XState::new()); - self.set_cr(0, linux.cr0.bits()); self.set_cr(4, linux.cr4.bits()); self.set_cr(3, linux.cr3); @@ -676,10 +666,6 @@ impl VmxVcpu { VmcsGuestNW::IDTR_BASE.write(linux.idt.base.as_u64() as _)?; VmcsGuest32::IDTR_LIMIT.write(linux.idt.limit as _)?; - debug!( - "this is the linux rip: {:#x} rsp:{:#x}", - linux.rip, linux.rsp - ); VmcsGuestNW::RSP.write(linux.rsp as _)?; VmcsGuestNW::RIP.write(linux.rip as _)?; VmcsGuestNW::RFLAGS.write(0x2)?; diff --git a/src/vmx/vmcs.rs b/src/vmx/vmcs.rs index aa3616e..4b31463 100644 --- a/src/vmx/vmcs.rs +++ b/src/vmx/vmcs.rs @@ -597,7 +597,7 @@ pub fn set_control( let allowed0 = cap as u32; let allowed1 = (cap >> 32) as u32; assert_eq!(allowed0 & allowed1, allowed0); - debug!( + trace!( "set {:?}: {:#x} (+{:#x}, -{:#x})", control, old_value, set, clear ); From c7c40ac4be9dcebcf3c8659e221d7756aa0eb970 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 28 Mar 2025 22:00:29 +0800 Subject: [PATCH 07/20] [feat] introduce AxVcpuAccessGuestState --- src/page_table.rs | 8 +++--- src/vmx/vcpu.rs | 66 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/src/page_table.rs b/src/page_table.rs index 6f592fa..6a4873c 100644 --- a/src/page_table.rs +++ b/src/page_table.rs @@ -136,10 +136,10 @@ impl GuestPageTable64 VmxVcpu { vaddr + seg_base } - fn guest_page_table_query( + pub fn guest_page_table_query( &self, gva: GuestVirtAddr, ) -> PagingResult<(GuestPhysAddr, MappingFlags, PageSize)> { let addr = self.gva_to_linear_addr(gva); - debug!("guest_page_table_query: gva {:?} linear {:?}", gva, addr); + // debug!("guest_page_table_query: gva {:?} linear {:?}", gva, addr); let guest_ptw_info = self.get_pagetable_walk_info(); let guest_page_table: GuestPageTable64 = @@ -1467,3 +1467,63 @@ impl AxArchVCpu for VmxVcpu { self.regs_mut().set_reg_of_index(reg as u8, val as u64); } } + +impl AxVcpuAccessGuestState for VmxVcpu { + fn read_gpr(&self, reg: usize) -> usize { + self.regs().get_reg_of_index(reg as u8) as usize + } + + fn write_gpr(&mut self, reg: usize, val: usize) { + self.regs_mut().set_reg_of_index(reg as u8, val as u64); + } + + fn instr_pointer(&self) -> usize { + VmcsGuestNW::RIP.read().expect("Failed to read RIP") as usize + } + + fn set_instr_pointer(&mut self, val: usize) { + VmcsGuestNW::RIP.write(val as _).expect("Failed to set RIP"); + } + + fn stack_pointer(&self) -> usize { + self.stack_pointer() + } + + fn set_stack_pointer(&mut self, val: usize) { + self.set_stack_pointer(val); + } + + fn frame_pointer(&self) -> usize { + self.regs().rbp as usize + } + + fn set_frame_pointer(&mut self, val: usize) { + self.regs_mut().rbp = val as u64; + } + + fn return_value(&self) -> usize { + self.regs().rax as usize + } + + fn set_return_value(&mut self, val: usize) { + self.regs_mut().rax = val as u64; + } + + fn guest_is_privileged(&self) -> bool { + use crate::segmentation::SegmentAccessRights; + SegmentAccessRights::from_bits_truncate( + VmcsGuest32::CS_ACCESS_RIGHTS + .read() + .expect("Failed to read CS_ACCESS_RIGHTS"), + ) + .dpl() + == 0 + } + + fn guest_page_table_query( + &self, + gva: GuestVirtAddr, + ) -> Option<(GuestPhysAddr, MappingFlags, PageSize)> { + self.guest_page_table_query(gva).ok() + } +} From 43856d1cf82f5d0a956ab81782ea8fe55b2eedc8 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 3 Apr 2025 00:07:29 +0800 Subject: [PATCH 08/20] [feat] support EPTP list and load guest state --- src/context.rs | 9 +++++ src/vmx/structs.rs | 52 ++++++++++++++++++++++++ src/vmx/vcpu.rs | 98 ++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 152 insertions(+), 7 deletions(-) diff --git a/src/context.rs b/src/context.rs index 78243f9..46a00cb 100644 --- a/src/context.rs +++ b/src/context.rs @@ -208,6 +208,15 @@ impl LinuxContext { } } + pub fn load_guest_regs(&mut self, regs: &GeneralRegisters) { + self.r15 = regs.r15; + self.r14 = regs.r14; + self.r13 = regs.r13; + self.r12 = regs.r12; + self.rbx = regs.rbx; + self.rbp = regs.rbp; + } + /// Restore linux general-purpose registers and stack, then return back to linux. pub fn return_to_linux(&self, guest_regs: &GeneralRegisters) -> ! { unsafe { diff --git a/src/vmx/structs.rs b/src/vmx/structs.rs index fa3f698..72e642c 100644 --- a/src/vmx/structs.rs +++ b/src/vmx/structs.rs @@ -256,6 +256,8 @@ bitflags! { const WALK_LENGTH_3 = 2 << 3; /// EPT page-walk length 4. const WALK_LENGTH_4 = 3 << 3; + /// EPT page-walk length 5 + const WALK_LENGTH_5 = 4 << 3; /// Setting this control to 1 enables accessed and dirty flags for EPT. const ENABLE_ACCESSED_DIRTY = 1 << 6; } @@ -268,3 +270,53 @@ impl EPTPointer { flags | Self::MEM_TYPE_WB | Self::WALK_LENGTH_4 | Self::ENABLE_ACCESSED_DIRTY } } + +pub const EPTP_LIST_SIZE: usize = 512; + +/// EPTP list, the 4-KByte structure, +/// The EPTP list comprises 512 8-Byte entries (each an EPTP value) +/// and is used by the EPTP-switching VM function (see Section 26.5.6.3). +pub(super) struct EptpList { + frame: PhysFrame, +} + +impl EptpList { + pub fn new() -> AxResult { + Ok(Self { + frame: PhysFrame::alloc_zero()?, + }) + } + + pub fn phys_addr(&self) -> HostPhysAddr { + self.frame.start_paddr() + } + + pub fn set_entry(&mut self, idx: usize, eptp: EPTPointer) { + assert!(idx < EPTP_LIST_SIZE); + // Todo: validate eptp refer to 26.5.6.3 EPTP Switching. + let ptr = self.frame.as_mut_ptr() as *mut u64; + unsafe { + ptr.add(idx).write(eptp.bits()); + } + } + + pub fn entry_is_set(&self, idx: usize) -> bool { + assert!(idx < EPTP_LIST_SIZE); + let ptr = self.frame.as_mut_ptr() as *const u64; + unsafe { ptr.add(idx).read() != 0 } + } + + pub fn get_entry(&self, idx: usize) -> EPTPointer { + assert!(idx < EPTP_LIST_SIZE); + let ptr = self.frame.as_mut_ptr() as *const u64; + unsafe { EPTPointer::from_bits_truncate(ptr.add(idx).read()) } + } + + pub fn remove_entry(&mut self, idx: usize) { + assert!(idx < EPTP_LIST_SIZE); + let ptr = self.frame.as_mut_ptr() as *mut u64; + unsafe { + ptr.add(idx).write(0); + } + } +} diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index eb7ee69..994cbf3 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -2,6 +2,7 @@ use alloc::collections::VecDeque; use alloc::vec::Vec; use core::fmt::{Debug, Formatter, Result}; use core::{arch::naked_asm, mem::size_of}; +use x86_64::VirtAddr; use bit_field::BitField; use raw_cpuid::CpuId; @@ -23,7 +24,7 @@ use super::VmxExitInfo; use super::as_axerr; use super::definitions::VmxExitReason; use super::read_vmcs_revision_id; -use super::structs::{IOBitmap, MsrBitmap, VmxRegion}; +use super::structs::{EPTP_LIST_SIZE, EPTPointer, EptpList, IOBitmap, MsrBitmap, VmxRegion}; use super::vmcs::{ self, VmcsControl32, VmcsControl64, VmcsControlNW, VmcsGuest16, VmcsGuest32, VmcsGuest64, VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, interrupt_exit_info, @@ -163,6 +164,8 @@ pub struct VmxVcpu { vmcs: VmxRegion, io_bitmap: IOBitmap, msr_bitmap: MsrBitmap, + eptp_list: EptpList, + pending_events: VecDeque<(u8, Option)>, // xstate: XState, /// XState used by the guest OS, loaded before running the guest. @@ -188,6 +191,7 @@ impl VmxVcpu { vmcs: VmxRegion::new(read_vmcs_revision_id(), false)?, io_bitmap: IOBitmap::passthrough_all()?, msr_bitmap: MsrBitmap::passthrough_all()?, + eptp_list: EptpList::new()?, pending_events: VecDeque::with_capacity(8), guest_xstate: if let Some(ctx) = ctx { ctx.xstate @@ -207,12 +211,6 @@ impl VmxVcpu { Ok(vcpu) } - /// Set the new [`VmxVcpu`] context from guest OS. - pub fn setup(&mut self, ept_root: HostPhysAddr, entry: GuestPhysAddr) -> AxResult { - self.setup_vmcs(entry, ept_root)?; - Ok(()) - } - // /// Get the identifier of this [`VmxVcpu`]. // pub fn vcpu_id(&self) -> usize { // get_current_vcpu::().unwrap().id() @@ -856,6 +854,13 @@ impl VmxVcpu { // Bit 0: EPTP switching VmcsControl64::VM_FUNCTION_CONTROLS.write(0b1)?; + assert!( + self.eptp_list.entry_is_set(0), + "The First EPTP list entry must be set as initial EPTP." + ); + + VmcsControl64::EPTP_LIST_ADDR.write(self.eptp_list.phys_addr().as_usize() as _)?; + // Pass-through exceptions (except #UD(6)), don't use I/O bitmap, set MSR bitmaps. let exception_bitmap: u32 = 1 << 6; @@ -868,6 +873,37 @@ impl VmxVcpu { Ok(()) } + fn load_vmcs_guest(&self, linux: &mut LinuxContext) { + linux.rip = VmcsGuestNW::RIP.read().unwrap() as _; + linux.rsp = VmcsGuestNW::RSP.read().unwrap() as _; + linux.cr0 = Cr0Flags::from_bits_truncate(VmcsGuestNW::CR0.read().unwrap() as _); + linux.cr3 = VmcsGuestNW::CR3.read().unwrap() as _; + linux.cr4 = Cr4Flags::from_bits_truncate(VmcsGuestNW::CR4.read().unwrap() as _); + + linux.es.selector = SegmentSelector::from_raw(VmcsGuest16::ES_SELECTOR.read().unwrap()); + linux.cs.selector = SegmentSelector::from_raw(VmcsGuest16::CS_SELECTOR.read().unwrap()); + linux.ss.selector = SegmentSelector::from_raw(VmcsGuest16::SS_SELECTOR.read().unwrap()); + linux.ds.selector = SegmentSelector::from_raw(VmcsGuest16::DS_SELECTOR.read().unwrap()); + linux.fs.selector = SegmentSelector::from_raw(VmcsGuest16::FS_SELECTOR.read().unwrap()); + linux.fs.base = VmcsGuestNW::FS_BASE.read().unwrap() as _; + linux.gs.selector = SegmentSelector::from_raw(VmcsGuest16::GS_SELECTOR.read().unwrap()); + linux.gs.base = VmcsGuestNW::GS_BASE.read().unwrap() as _; + linux.tss.selector = SegmentSelector::from_raw(VmcsGuest16::TR_SELECTOR.read().unwrap()); + + linux.gdt.base = VirtAddr::new(VmcsGuestNW::GDTR_BASE.read().unwrap() as _); + linux.gdt.limit = VmcsGuest32::GDTR_LIMIT.read().unwrap() as _; + linux.idt.base = VirtAddr::new(VmcsGuestNW::IDTR_BASE.read().unwrap() as _); + linux.idt.limit = VmcsGuest32::IDTR_LIMIT.read().unwrap() as _; + + linux.load_guest_regs(self.regs()); + + // unsafe { + // Msr::IA32_SYSENTER_CS.write(VmcsGuest32::IA32_SYSENTER_CS.read().unwrap() as _); + // Msr::IA32_SYSENTER_ESP.write(VmcsGuestNW::IA32_SYSENTER_ESP.read().unwrap() as _); + // Msr::IA32_SYSENTER_EIP.write(VmcsGuestNW::IA32_SYSENTER_EIP.read().unwrap() as _); + // } + } + fn get_paging_level(&self) -> usize { let mut level: u32 = 0; // non-paging let cr0 = VmcsGuestNW::CR0.read().unwrap(); @@ -1364,6 +1400,12 @@ impl AxArchVCpu for VmxVcpu { Self::new(Some(config)) } + fn load_host(&self) -> AxResult { + let mut linux = LinuxContext::default(); + self.load_vmcs_guest(&mut linux); + Ok(linux) + } + fn set_entry(&mut self, entry: GuestPhysAddr) -> AxResult { self.entry = Some(entry); Ok(()) @@ -1371,6 +1413,7 @@ impl AxArchVCpu for VmxVcpu { fn set_ept_root(&mut self, ept_root: HostPhysAddr) -> AxResult { self.ept_root = Some(ept_root); + self.append_eptp_list(0, ept_root)?; Ok(()) } @@ -1526,4 +1569,45 @@ impl AxVcpuAccessGuestState for VmxVcpu { ) -> Option<(GuestPhysAddr, MappingFlags, PageSize)> { self.guest_page_table_query(gva).ok() } + + fn append_eptp_list(&mut self, idx: usize, eptp: HostPhysAddr) -> AxResult { + if idx >= EPTP_LIST_SIZE { + return ax_err!(InvalidInput); + } + + if self.eptp_list.entry_is_set(idx) { + return ax_err!(InvalidInput); + } + + self.eptp_list + .set_entry(idx, EPTPointer::from_table_phys(eptp)); + Ok(()) + } + + fn remove_eptp_list_entry(&mut self, idx: usize) -> AxResult { + if idx >= EPTP_LIST_SIZE { + return ax_err!(InvalidInput); + } + if !self.eptp_list.entry_is_set(idx) { + return ax_err!(InvalidInput); + } + + self.eptp_list.remove_entry(idx); + + Ok(()) + } + + fn get_eptp_list_entry(&self, idx: usize) -> AxResult { + if idx >= EPTP_LIST_SIZE { + return ax_err!(InvalidInput); + } + if !self.eptp_list.entry_is_set(idx) { + return ax_err!(InvalidInput); + } + let entry = self.eptp_list.get_entry(idx); + + Ok(HostPhysAddr::from_usize(memory_addr::align_down_4k( + entry.bits() as _, + ))) + } } From 3f586849e2334eaff1ec0d6b28369b173c8efef3 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 7 Apr 2025 10:24:32 +0800 Subject: [PATCH 09/20] [feat] delete redundant output --- src/page_table.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/page_table.rs b/src/page_table.rs index 6a4873c..1550da6 100644 --- a/src/page_table.rs +++ b/src/page_table.rs @@ -92,10 +92,6 @@ pub struct GuestPageTable64 GuestPageTable64 { /// Create a new page table. pub fn construct(guest_ptw_info: &GuestPageWalkInfo) -> Self { - debug!( - "GuestPageTable64::construct CR3: {:?} level {}", - guest_ptw_info.cr3, guest_ptw_info.level - ); const PHYS_ADDR_MASK: usize = 0x000f_ffff_ffff_f000; // bits 12..52 Self { From bf96fe542955018f76916b842274d86d10057844 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 8 Apr 2025 00:28:06 +0800 Subject: [PATCH 10/20] [feat] introduce the concept of GeneralRegisters --- src/vmx/vcpu.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 994cbf3..6a666de 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -1512,6 +1512,16 @@ impl AxArchVCpu for VmxVcpu { } impl AxVcpuAccessGuestState for VmxVcpu { + type GeneralRegisters = GeneralRegisters; + + fn regs(&self) -> &Self::GeneralRegisters { + self.regs() + } + + fn regs_mut(&mut self) -> &mut Self::GeneralRegisters { + self.regs_mut() + } + fn read_gpr(&self, reg: usize) -> usize { self.regs().get_reg_of_index(reg as u8) as usize } From 8218ca4f7a1e7cae42b0f2be43609169171ef5a4 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 10 Apr 2025 11:55:30 +0800 Subject: [PATCH 11/20] [feat] refactor the settings of ia32_sysenter_xx msrs --- src/context.rs | 15 +++++++++++++++ src/regs.rs | 1 + src/vmx/vcpu.rs | 23 ++++++++++------------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/context.rs b/src/context.rs index 46a00cb..b729197 100644 --- a/src/context.rs +++ b/src/context.rs @@ -43,6 +43,11 @@ pub struct LinuxContext { pub lstar: u64, pub cstar: u64, pub fmask: u64, + + pub ia32_sysenter_cs: u64, + pub ia32_sysenter_esp: u64, + pub ia32_sysenter_eip: u64, + pub kernel_gsbase: u64, pub pat: u64, pub mtrr_def_type: u64, @@ -87,6 +92,9 @@ impl Default for LinuxContext { lstar: 0, cstar: 0, fmask: 0, + ia32_sysenter_cs: 0, + ia32_sysenter_esp: 0, + ia32_sysenter_eip: 0, kernel_gsbase: 0, pat: 0, mtrr_def_type: 0, @@ -143,6 +151,9 @@ impl LinuxContext { lstar: Msr::IA32_LSTAR.read(), cstar: Msr::IA32_CSTAR.read(), fmask: Msr::IA32_FMASK.read(), + ia32_sysenter_cs: Msr::IA32_SYSENTER_CS.read(), + ia32_sysenter_esp: Msr::IA32_SYSENTER_ESP.read(), + ia32_sysenter_eip: Msr::IA32_SYSENTER_EIP.read(), kernel_gsbase: Msr::IA32_KERNEL_GSBASE.read(), pat: Msr::IA32_PAT.read(), mtrr_def_type: Msr::IA32_MTRR_DEF_TYPE.read(), @@ -153,6 +164,10 @@ impl LinuxContext { /// Restore system registers. pub fn restore(&self) { unsafe { + Msr::IA32_SYSENTER_CS.write(self.ia32_sysenter_cs); + Msr::IA32_SYSENTER_ESP.write(self.ia32_sysenter_esp); + Msr::IA32_SYSENTER_EIP.write(self.ia32_sysenter_eip); + Msr::IA32_EFER.write(self.efer); Msr::IA32_STAR.write(self.star); Msr::IA32_LSTAR.write(self.lstar); diff --git a/src/regs.rs b/src/regs.rs index 19bc9ee..c970024 100644 --- a/src/regs.rs +++ b/src/regs.rs @@ -167,6 +167,7 @@ impl GeneralRegisters { macro_rules! save_regs_to_stack { () => { " + .code64 push r15 push r14 push r13 diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 6a666de..b0de048 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -668,9 +668,9 @@ impl VmxVcpu { VmcsGuestNW::RIP.write(linux.rip as _)?; VmcsGuestNW::RFLAGS.write(0x2)?; - VmcsGuest32::IA32_SYSENTER_CS.write(Msr::IA32_SYSENTER_CS.read() as _)?; - VmcsGuestNW::IA32_SYSENTER_ESP.write(Msr::IA32_SYSENTER_ESP.read() as _)?; - VmcsGuestNW::IA32_SYSENTER_EIP.write(Msr::IA32_SYSENTER_EIP.read() as _)?; + VmcsGuest32::IA32_SYSENTER_CS.write(linux.ia32_sysenter_cs as _)?; + VmcsGuestNW::IA32_SYSENTER_ESP.write(linux.ia32_sysenter_esp as _)?; + VmcsGuestNW::IA32_SYSENTER_EIP.write(linux.ia32_sysenter_eip as _)?; VmcsGuestNW::DR7.write(0x400)?; VmcsGuest64::IA32_DEBUGCTL.write(0)?; @@ -895,13 +895,11 @@ impl VmxVcpu { linux.idt.base = VirtAddr::new(VmcsGuestNW::IDTR_BASE.read().unwrap() as _); linux.idt.limit = VmcsGuest32::IDTR_LIMIT.read().unwrap() as _; - linux.load_guest_regs(self.regs()); + linux.ia32_sysenter_cs = VmcsGuest32::IA32_SYSENTER_CS.read().unwrap() as _; // 0x174 + linux.ia32_sysenter_esp = VmcsGuestNW::IA32_SYSENTER_ESP.read().unwrap() as _; // 0x178 + linux.ia32_sysenter_eip = VmcsGuestNW::IA32_SYSENTER_EIP.read().unwrap() as _; // 0x17a - // unsafe { - // Msr::IA32_SYSENTER_CS.write(VmcsGuest32::IA32_SYSENTER_CS.read().unwrap() as _); - // Msr::IA32_SYSENTER_ESP.write(VmcsGuestNW::IA32_SYSENTER_ESP.read().unwrap() as _); - // Msr::IA32_SYSENTER_EIP.write(VmcsGuestNW::IA32_SYSENTER_EIP.read().unwrap() as _); - // } + linux.load_guest_regs(self.regs()); } fn get_paging_level(&self) -> usize { @@ -1400,10 +1398,9 @@ impl AxArchVCpu for VmxVcpu { Self::new(Some(config)) } - fn load_host(&self) -> AxResult { - let mut linux = LinuxContext::default(); - self.load_vmcs_guest(&mut linux); - Ok(linux) + fn load_host(&self, config: &mut Self::HostConfig) -> AxResult { + self.load_vmcs_guest(config); + Ok(()) } fn set_entry(&mut self, entry: GuestPhysAddr) -> AxResult { From e9d5bcee189c54e70ff80188d33f532e129cbd6f Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Tue, 15 Apr 2025 21:23:13 +0800 Subject: [PATCH 12/20] [refactor] pass host ctx during setup --- src/page_table.rs | 17 +++++---- src/regs.rs | 9 +++++ src/vmx/vcpu.rs | 87 ++++++++++++++++++++++++++++------------------- 3 files changed, 71 insertions(+), 42 deletions(-) diff --git a/src/page_table.rs b/src/page_table.rs index 1550da6..50a8d4b 100644 --- a/src/page_table.rs +++ b/src/page_table.rs @@ -128,8 +128,11 @@ impl GuestPageTable64 GuestPageTable64 { - fn table_of<'a>(&self, gpa: GuestPhysAddr) -> &'a [PTE] { - let hpa = EPT::guest_phys_to_host_phys(gpa).unwrap(); + fn table_of<'a>(&self, gpa: GuestPhysAddr) -> PagingResult<&'a [PTE]> { + let hpa = EPT::guest_phys_to_host_phys(gpa).ok_or_else(|| { + warn!("Failed to translate GPA {:?}", gpa); + PagingError::NotMapped + })?; let ptr = H::phys_to_virt(hpa).as_ptr() as _; // debug!( @@ -137,7 +140,7 @@ impl GuestPageTable64(&self, entry: &PTE) -> PagingResult<&'a [PTE]> { @@ -146,7 +149,7 @@ impl GuestPageTable64 GuestPageTable64 { cur_xstate: XState, entry: Option, ept_root: Option, - host_ctx: Option, } impl VmxVcpu { /// Create a new [`VmxVcpu`]. - pub fn new(ctx: Option) -> AxResult { + pub fn new() -> AxResult { let vcpu = Self { - guest_regs: if let Some(ctx) = ctx { - GeneralRegisters::from_context(&ctx) - } else { - GeneralRegisters::default() - }, + guest_regs: GeneralRegisters::default(), host_stack_top: 0, launched: false, vmcs: VmxRegion::new(read_vmcs_revision_id(), false)?, @@ -193,21 +188,12 @@ impl VmxVcpu { msr_bitmap: MsrBitmap::passthrough_all()?, eptp_list: EptpList::new()?, pending_events: VecDeque::with_capacity(8), - guest_xstate: if let Some(ctx) = ctx { - ctx.xstate - } else { - XState::new() - }, + guest_xstate: XState::new(), cur_xstate: XState::new(), entry: None, ept_root: None, - host_ctx: ctx, }; - debug!( - "[HV] created {} VmxVcpu(vmcs: {:#x})", - if ctx.is_some() { "Host" } else { "Guest" }, - vcpu.vmcs.phys_addr(), - ); + debug!("[HV] created VmxVcpu(vmcs: {:#x})", vcpu.vmcs.phys_addr(),); Ok(vcpu) } @@ -452,7 +438,10 @@ impl VmxVcpu { while remained_size > 0 { let (gpa, _flags, page_size) = self.guest_page_table_query(gva).map_err(|e| { - warn!("Failed to query guest page table: {:?}", e); + warn!( + "Failed to query guest page table, GVA {:?} err {:?}", + gva, e + ); ax_err_type!(BadAddress) })?; let pgoff = page_size.align_offset(addr.into()); @@ -572,18 +561,28 @@ impl VmxVcpu { Ok(()) } - fn setup_vmcs(&mut self, entry: GuestPhysAddr, ept_root: HostPhysAddr) -> AxResult { - // If self.host_ctx.is_none(), it means this is a vcpu for guest VM. - let is_guest = self.host_ctx.is_none(); + fn setup_vmcs( + &mut self, + ept_root: HostPhysAddr, + entry: Option, + ctx: Option, + ) -> AxResult { + let mut is_guest = true; + let paddr = self.vmcs.phys_addr().as_usize() as u64; unsafe { vmx::vmclear(paddr).map_err(as_axerr)?; } self.bind_to_current_processor()?; - if is_guest { - self.setup_vmcs_guest(entry)?; + + if let Some(ctx) = ctx { + is_guest = false; + self.setup_vmcs_guest_from_ctx(ctx)?; } else { - self.setup_vmcs_guest_from_ctx()?; + self.setup_vmcs_guest(entry.ok_or_else(|| { + error!("VmxVcpu::setup_vmcs: entry is None"); + ax_err_type!(InvalidInput) + })?)?; } self.setup_vmcs_control(ept_root, is_guest)?; @@ -631,8 +630,8 @@ impl VmxVcpu { /// Indeed, this function can be combined with `setup_vmcs_guest`, /// to avoid complexity and minimize the modification, /// we just keep them separated. - fn setup_vmcs_guest_from_ctx(&mut self) -> AxResult { - let linux = self.host_ctx.expect("Host context is not set"); + fn setup_vmcs_guest_from_ctx(&mut self, host_ctx: LinuxContext) -> AxResult { + let linux = host_ctx; self.set_cr(0, linux.cr0.bits()); self.set_cr(4, linux.cr4.bits()); @@ -1388,17 +1387,13 @@ impl AxArchVCpu for VmxVcpu { type SetupConfig = (); - type HostConfig = crate::context::LinuxContext; + type HostContext = crate::context::LinuxContext; fn new(_config: Self::CreateConfig) -> AxResult { - Self::new(None) - } - - fn new_host(config: Self::HostConfig) -> AxResult { - Self::new(Some(config)) + Self::new() } - fn load_host(&self, config: &mut Self::HostConfig) -> AxResult { + fn load_context(&self, config: &mut Self::HostContext) -> AxResult { self.load_vmcs_guest(config); Ok(()) } @@ -1415,7 +1410,12 @@ impl AxArchVCpu for VmxVcpu { } fn setup(&mut self, _config: Self::SetupConfig) -> AxResult { - self.setup_vmcs(self.entry.unwrap(), self.ept_root.unwrap()) + self.setup_vmcs(self.ept_root.unwrap(), self.entry, None) + } + + fn setup_from_context(&mut self, ctx: Self::HostContext) -> AxResult { + self.guest_regs.load_from_context(&ctx); + self.setup_vmcs(self.ept_root.unwrap(), None, Some(ctx)) } fn run(&mut self) -> AxResult { @@ -1483,6 +1483,23 @@ impl AxArchVCpu for VmxVcpu { } } } + VmxExitReason::EPT_VIOLATION => { + let ept_info = self.nested_page_fault_info()?; + + warn!("VMX EPT-Exit: {:#x?} of {:#x?}", ept_info, exit_info); + + warn!("Vcpu {:#x?}", self); + + self.decode_instruction( + GuestVirtAddr::from_usize(exit_info.guest_rip), + exit_info.exit_instruction_length as _, + )?; + + AxVCpuExitReason::NestedPageFault { + addr: ept_info.fault_guest_paddr, + access_flags: ept_info.access_flags, + } + } _ => { warn!("VMX unsupported VM-Exit: {:#x?}", exit_info); warn!("VCpu {:#x?}", self); From d00cf05e58132a90459cc04b89476ff46ac3861f Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 17 Apr 2025 00:00:44 +0800 Subject: [PATCH 13/20] [fix] bug in read_guest_memory --- src/vmx/vcpu.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 9526bd3..5880b72 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -452,7 +452,7 @@ impl VmxVcpu { if let Some(hpa) = H::EPTTranslator::guest_phys_to_host_phys(gpa) { let hva_ptr = H::PagingHandler::phys_to_virt(hpa).as_ptr(); for i in 0..read_size { - content.push(unsafe { hva_ptr.add(pgoff + i).read() }); + content.push(unsafe { hva_ptr.add(i).read() }); } } else { return ax_err!(BadAddress); @@ -1483,7 +1483,7 @@ impl AxArchVCpu for VmxVcpu { } } } - VmxExitReason::EPT_VIOLATION => { + VmxExitReason::EPT_VIOLATION | VmxExitReason::TRIPLE_FAULT => { let ept_info = self.nested_page_fault_info()?; warn!("VMX EPT-Exit: {:#x?} of {:#x?}", ept_info, exit_info); From 1999a0ba34d328f47f92f7ea4702a2bfbf3271a0 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 17 Apr 2025 21:15:07 +0800 Subject: [PATCH 14/20] [feat] introduce vcpu id, maintain cs and ss access rights --- src/vmx/vcpu.rs | 149 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 116 insertions(+), 33 deletions(-) diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 5880b72..2b2e812 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -32,7 +32,7 @@ use super::vmcs::{ use crate::LinuxContext; use crate::page_table::GuestPageTable64; use crate::page_table::GuestPageWalkInfo; -use crate::segmentation::Segment; +use crate::segmentation::{Segment, SegmentAccessRights}; use crate::xstate::XState; use crate::{msr::Msr, regs::GeneralRegisters}; @@ -174,11 +174,13 @@ pub struct VmxVcpu { cur_xstate: XState, entry: Option, ept_root: Option, + + id: usize, } impl VmxVcpu { /// Create a new [`VmxVcpu`]. - pub fn new() -> AxResult { + pub fn new(id: usize) -> AxResult { let vcpu = Self { guest_regs: GeneralRegisters::default(), host_stack_top: 0, @@ -192,6 +194,7 @@ impl VmxVcpu { cur_xstate: XState::new(), entry: None, ept_root: None, + id, }; debug!("[HV] created VmxVcpu(vmcs: {:#x})", vcpu.vmcs.phys_addr(),); Ok(vcpu) @@ -649,6 +652,11 @@ impl VmxVcpu { }}; } + debug!( + "setup_vmcs_guest_from_ctx: CS access rights: {:?}", + linux.cs.access_rights + ); + set_guest_segment!(linux.es, ES); set_guest_segment!(linux.cs, CS); set_guest_segment!(linux.ss, SS); @@ -872,33 +880,42 @@ impl VmxVcpu { Ok(()) } - fn load_vmcs_guest(&self, linux: &mut LinuxContext) { - linux.rip = VmcsGuestNW::RIP.read().unwrap() as _; - linux.rsp = VmcsGuestNW::RSP.read().unwrap() as _; - linux.cr0 = Cr0Flags::from_bits_truncate(VmcsGuestNW::CR0.read().unwrap() as _); - linux.cr3 = VmcsGuestNW::CR3.read().unwrap() as _; - linux.cr4 = Cr4Flags::from_bits_truncate(VmcsGuestNW::CR4.read().unwrap() as _); - - linux.es.selector = SegmentSelector::from_raw(VmcsGuest16::ES_SELECTOR.read().unwrap()); - linux.cs.selector = SegmentSelector::from_raw(VmcsGuest16::CS_SELECTOR.read().unwrap()); - linux.ss.selector = SegmentSelector::from_raw(VmcsGuest16::SS_SELECTOR.read().unwrap()); - linux.ds.selector = SegmentSelector::from_raw(VmcsGuest16::DS_SELECTOR.read().unwrap()); - linux.fs.selector = SegmentSelector::from_raw(VmcsGuest16::FS_SELECTOR.read().unwrap()); - linux.fs.base = VmcsGuestNW::FS_BASE.read().unwrap() as _; - linux.gs.selector = SegmentSelector::from_raw(VmcsGuest16::GS_SELECTOR.read().unwrap()); - linux.gs.base = VmcsGuestNW::GS_BASE.read().unwrap() as _; - linux.tss.selector = SegmentSelector::from_raw(VmcsGuest16::TR_SELECTOR.read().unwrap()); - - linux.gdt.base = VirtAddr::new(VmcsGuestNW::GDTR_BASE.read().unwrap() as _); - linux.gdt.limit = VmcsGuest32::GDTR_LIMIT.read().unwrap() as _; - linux.idt.base = VirtAddr::new(VmcsGuestNW::IDTR_BASE.read().unwrap() as _); - linux.idt.limit = VmcsGuest32::IDTR_LIMIT.read().unwrap() as _; - - linux.ia32_sysenter_cs = VmcsGuest32::IA32_SYSENTER_CS.read().unwrap() as _; // 0x174 - linux.ia32_sysenter_esp = VmcsGuestNW::IA32_SYSENTER_ESP.read().unwrap() as _; // 0x178 - linux.ia32_sysenter_eip = VmcsGuestNW::IA32_SYSENTER_EIP.read().unwrap() as _; // 0x17a + fn load_vmcs_guest(&self, linux: &mut LinuxContext) -> AxResult { + linux.rip = VmcsGuestNW::RIP.read()? as _; + linux.rsp = VmcsGuestNW::RSP.read()? as _; + linux.cr0 = Cr0Flags::from_bits_truncate(VmcsGuestNW::CR0.read()? as _); + linux.cr3 = VmcsGuestNW::CR3.read()? as _; + linux.cr4 = Cr4Flags::from_bits_truncate(VmcsGuestNW::CR4.read()? as _); + + linux.es.selector = SegmentSelector::from_raw(VmcsGuest16::ES_SELECTOR.read()?); + + linux.cs.selector = SegmentSelector::from_raw(VmcsGuest16::CS_SELECTOR.read()?); + // CS: + // If the Type is 9 or 11 (non-conforming code segment), the DPL must equal the DPL in the access-rights field for SS. + linux.cs.access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::CS_ACCESS_RIGHTS.read()?); + linux.ss.selector = SegmentSelector::from_raw(VmcsGuest16::SS_SELECTOR.read()?); + linux.ss.access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::SS_ACCESS_RIGHTS.read()?); + + linux.ds.selector = SegmentSelector::from_raw(VmcsGuest16::DS_SELECTOR.read()?); + linux.fs.selector = SegmentSelector::from_raw(VmcsGuest16::FS_SELECTOR.read()?); + linux.fs.base = VmcsGuestNW::FS_BASE.read()? as _; + linux.gs.selector = SegmentSelector::from_raw(VmcsGuest16::GS_SELECTOR.read()?); + linux.gs.base = VmcsGuestNW::GS_BASE.read()? as _; + linux.tss.selector = SegmentSelector::from_raw(VmcsGuest16::TR_SELECTOR.read()?); + + linux.gdt.base = VirtAddr::new(VmcsGuestNW::GDTR_BASE.read()? as _); + linux.gdt.limit = VmcsGuest32::GDTR_LIMIT.read()? as _; + linux.idt.base = VirtAddr::new(VmcsGuestNW::IDTR_BASE.read()? as _); + linux.idt.limit = VmcsGuest32::IDTR_LIMIT.read()? as _; + + linux.ia32_sysenter_cs = VmcsGuest32::IA32_SYSENTER_CS.read()? as _; // 0x174 + linux.ia32_sysenter_esp = VmcsGuestNW::IA32_SYSENTER_ESP.read()? as _; // 0x178 + linux.ia32_sysenter_eip = VmcsGuestNW::IA32_SYSENTER_EIP.read()? as _; // 0x17a linux.load_guest_regs(self.regs()); + Ok(()) } fn get_paging_level(&self) -> usize { @@ -1364,6 +1381,33 @@ fn get_tr_base(tr: SegmentSelector, gdt: &DescriptorTablePointer) -> u64 { impl Debug for VmxVcpu { fn fmt(&self, f: &mut Formatter) -> Result { (|| -> AxResult { + let cs_selector = SegmentSelector::from_raw(VmcsGuest16::CS_SELECTOR.read()?); + let cs_access_rights_raw = VmcsGuest32::CS_ACCESS_RIGHTS.read()?; + let cs_access_rights = SegmentAccessRights::from_bits_truncate(cs_access_rights_raw); + let ss_selector = SegmentSelector::from_raw(VmcsGuest16::SS_SELECTOR.read()?); + let ss_access_rights_raw = VmcsGuest32::SS_ACCESS_RIGHTS.read()?; + let ss_access_rights = SegmentAccessRights::from_bits_truncate(ss_access_rights_raw); + let ds_selector = SegmentSelector::from_raw(VmcsGuest16::DS_SELECTOR.read()?); + let ds_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::DS_ACCESS_RIGHTS.read()?); + let fs_selector = SegmentSelector::from_raw(VmcsGuest16::FS_SELECTOR.read()?); + let fs_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::FS_ACCESS_RIGHTS.read()?); + let gs_selector = SegmentSelector::from_raw(VmcsGuest16::GS_SELECTOR.read()?); + let gs_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::GS_ACCESS_RIGHTS.read()?); + let tr_selector = SegmentSelector::from_raw(VmcsGuest16::TR_SELECTOR.read()?); + let tr_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::TR_ACCESS_RIGHTS.read()?); + let gdt_base = VirtAddr::new(VmcsGuestNW::GDTR_BASE.read()? as _); + let gdt_limit = VmcsGuest32::GDTR_LIMIT.read()?; + let idt_base = VirtAddr::new(VmcsGuestNW::IDTR_BASE.read()? as _); + let idt_limit = VmcsGuest32::IDTR_LIMIT.read()?; + + let ia32_sysenter_cs = VmcsGuest32::IA32_SYSENTER_CS.read()?; + let ia32_sysenter_esp = VmcsGuestNW::IA32_SYSENTER_ESP.read()?; + let ia32_sysenter_eip = VmcsGuestNW::IA32_SYSENTER_EIP.read()?; + Ok(f.debug_struct("VmxVcpu") .field("guest_regs", &self.guest_regs) .field("rip", &VmcsGuestNW::RIP.read()?) @@ -1372,10 +1416,32 @@ impl Debug for VmxVcpu { .field("cr0", &VmcsGuestNW::CR0.read()?) .field("cr3", &VmcsGuestNW::CR3.read()?) .field("cr4", &VmcsGuestNW::CR4.read()?) - .field("cs", &VmcsGuest16::CS_SELECTOR.read()?) + .field("cs_base", &VmcsGuestNW::CS_BASE.read()?) + .field("cs_selector", &cs_selector) + .field("cs_access_rights", &cs_access_rights) + .field("cs_access_rights_raw", &cs_access_rights_raw) + .field("ss_base", &VmcsGuestNW::SS_BASE.read()?) + .field("ss_selector", &ss_selector) + .field("ss_access_rights_raw", &ss_access_rights_raw) + .field("ss_access_rights", &ss_access_rights) + .field("ds_base", &VmcsGuestNW::DS_BASE.read()?) + .field("ds_selector", &ds_selector) + .field("ds_access_rights", &ds_access_rights) .field("fs_base", &VmcsGuestNW::FS_BASE.read()?) + .field("fs_selector", &fs_selector) + .field("fs_access_rights", &fs_access_rights) .field("gs_base", &VmcsGuestNW::GS_BASE.read()?) - .field("tss", &VmcsGuest16::TR_SELECTOR.read()?) + .field("gs_selector", &gs_selector) + .field("gs_access_rights", &gs_access_rights) + .field("tr_selector", &tr_selector) + .field("tr_access_rights", &tr_access_rights) + .field("gdt_base", &gdt_base) + .field("gdt_limit", &gdt_limit) + .field("idt_base", &idt_base) + .field("idt_limit", &idt_limit) + .field("ia32_sysenter_cs", &ia32_sysenter_cs) + .field("ia32_sysenter_esp", &ia32_sysenter_esp) + .field("ia32_sysenter_eip", &ia32_sysenter_eip) .finish()) })() .unwrap() @@ -1383,18 +1449,20 @@ impl Debug for VmxVcpu { } impl AxArchVCpu for VmxVcpu { - type CreateConfig = (); + type CreateConfig = usize; type SetupConfig = (); type HostContext = crate::context::LinuxContext; - fn new(_config: Self::CreateConfig) -> AxResult { - Self::new() + fn new(id: Self::CreateConfig) -> AxResult { + Self::new(id) } fn load_context(&self, config: &mut Self::HostContext) -> AxResult { - self.load_vmcs_guest(config); + info!("Loading context {:#x?}", self); + + self.load_vmcs_guest(config)?; Ok(()) } @@ -1419,8 +1487,23 @@ impl AxArchVCpu for VmxVcpu { } fn run(&mut self) -> AxResult { + if self.id == 3 { + warn!("Instance vcpu run {:#x?}", self); + } + match self.inner_run() { Some(exit_info) => Ok(if exit_info.entry_failure { + match exit_info.exit_reason { + VmxExitReason::INVALID_GUEST_STATE + | VmxExitReason::MCE_DURING_VMENTRY + | VmxExitReason::MSR_LOAD_FAIL => {} + _ => { + error!("Invalid exit reasion when entry failure: {:#x?}", exit_info); + } + }; + + warn!("VMX entry failure: {:#x?}", exit_info); + AxVCpuExitReason::FailEntry { // Todo: get `hardware_entry_failure_reason` somehow. hardware_entry_failure_reason: 0, From 388fc8372f05561a3f31e9aa97d5c44d62da2b25 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 18 Apr 2025 19:11:47 +0800 Subject: [PATCH 15/20] [refactor] remove redundant eptp related apis --- src/vmx/structs.rs | 31 -------------------------- src/vmx/vcpu.rs | 54 ++++++---------------------------------------- src/vmx/vmcs.rs | 4 ++++ 3 files changed, 10 insertions(+), 79 deletions(-) diff --git a/src/vmx/structs.rs b/src/vmx/structs.rs index 72e642c..1056ed3 100644 --- a/src/vmx/structs.rs +++ b/src/vmx/structs.rs @@ -271,8 +271,6 @@ impl EPTPointer { } } -pub const EPTP_LIST_SIZE: usize = 512; - /// EPTP list, the 4-KByte structure, /// The EPTP list comprises 512 8-Byte entries (each an EPTP value) /// and is used by the EPTP-switching VM function (see Section 26.5.6.3). @@ -290,33 +288,4 @@ impl EptpList { pub fn phys_addr(&self) -> HostPhysAddr { self.frame.start_paddr() } - - pub fn set_entry(&mut self, idx: usize, eptp: EPTPointer) { - assert!(idx < EPTP_LIST_SIZE); - // Todo: validate eptp refer to 26.5.6.3 EPTP Switching. - let ptr = self.frame.as_mut_ptr() as *mut u64; - unsafe { - ptr.add(idx).write(eptp.bits()); - } - } - - pub fn entry_is_set(&self, idx: usize) -> bool { - assert!(idx < EPTP_LIST_SIZE); - let ptr = self.frame.as_mut_ptr() as *const u64; - unsafe { ptr.add(idx).read() != 0 } - } - - pub fn get_entry(&self, idx: usize) -> EPTPointer { - assert!(idx < EPTP_LIST_SIZE); - let ptr = self.frame.as_mut_ptr() as *const u64; - unsafe { EPTPointer::from_bits_truncate(ptr.add(idx).read()) } - } - - pub fn remove_entry(&mut self, idx: usize) { - assert!(idx < EPTP_LIST_SIZE); - let ptr = self.frame.as_mut_ptr() as *mut u64; - unsafe { - ptr.add(idx).write(0); - } - } } diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 2b2e812..2be345c 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -24,7 +24,7 @@ use super::VmxExitInfo; use super::as_axerr; use super::definitions::VmxExitReason; use super::read_vmcs_revision_id; -use super::structs::{EPTP_LIST_SIZE, EPTPointer, EptpList, IOBitmap, MsrBitmap, VmxRegion}; +use super::structs::{EptpList, IOBitmap, MsrBitmap, VmxRegion}; use super::vmcs::{ self, VmcsControl32, VmcsControl64, VmcsControlNW, VmcsGuest16, VmcsGuest32, VmcsGuest64, VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, interrupt_exit_info, @@ -861,11 +861,6 @@ impl VmxVcpu { // Bit 0: EPTP switching VmcsControl64::VM_FUNCTION_CONTROLS.write(0b1)?; - assert!( - self.eptp_list.entry_is_set(0), - "The First EPTP list entry must be set as initial EPTP." - ); - VmcsControl64::EPTP_LIST_ADDR.write(self.eptp_list.phys_addr().as_usize() as _)?; // Pass-through exceptions (except #UD(6)), don't use I/O bitmap, set MSR bitmaps. @@ -1473,7 +1468,6 @@ impl AxArchVCpu for VmxVcpu { fn set_ept_root(&mut self, ept_root: HostPhysAddr) -> AxResult { self.ept_root = Some(ept_root); - self.append_eptp_list(0, ept_root)?; Ok(()) } @@ -1487,10 +1481,6 @@ impl AxArchVCpu for VmxVcpu { } fn run(&mut self) -> AxResult { - if self.id == 3 { - warn!("Instance vcpu run {:#x?}", self); - } - match self.inner_run() { Some(exit_info) => Ok(if exit_info.entry_failure { match exit_info.exit_reason { @@ -1503,6 +1493,7 @@ impl AxArchVCpu for VmxVcpu { }; warn!("VMX entry failure: {:#x?}", exit_info); + warn!("VCpu {:#x?}", self); AxVCpuExitReason::FailEntry { // Todo: get `hardware_entry_failure_reason` somehow. @@ -1677,44 +1668,11 @@ impl AxVcpuAccessGuestState for VmxVcpu { self.guest_page_table_query(gva).ok() } - fn append_eptp_list(&mut self, idx: usize, eptp: HostPhysAddr) -> AxResult { - if idx >= EPTP_LIST_SIZE { - return ax_err!(InvalidInput); - } - - if self.eptp_list.entry_is_set(idx) { - return ax_err!(InvalidInput); - } - - self.eptp_list - .set_entry(idx, EPTPointer::from_table_phys(eptp)); - Ok(()) + fn current_ept_root(&self) -> HostPhysAddr { + vmcs::get_ept_pointer() } - fn remove_eptp_list_entry(&mut self, idx: usize) -> AxResult { - if idx >= EPTP_LIST_SIZE { - return ax_err!(InvalidInput); - } - if !self.eptp_list.entry_is_set(idx) { - return ax_err!(InvalidInput); - } - - self.eptp_list.remove_entry(idx); - - Ok(()) - } - - fn get_eptp_list_entry(&self, idx: usize) -> AxResult { - if idx >= EPTP_LIST_SIZE { - return ax_err!(InvalidInput); - } - if !self.eptp_list.entry_is_set(idx) { - return ax_err!(InvalidInput); - } - let entry = self.eptp_list.get_entry(idx); - - Ok(HostPhysAddr::from_usize(memory_addr::align_down_4k( - entry.bits() as _, - ))) + fn eptp_list_region(&self) -> HostPhysAddr { + self.eptp_list.phys_addr() } } diff --git a/src/vmx/vmcs.rs b/src/vmx/vmcs.rs index 4b31463..f681245 100644 --- a/src/vmx/vmcs.rs +++ b/src/vmx/vmcs.rs @@ -638,6 +638,10 @@ pub fn set_ept_pointer(pml4_paddr: HostPhysAddr) -> AxResult { Ok(()) } +pub fn get_ept_pointer() -> HostPhysAddr { + HostPhysAddr::from(VmcsControl64::EPTP.read().expect("Failed to read EPTP") as usize) +} + pub fn instruction_error() -> VmxInstructionError { VmcsReadOnly32::VM_INSTRUCTION_ERROR.read().unwrap().into() } From 13c4be7f4c173a1e70a868dbf7cc8a8484caedb9 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Mon, 21 Apr 2025 11:07:55 +0800 Subject: [PATCH 16/20] [refactor] modify guest_phys_to_host_phys --- src/page_table.rs | 15 ++++++--------- src/vmx/vcpu.rs | 2 +- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/page_table.rs b/src/page_table.rs index 50a8d4b..3dce2d9 100644 --- a/src/page_table.rs +++ b/src/page_table.rs @@ -129,17 +129,14 @@ impl GuestPageTable64 GuestPageTable64 { fn table_of<'a>(&self, gpa: GuestPhysAddr) -> PagingResult<&'a [PTE]> { - let hpa = EPT::guest_phys_to_host_phys(gpa).ok_or_else(|| { - warn!("Failed to translate GPA {:?}", gpa); - PagingError::NotMapped - })?; + let hpa = EPT::guest_phys_to_host_phys(gpa) + .map(|(hpa, _flags, _pgsize)| hpa) + .ok_or_else(|| { + warn!("Failed to translate GPA {:?}", gpa); + PagingError::NotMapped + })?; let ptr = H::phys_to_virt(hpa).as_ptr() as _; - // debug!( - // "GuestPageTable64::table_of gpa: {:?} hpa: {:?} ptr: {:p}", - // gpa, hpa, ptr - // ); - Ok(unsafe { core::slice::from_raw_parts(ptr, ENTRY_COUNT) }) } diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 2be345c..382965e 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -452,7 +452,7 @@ impl VmxVcpu { addr += read_size; remained_size -= read_size; - if let Some(hpa) = H::EPTTranslator::guest_phys_to_host_phys(gpa) { + if let Some((hpa, _flags, _pgsize)) = H::EPTTranslator::guest_phys_to_host_phys(gpa) { let hva_ptr = H::PagingHandler::phys_to_virt(hpa).as_ptr(); for i in 0..read_size { content.push(unsafe { hva_ptr.add(i).read() }); From 5f58a3f5ab1a011a82dea65d1328071873e2676f Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Wed, 23 Apr 2025 09:36:40 +0800 Subject: [PATCH 17/20] [feat] improve features setting in SecondaryControls --- src/vmx/vcpu.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 382965e..7926a78 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -783,25 +783,30 @@ impl VmxVcpu { // Enable EPT, RDTSCP, INVPCID, and unrestricted guest. use SecondaryControls as CpuCtrl2; - let mut val = CpuCtrl2::ENABLE_EPT - | CpuCtrl2::UNRESTRICTED_GUEST - | CpuCtrl2::ENABLE_USER_WAIT_PAUSE - | CpuCtrl2::ENABLE_VM_FUNCTIONS; + let mut val = + CpuCtrl2::ENABLE_EPT | CpuCtrl2::UNRESTRICTED_GUEST | CpuCtrl2::ENABLE_VM_FUNCTIONS; + if let Some(features) = raw_cpuid.get_extended_processor_and_feature_identifiers() { if features.has_rdtscp() { val |= CpuCtrl2::ENABLE_RDTSCP; } } + if let Some(features) = raw_cpuid.get_extended_feature_info() { if features.has_invpcid() { val |= CpuCtrl2::ENABLE_INVPCID; } + if features.has_waitpkg() { + val |= CpuCtrl2::ENABLE_USER_WAIT_PAUSE; + } } + if let Some(features) = raw_cpuid.get_extended_state_info() { if features.has_xsaves_xrstors() { val |= CpuCtrl2::ENABLE_XSAVES_XRSTORS; } } + vmcs::set_control( VmcsControl32::SECONDARY_PROCBASED_EXEC_CONTROLS, Msr::IA32_VMX_PROCBASED_CTLS2, From 09256165870298a0cd9e887abbe2d5841ac8704b Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Fri, 25 Apr 2025 11:42:21 +0800 Subject: [PATCH 18/20] [feat] add exit_qualification api --- src/page_table.rs | 3 +++ src/vmx/vcpu.rs | 41 +++++++++++++++-------------------------- src/vmx/vmcs.rs | 4 ++++ 3 files changed, 22 insertions(+), 26 deletions(-) diff --git a/src/page_table.rs b/src/page_table.rs index 3dce2d9..4263c87 100644 --- a/src/page_table.rs +++ b/src/page_table.rs @@ -119,6 +119,7 @@ impl GuestPageTable64 PagingResult<(GuestPhysAddr, MappingFlags, PageSize)> { let (entry, size) = self.get_entry(vaddr)?; if entry.is_unused() { + error!("GuestPT64 query {:?} Entry is unused", vaddr); return Err(PagingError::NotMapped); } let off = size.align_offset(vaddr.into()); @@ -142,8 +143,10 @@ impl GuestPageTable64(&self, entry: &PTE) -> PagingResult<&'a [PTE]> { if !entry.is_present() { + error!("GuestPT64 next_table {:?} Entry is not present", entry); Err(PagingError::NotMapped) } else if entry.is_huge() { + error!("GuestPT64 next_table {:?} Entry is huge", entry); Err(PagingError::MappedToHugePage) } else { self.table_of(entry.paddr().into()) diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 7926a78..98aebc5 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -2,7 +2,6 @@ use alloc::collections::VecDeque; use alloc::vec::Vec; use core::fmt::{Debug, Formatter, Result}; use core::{arch::naked_asm, mem::size_of}; -use x86_64::VirtAddr; use bit_field::BitField; use raw_cpuid::CpuId; @@ -10,6 +9,7 @@ use x86::bits64::vmx; use x86::controlregs::Xcr0; use x86::dtables::{self, DescriptorTablePointer}; use x86::segmentation::SegmentSelector; +use x86_64::VirtAddr; use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr4, Cr4Flags, EferFlags}; use page_table_entry::x86_64::X64PTE; @@ -27,7 +27,8 @@ use super::read_vmcs_revision_id; use super::structs::{EptpList, IOBitmap, MsrBitmap, VmxRegion}; use super::vmcs::{ self, VmcsControl32, VmcsControl64, VmcsControlNW, VmcsGuest16, VmcsGuest32, VmcsGuest64, - VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, interrupt_exit_info, + VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, exit_qualification, + interrupt_exit_info, }; use crate::LinuxContext; use crate::page_table::GuestPageTable64; @@ -467,7 +468,7 @@ impl VmxVcpu { pub fn decode_instruction(&self, rip: GuestVirtAddr, instr_len: usize) -> AxResult { use alloc::string::String; - use iced_x86::{Decoder, DecoderOptions, Formatter, IntelFormatter, MasmFormatter}; + use iced_x86::{Decoder, DecoderOptions, Formatter, IntelFormatter}; let bytes = self.read_guest_memory(rip, instr_len)?; let mut decoder = Decoder::with_ip( @@ -477,21 +478,11 @@ impl VmxVcpu { DecoderOptions::NONE, ); let instr = decoder.decode(); - - debug!("Decoded instruction: {:#x?}", instr); - let mut output = String::new(); let mut formattor = IntelFormatter::new(); formattor.format(&instr, &mut output); - debug!("Decoded instruction Intel formatter: {}", output); - - let mut output = String::new(); - let mut formattor = MasmFormatter::new(); - formattor.format(&instr, &mut output); - - debug!("Decoded instruction MasmFormatter: {}", output); - + debug!("Decoded instruction @Intel formatter: {}", output); Ok(()) } } @@ -652,11 +643,6 @@ impl VmxVcpu { }}; } - debug!( - "setup_vmcs_guest_from_ctx: CS access rights: {:?}", - linux.cs.access_rights - ); - set_guest_segment!(linux.es, ES); set_guest_segment!(linux.cs, CS); set_guest_segment!(linux.ss, SS); @@ -1460,7 +1446,7 @@ impl AxArchVCpu for VmxVcpu { } fn load_context(&self, config: &mut Self::HostContext) -> AxResult { - info!("Loading context {:#x?}", self); + // info!("Loading context {:#x?}", self); self.load_vmcs_guest(config)?; Ok(()) @@ -1497,7 +1483,10 @@ impl AxArchVCpu for VmxVcpu { } }; + let exit_qualification = exit_qualification()?; + warn!("VMX entry failure: {:#x?}", exit_info); + warn!("Exit qualification: {:#x?}", exit_qualification); warn!("VCpu {:#x?}", self); AxVCpuExitReason::FailEntry { @@ -1562,13 +1551,8 @@ impl AxArchVCpu for VmxVcpu { } } } - VmxExitReason::EPT_VIOLATION | VmxExitReason::TRIPLE_FAULT => { + VmxExitReason::EPT_VIOLATION => { let ept_info = self.nested_page_fault_info()?; - - warn!("VMX EPT-Exit: {:#x?} of {:#x?}", ept_info, exit_info); - - warn!("Vcpu {:#x?}", self); - self.decode_instruction( GuestVirtAddr::from_usize(exit_info.guest_rip), exit_info.exit_instruction_length as _, @@ -1579,6 +1563,11 @@ impl AxArchVCpu for VmxVcpu { access_flags: ept_info.access_flags, } } + VmxExitReason::TRIPLE_FAULT => { + error!("VMX triple fault: {:#x?}", exit_info); + error!("VCpu {:#x?}", self); + AxVCpuExitReason::Halt + } _ => { warn!("VMX unsupported VM-Exit: {:#x?}", exit_info); warn!("VCpu {:#x?}", self); diff --git a/src/vmx/vmcs.rs b/src/vmx/vmcs.rs index f681245..8b6e648 100644 --- a/src/vmx/vmcs.rs +++ b/src/vmx/vmcs.rs @@ -776,3 +776,7 @@ pub fn cr_access_info() -> AxResult { lmsw_source_data: qualification.get_bits(16..32) as u8, }) } + +pub fn exit_qualification() -> AxResult { + VmcsReadOnlyNW::EXIT_QUALIFICATION.read() +} From ab5d872a501f239997885c8f6abb88871dadf504 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Wed, 30 Apr 2025 20:23:53 +0800 Subject: [PATCH 19/20] [feat] support construct_guest64 for LinuxContext --- src/context.rs | 94 +++++++++++++++++++++++++++++++++++++++++++++---- src/vmx/vcpu.rs | 2 +- 2 files changed, 89 insertions(+), 7 deletions(-) diff --git a/src/context.rs b/src/context.rs index b729197..cc8d6c9 100644 --- a/src/context.rs +++ b/src/context.rs @@ -1,7 +1,8 @@ -use x86::{segmentation, task}; +use x86::segmentation::SegmentSelector; +use x86::{Ring, segmentation, task}; use x86_64::VirtAddr; use x86_64::instructions::tables::{lgdt, lidt, sidt}; -use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr3Flags, Cr4, Cr4Flags}; +use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr3Flags, Cr4, Cr4Flags, Efer, EferFlags}; use x86_64::structures::DescriptorTablePointer; use x86_64::{addr::PhysAddr, structures::paging::PhysFrame}; @@ -38,7 +39,7 @@ pub struct LinuxContext { pub cr3: u64, pub cr4: Cr4Flags, - pub efer: u64, + pub efer: EferFlags, pub star: u64, pub lstar: u64, pub cstar: u64, @@ -87,7 +88,7 @@ impl Default for LinuxContext { cr0: Cr0Flags::empty(), cr3: 0, cr4: Cr4Flags::empty(), - efer: 0, + efer: EferFlags::empty(), star: 0, lstar: 0, cstar: 0, @@ -146,7 +147,7 @@ impl LinuxContext { cr0: Cr0::read(), cr3: Cr3::read().0.start_address().as_u64(), cr4: Cr4::read(), - efer: Msr::IA32_EFER.read(), + efer: Efer::read(), star: Msr::IA32_STAR.read(), lstar: Msr::IA32_LSTAR.read(), cstar: Msr::IA32_CSTAR.read(), @@ -161,6 +162,87 @@ impl LinuxContext { } } + pub fn construct_guest64(rip: u64, cr3: u64) -> Self { + Self { + rsp: 0, + rip, + r15: 0, + r14: 0, + r13: 0, + r12: 0, + rbx: 0, + rbp: 0, + es: Segment::invalid(), + cs: Segment { + selector: SegmentSelector::new(1, Ring::Ring0), + base: 0, + limit: 0xffff, + access_rights: SegmentAccessRights::ACCESSED + | SegmentAccessRights::WRITABLE + | SegmentAccessRights::EXECUTABLE + | SegmentAccessRights::CODE_DATA + | SegmentAccessRights::PRESENT + | SegmentAccessRights::LONG_MODE + | SegmentAccessRights::GRANULARITY, + }, + ss: Segment { + selector: SegmentSelector::new(2, Ring::Ring0), + base: 0, + limit: 0xffff, + access_rights: SegmentAccessRights::ACCESSED + | SegmentAccessRights::WRITABLE + | SegmentAccessRights::CODE_DATA + | SegmentAccessRights::PRESENT + | SegmentAccessRights::DB + | SegmentAccessRights::GRANULARITY, + }, + ds: Segment::invalid(), + fs: Segment::invalid(), + gs: Segment::invalid(), + tss: Segment { + selector: SegmentSelector::new(2, Ring::Ring0), + base: 0, + limit: 0, + access_rights: SegmentAccessRights::ACCESSED + | SegmentAccessRights::WRITABLE + | SegmentAccessRights::EXECUTABLE + | SegmentAccessRights::PRESENT, + }, + gdt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + idt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + cr0: Cr0Flags::PROTECTED_MODE_ENABLE + | Cr0Flags::MONITOR_COPROCESSOR + | Cr0Flags::EXTENSION_TYPE + | Cr0Flags::NUMERIC_ERROR + | Cr0Flags::WRITE_PROTECT + | Cr0Flags::ALIGNMENT_MASK + | Cr0Flags::PAGING, + cr3, + cr4: Cr4Flags::PHYSICAL_ADDRESS_EXTENSION | Cr4Flags::PAGE_GLOBAL, + efer: EferFlags::LONG_MODE_ENABLE + | EferFlags::LONG_MODE_ACTIVE + | EferFlags::NO_EXECUTE_ENABLE + | EferFlags::SYSTEM_CALL_EXTENSIONS, + star: 0, + lstar: 0, + cstar: 0, + fmask: 0, + ia32_sysenter_cs: 0, + ia32_sysenter_esp: 0, + ia32_sysenter_eip: 0, + kernel_gsbase: 0, + pat: 0, + mtrr_def_type: 0, + xstate: XState::default(), + } + } + /// Restore system registers. pub fn restore(&self) { unsafe { @@ -168,7 +250,7 @@ impl LinuxContext { Msr::IA32_SYSENTER_ESP.write(self.ia32_sysenter_esp); Msr::IA32_SYSENTER_EIP.write(self.ia32_sysenter_eip); - Msr::IA32_EFER.write(self.efer); + Efer::write(self.efer); Msr::IA32_STAR.write(self.star); Msr::IA32_LSTAR.write(self.lstar); Msr::IA32_CSTAR.write(self.cstar); diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 98aebc5..8316f6f 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -676,7 +676,7 @@ impl VmxVcpu { VmcsGuest32::VMX_PREEMPTION_TIMER_VALUE.write(0)?; VmcsGuest64::IA32_PAT.write(linux.pat)?; - VmcsGuest64::IA32_EFER.write(linux.efer)?; + VmcsGuest64::IA32_EFER.write(linux.efer.bits())?; Ok(()) } From 3806bce2d70a5b9fe67ddeb9e8f4fa4e7e9d6155 Mon Sep 17 00:00:00 2001 From: hky1999 <976929993@qq.com> Date: Thu, 1 May 2025 16:50:39 +0800 Subject: [PATCH 20/20] [fixme] rebase, problem in xstate --- src/context.rs | 7 +-- src/vmx/vcpu.rs | 116 +++--------------------------------------------- src/xstate.rs | 112 +++++++++++++++++++++++++++++++++++----------- 3 files changed, 92 insertions(+), 143 deletions(-) diff --git a/src/context.rs b/src/context.rs index cc8d6c9..f72ec9a 100644 --- a/src/context.rs +++ b/src/context.rs @@ -9,7 +9,6 @@ use x86_64::{addr::PhysAddr, structures::paging::PhysFrame}; use crate::msr::Msr; use crate::regs::GeneralRegisters; use crate::segmentation::{Segment, SegmentAccessRights}; -use crate::xstate::XState; const SAVED_LINUX_REGS: usize = 8; @@ -52,8 +51,7 @@ pub struct LinuxContext { pub kernel_gsbase: u64, pub pat: u64, pub mtrr_def_type: u64, - - pub xstate: XState, + // TODO: xstate } unsafe impl Send for LinuxContext {} @@ -99,7 +97,6 @@ impl Default for LinuxContext { kernel_gsbase: 0, pat: 0, mtrr_def_type: 0, - xstate: XState::default(), } } } @@ -158,7 +155,6 @@ impl LinuxContext { kernel_gsbase: Msr::IA32_KERNEL_GSBASE.read(), pat: Msr::IA32_PAT.read(), mtrr_def_type: Msr::IA32_MTRR_DEF_TYPE.read(), - xstate: XState::new(), } } @@ -239,7 +235,6 @@ impl LinuxContext { kernel_gsbase: 0, pat: 0, mtrr_def_type: 0, - xstate: XState::default(), } } diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index 8316f6f..cd49a4c 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -42,16 +42,6 @@ const VMX_PREEMPTION_TIMER_SET_VALUE: u32 = 1_000_000; const QEMU_EXIT_PORT: u16 = 0x604; const QEMU_EXIT_MAGIC: u64 = 0x2000; -pub struct XState { - host_xcr0: u64, - guest_xcr0: u64, - host_xss: u64, - guest_xss: u64, - - xsave_available: bool, - xsaves_available: bool, -} - #[derive(PartialEq, Eq, Debug)] pub enum VmCpuMode { Real, @@ -60,97 +50,6 @@ pub enum VmCpuMode { Mode64, // IA-32E mode (CS.L = 1) } -impl XState { - /// Create a new [`XState`] instance with current host state - fn new() -> Self { - // Check if XSAVE is available - let xsave_available = Self::xsave_available(); - // Check if XSAVES and XRSTORS (as well as IA32_XSS) are available - let xsaves_available = if xsave_available { - Self::xsaves_available() - } else { - false - }; - - // Read XCR0 iff XSAVE is available - let xcr0 = if xsave_available { - unsafe { xcr0_read().bits() } - } else { - 0 - }; - // Read IA32_XSS iff XSAVES is available - let xss = if xsaves_available { - Msr::IA32_XSS.read() - } else { - 0 - }; - - Self { - host_xcr0: xcr0, - guest_xcr0: xcr0, - host_xss: xss, - guest_xss: xss, - xsave_available, - xsaves_available, - } - } - - /// Enable extended processor state management instructions, including XGETBV and XSAVE. - pub fn enable_xsave() { - if Self::xsave_available() { - unsafe { Cr4::write(Cr4::read() | Cr4Flags::OSXSAVE) }; - } - } - - /// Check if XSAVE is available on the current CPU. - pub fn xsave_available() -> bool { - let cpuid = CpuId::new(); - cpuid - .get_feature_info() - .map(|f| f.has_xsave()) - .unwrap_or(false) - } - - /// Check if XSAVES and XRSTORS (as well as IA32_XSS) are available on the current CPU. - pub fn xsaves_available() -> bool { - let cpuid = CpuId::new(); - cpuid - .get_extended_state_info() - .map(|f| f.has_xsaves_xrstors()) - .unwrap_or(false) - } - - /// Save the current host XCR0 and IA32_XSS values and load the guest values. - pub fn switch_to_guest(&mut self) { - unsafe { - if self.xsave_available { - self.host_xcr0 = xcr0_read().bits(); - xcr0_write(Xcr0::from_bits_unchecked(self.guest_xcr0)); - - if self.xsaves_available { - self.host_xss = Msr::IA32_XSS.read(); - Msr::IA32_XSS.write(self.guest_xss); - } - } - } - } - - /// Save the current guest XCR0 and IA32_XSS values and load the host values. - pub fn switch_to_host(&mut self) { - unsafe { - if self.xsave_available { - self.guest_xcr0 = xcr0_read().bits(); - xcr0_write(Xcr0::from_bits_unchecked(self.host_xcr0)); - - if self.xsaves_available { - self.guest_xss = Msr::IA32_XSS.read(); - Msr::IA32_XSS.write(self.host_xss); - } - } - } - } -} - const MSR_IA32_EFER_LMA_BIT: u64 = 1 << 10; const CR0_PE: usize = 1 << 0; @@ -169,10 +68,7 @@ pub struct VmxVcpu { pending_events: VecDeque<(u8, Option)>, // xstate: XState, - /// XState used by the guest OS, loaded before running the guest. - guest_xstate: XState, - /// XState used by the hypervisor itself, stored before running the guest. - cur_xstate: XState, + xstate: XState, entry: Option, ept_root: Option, @@ -191,8 +87,7 @@ impl VmxVcpu { msr_bitmap: MsrBitmap::passthrough_all()?, eptp_list: EptpList::new()?, pending_events: VecDeque::with_capacity(8), - guest_xstate: XState::new(), - cur_xstate: XState::new(), + xstate: XState::new(), entry: None, ept_root: None, id, @@ -1315,7 +1210,7 @@ impl VmxVcpu { }) .ok_or(ax_err_type!(InvalidInput)) .and_then(|x| { - self.guest_xstate.xcr0 = x; + self.xstate.guest_xcr0 = x.bits(); self.advance_rip(VM_EXIT_INSTR_LEN_XSETBV) }) } else { @@ -1329,7 +1224,8 @@ impl VmxVcpu { /// /// This function is generally called before VM-entry. fn load_guest_xstate(&mut self) { - self.xstate.switch_to_guest(); + // FIXME: Linux will throw a UD exception if we save/restore xstate. + // self.xstate.switch_to_guest(); } /// Save the current guest state to the vcpu, @@ -1337,7 +1233,7 @@ impl VmxVcpu { /// /// This function is generally called after VM-exit. fn load_host_xstate(&mut self) { - self.xstate.switch_to_host(); + // self.xstate.switch_to_host(); } } diff --git a/src/xstate.rs b/src/xstate.rs index a3e3ef4..becb471 100644 --- a/src/xstate.rs +++ b/src/xstate.rs @@ -1,51 +1,109 @@ +use raw_cpuid::CpuId; use x86::controlregs::{Xcr0, xcr0 as xcr0_read, xcr0_write}; use x86_64::registers::control::{Cr4, Cr4Flags}; use crate::msr::Msr; -#[derive(Debug, Clone, Copy)] +#[allow(unused)] pub struct XState { - pub xcr0: Xcr0, - pub xss: u64, -} + host_xcr0: u64, + pub(crate) guest_xcr0: u64, + host_xss: u64, + guest_xss: u64, -impl Default for XState { - fn default() -> Self { - Self { - xcr0: Xcr0::empty(), - xss: 0, - } - } + xsave_available: bool, + xsaves_available: bool, } impl XState { /// Create a new [`XState`] instance with current host state pub fn new() -> Self { + // Check if XSAVE is available + let xsave_available = Self::xsave_available(); + // Check if XSAVES and XRSTORS (as well as IA32_XSS) are available + let xsaves_available = if xsave_available { + Self::xsaves_available() + } else { + false + }; + + // Read XCR0 iff XSAVE is available + let xcr0 = if xsave_available { + unsafe { xcr0_read().bits() } + } else { + 0 + }; + // Read IA32_XSS iff XSAVES is available + let xss = if xsaves_available { + Msr::IA32_XSS.read() + } else { + 0 + }; + Self { - xcr0: unsafe { xcr0_read() }, - xss: Msr::IA32_XSS.read(), + host_xcr0: xcr0, + guest_xcr0: xcr0, + host_xss: xss, + guest_xss: xss, + xsave_available, + xsaves_available, } } - pub fn save(&mut self) { - self.xcr0 = unsafe { xcr0_read() }; - self.xss = Msr::IA32_XSS.read(); - warn!("XState::save: xcr0: {:?}, xss: {:#x}", self.xcr0, self.xss); + /// Enable extended processor state management instructions, including XGETBV and XSAVE. + pub fn enable_xsave() { + if Self::xsave_available() { + unsafe { Cr4::write(Cr4::read() | Cr4Flags::OSXSAVE) }; + } } - pub fn restore(&self) { - warn!( - "XState::restore: xcr0: {:?}, xss: {:#x}", - self.xcr0, self.xss - ); + /// Check if XSAVE is available on the current CPU. + pub fn xsave_available() -> bool { + let cpuid = CpuId::new(); + cpuid + .get_feature_info() + .map(|f| f.has_xsave()) + .unwrap_or(false) + } + + /// Check if XSAVES and XRSTORS (as well as IA32_XSS) are available on the current CPU. + pub fn xsaves_available() -> bool { + let cpuid = CpuId::new(); + cpuid + .get_extended_state_info() + .map(|f| f.has_xsaves_xrstors()) + .unwrap_or(false) + } + + /// Save the current host XCR0 and IA32_XSS values and load the guest values. + #[allow(unused)] + pub fn switch_to_guest(&mut self) { unsafe { - xcr0_write(self.xcr0); - Msr::IA32_XSS.write(self.xss); + if self.xsave_available { + self.host_xcr0 = xcr0_read().bits(); + xcr0_write(Xcr0::from_bits_unchecked(self.guest_xcr0)); + + if self.xsaves_available { + self.host_xss = Msr::IA32_XSS.read(); + Msr::IA32_XSS.write(self.guest_xss); + } + } } } - /// Enables extended processor state management instructions, including XGETBV and XSAVE. - pub fn enable_xsave() { - unsafe { Cr4::write(Cr4::read() | Cr4Flags::OSXSAVE) }; + /// Save the current guest XCR0 and IA32_XSS values and load the host values. + #[allow(unused)] + pub fn switch_to_host(&mut self) { + unsafe { + if self.xsave_available { + self.guest_xcr0 = xcr0_read().bits(); + xcr0_write(Xcr0::from_bits_unchecked(self.host_xcr0)); + + if self.xsaves_available { + self.guest_xss = Msr::IA32_XSS.read(); + Msr::IA32_XSS.write(self.host_xss); + } + } + } } }