diff --git a/Cargo.toml b/Cargo.toml index a13a65f..aa9c9d2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,9 +12,18 @@ x86 = "0.52" x86_64 = "0.15" raw-cpuid = "11.0" numeric-enum-macro = "0.2" +iced-x86 = { version = "1.12.0", features = [ + "decoder", + "no_std", + "intel", + "op_code_info", + "encoder", + "masm" +], default-features = false } axerrno = "0.1.0" page_table_entry = "0.5" +page_table_multiarch = "0.5" memory_addr = "0.3.1" crate_interface = "0.1" diff --git a/src/context.rs b/src/context.rs new file mode 100644 index 0000000..f72ec9a --- /dev/null +++ b/src/context.rs @@ -0,0 +1,333 @@ +use x86::segmentation::SegmentSelector; +use x86::{Ring, segmentation, task}; +use x86_64::VirtAddr; +use x86_64::instructions::tables::{lgdt, lidt, sidt}; +use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr3Flags, Cr4, Cr4Flags, Efer, EferFlags}; +use x86_64::structures::DescriptorTablePointer; +use x86_64::{addr::PhysAddr, structures::paging::PhysFrame}; + +use crate::msr::Msr; +use crate::regs::GeneralRegisters; +use crate::segmentation::{Segment, SegmentAccessRights}; + +const SAVED_LINUX_REGS: usize = 8; + +#[derive(Debug, Clone, Copy)] +pub struct LinuxContext { + pub rsp: u64, + pub rip: u64, + + pub r15: u64, + pub r14: u64, + pub r13: u64, + pub r12: u64, + pub rbx: u64, + pub rbp: u64, + + pub es: Segment, + pub cs: Segment, + pub ss: Segment, + pub ds: Segment, + pub fs: Segment, + pub gs: Segment, + pub tss: Segment, + pub gdt: DescriptorTablePointer, + pub idt: DescriptorTablePointer, + + pub cr0: Cr0Flags, + pub cr3: u64, + pub cr4: Cr4Flags, + + pub efer: EferFlags, + pub star: u64, + pub lstar: u64, + pub cstar: u64, + pub fmask: u64, + + pub ia32_sysenter_cs: u64, + pub ia32_sysenter_esp: u64, + pub ia32_sysenter_eip: u64, + + pub kernel_gsbase: u64, + pub pat: u64, + pub mtrr_def_type: u64, + // TODO: xstate +} + +unsafe impl Send for LinuxContext {} +unsafe impl Sync for LinuxContext {} + +impl Default for LinuxContext { + fn default() -> Self { + Self { + rsp: 0, + rip: 0, + r15: 0, + r14: 0, + r13: 0, + r12: 0, + rbx: 0, + rbp: 0, + es: Segment::invalid(), + cs: Segment::invalid(), + ss: Segment::invalid(), + ds: Segment::invalid(), + fs: Segment::invalid(), + gs: Segment::invalid(), + tss: Segment::invalid(), + gdt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + idt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + cr0: Cr0Flags::empty(), + cr3: 0, + cr4: Cr4Flags::empty(), + efer: EferFlags::empty(), + star: 0, + lstar: 0, + cstar: 0, + fmask: 0, + ia32_sysenter_cs: 0, + ia32_sysenter_esp: 0, + ia32_sysenter_eip: 0, + kernel_gsbase: 0, + pat: 0, + mtrr_def_type: 0, + } + } +} + +fn sgdt() -> DescriptorTablePointer { + let mut gdt = DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }; + unsafe { + core::arch::asm!("sgdt [{0}]", in(reg) &mut gdt, options(nostack, preserves_flags)); + } + gdt +} + +impl LinuxContext { + /// Load linux callee-saved registers from the stack, and other system registers. 
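+    ///
+    /// Based on the body below: `linux_sp` must point at the `SAVED_LINUX_REGS` (8) saved
+    /// `u64` values — the GS base followed by r15, r14, r13, r12, rbx, rbp and the return
+    /// RIP — and `rsp` is restored to the address just past this block.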
+ pub fn load_from(linux_sp: usize) -> Self { + let regs = unsafe { core::slice::from_raw_parts(linux_sp as *const u64, SAVED_LINUX_REGS) }; + let gdt = sgdt(); + + let mut fs = Segment::from_selector(x86::segmentation::fs(), &gdt); + let mut gs = Segment::from_selector(x86::segmentation::gs(), &gdt); + fs.base = Msr::IA32_FS_BASE.read(); + gs.base = regs[0]; + + Self { + rsp: regs.as_ptr_range().end as _, + r15: regs[1], + r14: regs[2], + r13: regs[3], + r12: regs[4], + rbx: regs[5], + rbp: regs[6], + rip: regs[7], + es: Segment::from_selector(segmentation::es(), &gdt), + cs: Segment::from_selector(segmentation::cs(), &gdt), + ss: Segment::from_selector(segmentation::ss(), &gdt), + ds: Segment::from_selector(segmentation::ds(), &gdt), + fs, + gs, + tss: Segment::from_selector(unsafe { task::tr() }, &gdt), + gdt, + idt: sidt(), + cr0: Cr0::read(), + cr3: Cr3::read().0.start_address().as_u64(), + cr4: Cr4::read(), + efer: Efer::read(), + star: Msr::IA32_STAR.read(), + lstar: Msr::IA32_LSTAR.read(), + cstar: Msr::IA32_CSTAR.read(), + fmask: Msr::IA32_FMASK.read(), + ia32_sysenter_cs: Msr::IA32_SYSENTER_CS.read(), + ia32_sysenter_esp: Msr::IA32_SYSENTER_ESP.read(), + ia32_sysenter_eip: Msr::IA32_SYSENTER_EIP.read(), + kernel_gsbase: Msr::IA32_KERNEL_GSBASE.read(), + pat: Msr::IA32_PAT.read(), + mtrr_def_type: Msr::IA32_MTRR_DEF_TYPE.read(), + } + } + + pub fn construct_guest64(rip: u64, cr3: u64) -> Self { + Self { + rsp: 0, + rip, + r15: 0, + r14: 0, + r13: 0, + r12: 0, + rbx: 0, + rbp: 0, + es: Segment::invalid(), + cs: Segment { + selector: SegmentSelector::new(1, Ring::Ring0), + base: 0, + limit: 0xffff, + access_rights: SegmentAccessRights::ACCESSED + | SegmentAccessRights::WRITABLE + | SegmentAccessRights::EXECUTABLE + | SegmentAccessRights::CODE_DATA + | SegmentAccessRights::PRESENT + | SegmentAccessRights::LONG_MODE + | SegmentAccessRights::GRANULARITY, + }, + ss: Segment { + selector: SegmentSelector::new(2, Ring::Ring0), + base: 0, + limit: 0xffff, + access_rights: SegmentAccessRights::ACCESSED + | SegmentAccessRights::WRITABLE + | SegmentAccessRights::CODE_DATA + | SegmentAccessRights::PRESENT + | SegmentAccessRights::DB + | SegmentAccessRights::GRANULARITY, + }, + ds: Segment::invalid(), + fs: Segment::invalid(), + gs: Segment::invalid(), + tss: Segment { + selector: SegmentSelector::new(2, Ring::Ring0), + base: 0, + limit: 0, + access_rights: SegmentAccessRights::ACCESSED + | SegmentAccessRights::WRITABLE + | SegmentAccessRights::EXECUTABLE + | SegmentAccessRights::PRESENT, + }, + gdt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + idt: DescriptorTablePointer { + limit: 0, + base: VirtAddr::zero(), + }, + cr0: Cr0Flags::PROTECTED_MODE_ENABLE + | Cr0Flags::MONITOR_COPROCESSOR + | Cr0Flags::EXTENSION_TYPE + | Cr0Flags::NUMERIC_ERROR + | Cr0Flags::WRITE_PROTECT + | Cr0Flags::ALIGNMENT_MASK + | Cr0Flags::PAGING, + cr3, + cr4: Cr4Flags::PHYSICAL_ADDRESS_EXTENSION | Cr4Flags::PAGE_GLOBAL, + efer: EferFlags::LONG_MODE_ENABLE + | EferFlags::LONG_MODE_ACTIVE + | EferFlags::NO_EXECUTE_ENABLE + | EferFlags::SYSTEM_CALL_EXTENSIONS, + star: 0, + lstar: 0, + cstar: 0, + fmask: 0, + ia32_sysenter_cs: 0, + ia32_sysenter_esp: 0, + ia32_sysenter_eip: 0, + kernel_gsbase: 0, + pat: 0, + mtrr_def_type: 0, + } + } + + /// Restore system registers. 
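+    ///
+    /// MSRs are written first, then CR0 and CR4; CR3 is restored last so that its PCID
+    /// bits are valid if CR4 re-enables PCID. Finally the TSS, GDT, IDT, segment
+    /// registers and the FS base MSR are reloaded.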
+ pub fn restore(&self) { + unsafe { + Msr::IA32_SYSENTER_CS.write(self.ia32_sysenter_cs); + Msr::IA32_SYSENTER_ESP.write(self.ia32_sysenter_esp); + Msr::IA32_SYSENTER_EIP.write(self.ia32_sysenter_eip); + + Efer::write(self.efer); + Msr::IA32_STAR.write(self.star); + Msr::IA32_LSTAR.write(self.lstar); + Msr::IA32_CSTAR.write(self.cstar); + Msr::IA32_FMASK.write(self.fmask); + Msr::IA32_KERNEL_GSBASE.write(self.kernel_gsbase); + Msr::IA32_PAT.write(self.pat); + + Cr0::write(self.cr0); + Cr4::write(self.cr4); + // cr3 must be last in case cr4 enables PCID + Cr3::write( + PhysFrame::containing_address(PhysAddr::new(self.cr3)), + Cr3Flags::empty(), // clear PCID + ); + } + + // Copy Linux TSS descriptor into our GDT, clearing the busy flag, + // then reload TR from it. We can't use Linux' GDT as it is r/o. + + let hv_gdt = sgdt(); + let entry_count = (hv_gdt.limit as usize + 1) / size_of::(); + + let hv_gdt_table: &mut [u64] = + unsafe { core::slice::from_raw_parts_mut(hv_gdt.base.as_mut_ptr(), entry_count) }; + + let linux_gdt = &self.gdt; + let entry_count = (linux_gdt.limit as usize + 1) / size_of::(); + let linux_gdt_table = + unsafe { core::slice::from_raw_parts(linux_gdt.base.as_mut_ptr(), entry_count) }; + + let tss_idx = self.tss.selector.index() as usize; + hv_gdt_table[tss_idx] = linux_gdt_table[tss_idx]; + hv_gdt_table[tss_idx + 1] = linux_gdt_table[tss_idx + 1]; + + SegmentAccessRights::set_descriptor_type( + &mut hv_gdt_table[self.tss.selector.index() as usize], + SegmentAccessRights::TSS_AVAIL, + ); + unsafe { + task::load_tr(self.tss.selector); + lgdt(&self.gdt); + lidt(&self.idt); + + segmentation::load_es(self.es.selector); + segmentation::load_cs(self.cs.selector); + segmentation::load_ss(self.ss.selector); + segmentation::load_ds(self.ds.selector); + segmentation::load_fs(self.fs.selector); + segmentation::load_gs(self.gs.selector); + + Msr::IA32_FS_BASE.write(self.fs.base); + } + } + + pub fn load_guest_regs(&mut self, regs: &GeneralRegisters) { + self.r15 = regs.r15; + self.r14 = regs.r14; + self.r13 = regs.r13; + self.r12 = regs.r12; + self.rbx = regs.rbx; + self.rbp = regs.rbp; + } + + /// Restore linux general-purpose registers and stack, then return back to linux. + pub fn return_to_linux(&self, guest_regs: &GeneralRegisters) -> ! { + unsafe { + Msr::IA32_GS_BASE.write(self.gs.base); + core::arch::asm!( + "mov rsp, {linux_rsp}", + "push {linux_rip}", + "mov rcx, rsp", + "mov rsp, {guest_regs}", + "mov [rsp + {guest_regs_size}], rcx", + restore_regs_from_stack!(), + "pop rsp", + "ret", + linux_rsp = in(reg) self.rsp, + linux_rip = in(reg) self.rip, + guest_regs = in(reg) guest_regs, + guest_regs_size = const core::mem::size_of::(), + options(noreturn), + ); + } + } +} diff --git a/src/ept.rs b/src/ept.rs index 74549b3..8b13789 100644 --- a/src/ept.rs +++ b/src/ept.rs @@ -1,27 +1 @@ -#[derive(Debug)] -/// The information of guest page walk. -pub struct GuestPageWalkInfo { - /// The guest page table physical address. - pub top_entry: usize, // Top level paging structure entry - /// Guest page table level. 
- pub level: usize, - /// Guest page table width - pub width: u32, - /// Guest page table user mode - pub is_user_mode_access: bool, - /// Guest page table write access - pub is_write_access: bool, - /// Guest page table instruction fetch - pub is_inst_fetch: bool, - /// CR4.PSE for 32bit paging, true for PAE/4-level paging - pub pse: bool, - /// CR0.WP - pub wp: bool, // CR0.WP - /// MSR_IA32_EFER_NXE_BIT - pub nxe: bool, - /// Guest page table Supervisor mode access prevention - pub is_smap_on: bool, - /// Guest page table Supervisor mode execution protection - pub is_smep_on: bool, -} diff --git a/src/frame.rs b/src/frame.rs index 8bb37c2..05525cf 100644 --- a/src/frame.rs +++ b/src/frame.rs @@ -1,5 +1,7 @@ use core::marker::PhantomData; +use page_table_multiarch::PagingHandler; + use axaddrspace::HostPhysAddr; use axerrno::{AxResult, ax_err_type}; @@ -17,7 +19,7 @@ pub struct PhysFrame { impl PhysFrame { pub fn alloc() -> AxResult { - let start_paddr = H::alloc_frame() + let start_paddr = H::PagingHandler::alloc_frame() .ok_or_else(|| ax_err_type!(NoMemory, "allocate physical frame failed"))?; assert_ne!(start_paddr.as_usize(), 0); Ok(Self { @@ -44,7 +46,7 @@ impl PhysFrame { } pub fn as_mut_ptr(&self) -> *mut u8 { - H::phys_to_virt(self.start_paddr()).as_mut_ptr() + H::PagingHandler::phys_to_virt(self.start_paddr()).as_mut_ptr() } pub fn fill(&mut self, byte: u8) { @@ -55,7 +57,7 @@ impl PhysFrame { impl Drop for PhysFrame { fn drop(&mut self) { if let Some(start_paddr) = self.start_paddr { - H::dealloc_frame(start_paddr); + H::PagingHandler::dealloc_frame(start_paddr); debug!("[AxVM] deallocated PhysFrame({:#x})", start_paddr); } } diff --git a/src/lib.rs b/src/lib.rs index faab4b7..d7f9945 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,12 @@ pub(crate) mod msr; pub(crate) mod regs; mod ept; mod frame; +mod page_table; + +mod context; +mod segmentation; +// mod tables; +mod xstate; cfg_if::cfg_if! { if #[cfg(feature = "vmx")] { @@ -26,6 +32,6 @@ cfg_if::cfg_if! { } } -pub use ept::GuestPageWalkInfo; +pub use context::LinuxContext; pub use regs::GeneralRegisters; pub use vender::has_hardware_support; diff --git a/src/msr.rs b/src/msr.rs index 09e4463..ac105e4 100644 --- a/src/msr.rs +++ b/src/msr.rs @@ -7,7 +7,12 @@ use x86::msr::{rdmsr, wrmsr}; pub enum Msr { IA32_FEATURE_CONTROL = 0x3a, + IA32_SYSENTER_CS = 0x174, + IA32_SYSENTER_ESP = 0x175, + IA32_SYSENTER_EIP = 0x176, + IA32_PAT = 0x277, + IA32_MTRR_DEF_TYPE = 0x2ff, IA32_VMX_BASIC = 0x480, IA32_VMX_PINBASED_CTLS = 0x481, diff --git a/src/page_table.rs b/src/page_table.rs new file mode 100644 index 0000000..4263c87 --- /dev/null +++ b/src/page_table.rs @@ -0,0 +1,197 @@ +//! Used to query and manipulate the page tables of a guest. +use core::marker::PhantomData; + +use memory_addr::MemoryAddr; +use page_table_entry::{GenericPTE, MappingFlags}; +use page_table_multiarch::{PageSize, PagingError, PagingHandler, PagingResult}; + +use axaddrspace::{EPTTranslator, GuestPhysAddr, GuestVirtAddr}; + +const fn p5_index(vaddr: usize) -> usize { + (vaddr >> (12 + 36)) & (ENTRY_COUNT - 1) +} + +const fn p4_index(vaddr: usize) -> usize { + (vaddr >> (12 + 27)) & (ENTRY_COUNT - 1) +} + +const fn p3_index(vaddr: usize) -> usize { + (vaddr >> (12 + 18)) & (ENTRY_COUNT - 1) +} + +const fn p2_index(vaddr: usize) -> usize { + (vaddr >> (12 + 9)) & (ENTRY_COUNT - 1) +} + +const fn p1_index(vaddr: usize) -> usize { + (vaddr >> 12) & (ENTRY_COUNT - 1) +} + +#[derive(Debug)] +/// The information of guest page walk. 
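+/// Filled in from the current VMCS guest state by `VmxVcpu::get_pagetable_walk_info`
+/// and consumed by `GuestPageTable64::construct` when walking the guest's own page tables.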
+pub struct GuestPageWalkInfo {
+    /// Guest VM cr3 value.
+    pub cr3: usize,
+    /// Guest page table level.
+    pub level: usize,
+    /// Guest page table width
+    pub width: u32,
+    /// Guest page table user mode
+    pub is_user_mode_access: bool,
+    /// Guest page table write access
+    pub is_write_access: bool,
+    /// Guest page table instruction fetch
+    pub is_inst_fetch: bool,
+    /// CR4.PSE for 32bit paging, true for PAE/4-level paging
+    pub pse: bool,
+    /// CR0.WP
+    pub wp: bool, // CR0.WP
+    /// MSR_IA32_EFER_NXE_BIT
+    pub nxe: bool,
+
+    /// Guest page table Supervisor mode access prevention
+    pub is_smap_on: bool,
+    /// Guest page table Supervisor mode execution protection
+    pub is_smep_on: bool,
+}
+
+// /// Metadata of guest page tables.
+// pub struct GuestPageTableMetadata;
+
+// impl PagingMetaData for GuestPageTableMetadata {
+//     const LEVELS: usize = 4;
+//     const PA_MAX_BITS: usize = 52;
+//     const VA_MAX_BITS: usize = 48;
+
+//     type VirtAddr = GuestVirtAddr;
+//     type PhysAddr = GuestPhysAddr;
+
+//     fn to_actual_paddr(paddr: Self::PhysAddr) -> HostPhysAddr {
+//         EPT::guest_phys_to_host_phys(paddr).unwrap()
+//     }
+
+//     fn flush_tlb(_vaddr: Option) {
+//         warn!("flush_tlb is not implemented for guest page tables");
+//     }
+// }
+
+const ENTRY_COUNT: usize = 512;
+
+// pub type GuestPageTable = PageTable64, X64PTE, H>;
+
+/// A read-only walker over a guest's 64-bit page tables.
+///
+/// It does not own or allocate any table memory; it walks the tables the guest itself
+/// has set up, translating each table's guest-physical address through `EPT`.
+pub struct GuestPageTable64 {
+    root_paddr: GuestPhysAddr,
+    levels: usize,
+    _phantom: PhantomData<(PTE, H, EPT)>,
+}
+
+impl GuestPageTable64 {
+    /// Construct a walker from the guest's page-walk information (CR3 and paging level).
+    pub fn construct(guest_ptw_info: &GuestPageWalkInfo) -> Self {
+        const PHYS_ADDR_MASK: usize = 0x000f_ffff_ffff_f000; // bits 12..52
+
+        Self {
+            root_paddr: GuestPhysAddr::from(guest_ptw_info.cr3 & PHYS_ADDR_MASK),
+            levels: guest_ptw_info.level,
+            _phantom: PhantomData,
+        }
+    }
+
+    /// Get the root page table physical address.
+    pub fn root_paddr(&self) -> GuestPhysAddr {
+        self.root_paddr
+    }
+
+    /// Queries the mapping that starts at `vaddr`.
+    ///
+    /// Returns the physical address of the target frame, mapping flags, and
+    /// the page size.
+    ///
+    /// Returns [`Err(PagingError::NotMapped)`](PagingError::NotMapped) if the
+    /// mapping is not present.
+ pub fn query( + &self, + vaddr: GuestVirtAddr, + ) -> PagingResult<(GuestPhysAddr, MappingFlags, PageSize)> { + let (entry, size) = self.get_entry(vaddr)?; + if entry.is_unused() { + error!("GuestPT64 query {:?} Entry is unused", vaddr); + return Err(PagingError::NotMapped); + } + let off = size.align_offset(vaddr.into()); + Ok((entry.paddr().add(off).into(), entry.flags(), size)) + } +} + +// private implements +impl GuestPageTable64 { + fn table_of<'a>(&self, gpa: GuestPhysAddr) -> PagingResult<&'a [PTE]> { + let hpa = EPT::guest_phys_to_host_phys(gpa) + .map(|(hpa, _flags, _pgsize)| hpa) + .ok_or_else(|| { + warn!("Failed to translate GPA {:?}", gpa); + PagingError::NotMapped + })?; + let ptr = H::phys_to_virt(hpa).as_ptr() as _; + + Ok(unsafe { core::slice::from_raw_parts(ptr, ENTRY_COUNT) }) + } + + fn next_table<'a>(&self, entry: &PTE) -> PagingResult<&'a [PTE]> { + if !entry.is_present() { + error!("GuestPT64 next_table {:?} Entry is not present", entry); + Err(PagingError::NotMapped) + } else if entry.is_huge() { + error!("GuestPT64 next_table {:?} Entry is huge", entry); + Err(PagingError::MappedToHugePage) + } else { + self.table_of(entry.paddr().into()) + } + } + + fn get_entry(&self, gva: GuestVirtAddr) -> PagingResult<(&PTE, PageSize)> { + let vaddr: usize = gva.into(); + + let p3 = if self.levels == 3 { + self.table_of(self.root_paddr())? + } else if self.levels == 4 { + let p4 = self.table_of(self.root_paddr())?; + let p4e = &p4[p4_index(vaddr)]; + self.next_table(p4e)? + } else { + // 5-level paging + let p5 = self.table_of(self.root_paddr())?; + let p5e = &p5[p5_index(vaddr)]; + if p5e.is_huge() { + return Err(PagingError::MappedToHugePage); + } + let p4 = self.next_table(p5e)?; + let p4e = &p4[p4_index(vaddr)]; + + if p4e.is_huge() { + return Err(PagingError::MappedToHugePage); + } + + self.next_table(p4e)? + }; + + let p3e = &p3[p3_index(vaddr)]; + if p3e.is_huge() { + return Ok((p3e, PageSize::Size1G)); + } + + let p2 = self.next_table(p3e)?; + let p2e = &p2[p2_index(vaddr)]; + if p2e.is_huge() { + return Ok((p2e, PageSize::Size2M)); + } + + let p1 = self.next_table(p2e)?; + let p1e = &p1[p1_index(vaddr)]; + Ok((p1e, PageSize::Size4K)) + } +} diff --git a/src/regs.rs b/src/regs.rs index 2039c77..1cd0869 100644 --- a/src/regs.rs +++ b/src/regs.rs @@ -40,6 +40,28 @@ pub struct GeneralRegisters { } impl GeneralRegisters { + pub fn from_context(context: &crate::context::LinuxContext) -> Self { + Self { + rax: 0, + rbx: context.rbx, + rbp: context.rbp, + r12: context.r12, + r13: context.r13, + r14: context.r14, + r15: context.r15, + ..Default::default() + } + } + + pub fn load_from_context(&mut self, context: &crate::context::LinuxContext) { + self.rbx = context.rbx; + self.rbp = context.rbp; + self.r12 = context.r12; + self.r13 = context.r13; + self.r14 = context.r14; + self.r15 = context.r15; + } + /// Returns the value of the general-purpose register corresponding to the given index. /// /// The mapping of indices to registers is as follows: @@ -154,6 +176,7 @@ impl GeneralRegisters { macro_rules! save_regs_to_stack { () => { " + .code64 push r15 push r14 push r13 diff --git a/src/segmentation.rs b/src/segmentation.rs new file mode 100644 index 0000000..4853cf2 --- /dev/null +++ b/src/segmentation.rs @@ -0,0 +1,120 @@ +use bit_field::BitField; +use bitflags::bitflags; +// use x86::dtables::DescriptorTablePointer; +use x86::segmentation::SegmentSelector; +use x86_64::structures::DescriptorTablePointer; +use x86_64::structures::gdt::DescriptorFlags; + +bitflags! 
{ + /// Access rights for VMCS guest register states. + /// + /// The low 16 bits correspond to bits 23:8 of the upper 32 bits of a 64-bit + /// segment descriptor. See Volume 3, Section 24.4.1 for access rights format, + /// Volume 3, Section 3.4.5.1 for valid non-system selector types, Volume 3, + /// Section 3.5 for valid system selectors types. + #[derive(Debug, Clone, Copy)] + pub struct SegmentAccessRights: u32 { + /// Accessed flag. + const ACCESSED = 1 << 0; + /// For data segments, this flag sets the segment as writable. For code + /// segments, this flag sets the segment as readable. + const WRITABLE = 1 << 1; + /// For data segments, this flag marks a data segment as “expansion-direction”. + /// For code segments, this flag marks a code segment as “conforming”. + const CONFORMING = 1 << 2; + /// This flag must be set for code segments. + const EXECUTABLE = 1 << 3; + /// S — Descriptor type (0 = system; 1 = code or data) + const CODE_DATA = 1 << 4; + /// P — Segment present + const PRESENT = 1 << 7; + /// L - Reserved (except for CS) or 64-bit mode active (for CS only) + const LONG_MODE = 1 << 13; + /// D/B — Default operation size (0 = 16-bit segment; 1 = 32-bit segment) + const DB = 1 << 14; + /// G — Granularity + const GRANULARITY = 1 << 15; + /// Segment unusable (0 = usable; 1 = unusable) + const UNUSABLE = 1 << 16; + + /// TSS (Available) for 32/64-bit + const TSS_AVAIL = 0b1001; + /// TSS (Busy) for 32/64-bit + const TSS_BUSY = 0b1011; + + /// Descriptor privilege level (User) + const DPL_USER = 3 << 5; + } +} + +impl SegmentAccessRights { + #[allow(dead_code)] + pub fn dpl(&self) -> u8 { + self.bits().get_bits(5..=6) as _ + } + + pub fn from_descriptor(desc: u64) -> Self { + Self::from_bits_truncate(desc.get_bits(40..56) as u32 & 0xf0ff) + } + + pub fn _type_field(&self) -> Self { + Self::from_bits_truncate(self.bits() & 0xf) + } + + pub fn set_descriptor_type(desc: &mut u64, type_field: Self) { + desc.set_bits(40..44, type_field.bits() as u64); + } + + #[cfg(feature = "amd")] + pub fn as_svm_segment_attributes(&self) -> u16 { + let bits = self.bits() as u16; + (bits & 0xff) | ((bits & 0xf000) >> 4) + } +} + +#[derive(Debug, Clone, Copy)] +pub struct Segment { + pub selector: SegmentSelector, + pub base: u64, + pub limit: u32, + pub access_rights: SegmentAccessRights, +} + +impl Segment { + pub const fn invalid() -> Self { + Self { + selector: SegmentSelector::empty(), + base: 0, + limit: 0, + access_rights: SegmentAccessRights::UNUSABLE, + } + } + + pub fn from_selector(selector: SegmentSelector, gdt: &DescriptorTablePointer) -> Self { + let index = selector.index() as usize; + let entry_count = (gdt.limit as usize + 1) / size_of::(); + let table = unsafe { core::slice::from_raw_parts(gdt.base.as_mut_ptr(), entry_count) }; + + let entry_value = table[index]; + let entry = DescriptorFlags::from_bits_truncate(entry_value); + if entry.contains(DescriptorFlags::PRESENT) { + let mut base = entry_value.get_bits(16..40) | entry_value.get_bits(56..64) << 24; + let mut limit = entry_value.get_bits(0..16) | entry_value.get_bits(48..52) << 16; + if !entry.contains(DescriptorFlags::USER_SEGMENT) { + let high = table[index + 1]; + base += high << 32; + } + if entry.contains(DescriptorFlags::GRANULARITY) { + limit = (limit << 12) | 0xfff; + } + Self { + selector, + base, + limit: limit as _, + access_rights: SegmentAccessRights::from_descriptor(entry_value), + } + } else { + Self::invalid() + } + } +} diff --git a/src/vmx/percpu.rs b/src/vmx/percpu.rs index 6e1e743..065eb03 100644 
--- a/src/vmx/percpu.rs +++ b/src/vmx/percpu.rs @@ -8,6 +8,7 @@ use memory_addr::PAGE_SIZE_4K as PAGE_SIZE; use crate::msr::Msr; use crate::vmx::has_hardware_support; use crate::vmx::structs::{FeatureControl, FeatureControlFlags, VmxBasic, VmxRegion}; +use crate::xstate::XState; /// Represents the per-CPU state for Virtual Machine Extensions (VMX). /// @@ -49,7 +50,7 @@ impl AxArchPerCpu for VmxPerCpuState { } // Enable XSAVE/XRSTOR. - super::vcpu::XState::enable_xsave(); + XState::enable_xsave(); // Enable VMXON, if required. let ctrl = FeatureControl::read(); @@ -100,9 +101,26 @@ impl AxArchPerCpu for VmxPerCpuState { self.vmcs_revision_id = vmx_basic.revision_id; self.vmx_region = VmxRegion::new(self.vmcs_revision_id, false)?; + use x86_64::registers::control::{Cr0Flags, Cr4Flags}; + const HOST_CR0: Cr0Flags = Cr0Flags::from_bits_truncate( + Cr0Flags::PAGING.bits() + | Cr0Flags::WRITE_PROTECT.bits() + | Cr0Flags::NUMERIC_ERROR.bits() + | Cr0Flags::TASK_SWITCHED.bits() + | Cr0Flags::MONITOR_COPROCESSOR.bits() + | Cr0Flags::PROTECTED_MODE_ENABLE.bits(), + ); + const HOST_CR4: Cr4Flags = Cr4Flags::from_bits_truncate( + Cr4Flags::PHYSICAL_ADDRESS_EXTENSION.bits() + | Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS.bits() + | Cr4Flags::OSXSAVE.bits(), + ); + unsafe { // Enable VMX using the VMXE bit. - Cr4::write(Cr4::read() | Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS); + // Cr4::write(Cr4::read() | Cr4Flags::VIRTUAL_MACHINE_EXTENSIONS); + Cr0::write(HOST_CR0); + Cr4::write(HOST_CR4); // Execute VMXON. vmx::vmxon(self.vmx_region.phys_addr().as_usize() as _).map_err(|err| { ax_err_type!( diff --git a/src/vmx/structs.rs b/src/vmx/structs.rs index fa3f698..1056ed3 100644 --- a/src/vmx/structs.rs +++ b/src/vmx/structs.rs @@ -256,6 +256,8 @@ bitflags! { const WALK_LENGTH_3 = 2 << 3; /// EPT page-walk length 4. const WALK_LENGTH_4 = 3 << 3; + /// EPT page-walk length 5 + const WALK_LENGTH_5 = 4 << 3; /// Setting this control to 1 enables accessed and dirty flags for EPT. const ENABLE_ACCESSED_DIRTY = 1 << 6; } @@ -268,3 +270,22 @@ impl EPTPointer { flags | Self::MEM_TYPE_WB | Self::WALK_LENGTH_4 | Self::ENABLE_ACCESSED_DIRTY } } + +/// EPTP list, the 4-KByte structure, +/// The EPTP list comprises 512 8-Byte entries (each an EPTP value) +/// and is used by the EPTP-switching VM function (see Section 26.5.6.3). 
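+/// Its physical address is written to the `EPTP_LIST_ADDR` VMCS field and VM-function
+/// control bit 0 (EPTP switching) is enabled in `setup_vmcs_control`, so the guest can
+/// switch between the listed EPTPs via `VMFUNC` without causing a VM exit.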
+pub(super) struct EptpList { + frame: PhysFrame, +} + +impl EptpList { + pub fn new() -> AxResult { + Ok(Self { + frame: PhysFrame::alloc_zero()?, + }) + } + + pub fn phys_addr(&self) -> HostPhysAddr { + self.frame.start_paddr() + } +} diff --git a/src/vmx/vcpu.rs b/src/vmx/vcpu.rs index dfb1b3e..cd49a4c 100644 --- a/src/vmx/vcpu.rs +++ b/src/vmx/vcpu.rs @@ -1,43 +1,47 @@ use alloc::collections::VecDeque; -use bit_field::BitField; +use alloc::vec::Vec; use core::fmt::{Debug, Formatter, Result}; use core::{arch::naked_asm, mem::size_of}; + +use bit_field::BitField; use raw_cpuid::CpuId; use x86::bits64::vmx; -use x86::controlregs::{Xcr0, xcr0 as xcr0_read, xcr0_write}; +use x86::controlregs::Xcr0; use x86::dtables::{self, DescriptorTablePointer}; use x86::segmentation::SegmentSelector; +use x86_64::VirtAddr; use x86_64::registers::control::{Cr0, Cr0Flags, Cr3, Cr4, Cr4Flags, EferFlags}; -use axaddrspace::{GuestPhysAddr, GuestVirtAddr, HostPhysAddr, NestedPageFaultInfo}; +use page_table_entry::x86_64::X64PTE; +use page_table_multiarch::{PageSize, PagingHandler, PagingResult}; + +use axaddrspace::EPTTranslator; +use axaddrspace::{GuestPhysAddr, GuestVirtAddr, HostPhysAddr, MappingFlags, NestedPageFaultInfo}; use axerrno::{AxResult, ax_err, ax_err_type}; -use axvcpu::{AccessWidth, AxArchVCpu, AxVCpuExitReason, AxVCpuHal}; +use axvcpu::{AccessWidth, AxArchVCpu, AxVCpuExitReason, AxVCpuHal, AxVcpuAccessGuestState}; use super::VmxExitInfo; use super::as_axerr; use super::definitions::VmxExitReason; -use super::structs::{IOBitmap, MsrBitmap, VmxRegion}; +use super::read_vmcs_revision_id; +use super::structs::{EptpList, IOBitmap, MsrBitmap, VmxRegion}; use super::vmcs::{ self, VmcsControl32, VmcsControl64, VmcsControlNW, VmcsGuest16, VmcsGuest32, VmcsGuest64, - VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, + VmcsGuestNW, VmcsHost16, VmcsHost32, VmcsHost64, VmcsHostNW, exit_qualification, + interrupt_exit_info, }; -use crate::{ept::GuestPageWalkInfo, msr::Msr, regs::GeneralRegisters}; +use crate::LinuxContext; +use crate::page_table::GuestPageTable64; +use crate::page_table::GuestPageWalkInfo; +use crate::segmentation::{Segment, SegmentAccessRights}; +use crate::xstate::XState; +use crate::{msr::Msr, regs::GeneralRegisters}; const VMX_PREEMPTION_TIMER_SET_VALUE: u32 = 1_000_000; const QEMU_EXIT_PORT: u16 = 0x604; const QEMU_EXIT_MAGIC: u64 = 0x2000; -pub struct XState { - host_xcr0: u64, - guest_xcr0: u64, - host_xss: u64, - guest_xss: u64, - - xsave_available: bool, - xsaves_available: bool, -} - #[derive(PartialEq, Eq, Debug)] pub enum VmCpuMode { Real, @@ -46,97 +50,6 @@ pub enum VmCpuMode { Mode64, // IA-32E mode (CS.L = 1) } -impl XState { - /// Create a new [`XState`] instance with current host state - fn new() -> Self { - // Check if XSAVE is available - let xsave_available = Self::xsave_available(); - // Check if XSAVES and XRSTORS (as well as IA32_XSS) are available - let xsaves_available = if xsave_available { - Self::xsaves_available() - } else { - false - }; - - // Read XCR0 iff XSAVE is available - let xcr0 = if xsave_available { - unsafe { xcr0_read().bits() } - } else { - 0 - }; - // Read IA32_XSS iff XSAVES is available - let xss = if xsaves_available { - Msr::IA32_XSS.read() - } else { - 0 - }; - - Self { - host_xcr0: xcr0, - guest_xcr0: xcr0, - host_xss: xss, - guest_xss: xss, - xsave_available, - xsaves_available, - } - } - - /// Enable extended processor state management instructions, including XGETBV and XSAVE. 
- pub fn enable_xsave() { - if Self::xsave_available() { - unsafe { Cr4::write(Cr4::read() | Cr4Flags::OSXSAVE) }; - } - } - - /// Check if XSAVE is available on the current CPU. - pub fn xsave_available() -> bool { - let cpuid = CpuId::new(); - cpuid - .get_feature_info() - .map(|f| f.has_xsave()) - .unwrap_or(false) - } - - /// Check if XSAVES and XRSTORS (as well as IA32_XSS) are available on the current CPU. - pub fn xsaves_available() -> bool { - let cpuid = CpuId::new(); - cpuid - .get_extended_state_info() - .map(|f| f.has_xsaves_xrstors()) - .unwrap_or(false) - } - - /// Save the current host XCR0 and IA32_XSS values and load the guest values. - pub fn switch_to_guest(&mut self) { - unsafe { - if self.xsave_available { - self.host_xcr0 = xcr0_read().bits(); - xcr0_write(Xcr0::from_bits_unchecked(self.guest_xcr0)); - - if self.xsaves_available { - self.host_xss = Msr::IA32_XSS.read(); - Msr::IA32_XSS.write(self.guest_xss); - } - } - } - } - - /// Save the current guest XCR0 and IA32_XSS values and load the host values. - pub fn switch_to_host(&mut self) { - unsafe { - if self.xsave_available { - self.guest_xcr0 = xcr0_read().bits(); - xcr0_write(Xcr0::from_bits_unchecked(self.host_xcr0)); - - if self.xsaves_available { - self.guest_xss = Msr::IA32_XSS.read(); - Msr::IA32_XSS.write(self.host_xss); - } - } - } - } -} - const MSR_IA32_EFER_LMA_BIT: u64 = 1 << 10; const CR0_PE: usize = 1 << 0; @@ -151,40 +64,38 @@ pub struct VmxVcpu { vmcs: VmxRegion, io_bitmap: IOBitmap, msr_bitmap: MsrBitmap, + eptp_list: EptpList, + pending_events: VecDeque<(u8, Option)>, + // xstate: XState, xstate: XState, entry: Option, ept_root: Option, - // is_host: bool, temporary removed because we don't care about type 1.5 now + + id: usize, } impl VmxVcpu { /// Create a new [`VmxVcpu`]. - pub fn new() -> AxResult { - let vmcs_revision_id = super::read_vmcs_revision_id(); + pub fn new(id: usize) -> AxResult { let vcpu = Self { guest_regs: GeneralRegisters::default(), host_stack_top: 0, launched: false, - vmcs: VmxRegion::new(vmcs_revision_id, false)?, + vmcs: VmxRegion::new(read_vmcs_revision_id(), false)?, io_bitmap: IOBitmap::passthrough_all()?, msr_bitmap: MsrBitmap::passthrough_all()?, + eptp_list: EptpList::new()?, pending_events: VecDeque::with_capacity(8), xstate: XState::new(), entry: None, ept_root: None, - // is_host: false, + id, }; - info!("[HV] created VmxVcpu(vmcs: {:#x})", vcpu.vmcs.phys_addr()); + debug!("[HV] created VmxVcpu(vmcs: {:#x})", vcpu.vmcs.phys_addr(),); Ok(vcpu) } - /// Set the new [`VmxVcpu`] context from guest OS. - pub fn setup(&mut self, ept_root: HostPhysAddr, entry: GuestPhysAddr) -> AxResult { - self.setup_vmcs(entry, ept_root)?; - Ok(()) - } - // /// Get the identifier of this [`VmxVcpu`]. // pub fn vcpu_id(&self) -> usize { // get_current_vcpu::().unwrap().id() @@ -192,7 +103,7 @@ impl VmxVcpu { /// Bind this [`VmxVcpu`] to current logical processor. pub fn bind_to_current_processor(&self) -> AxResult { - debug!( + trace!( "VmxVcpu bind to current processor vmcs @ {:#x}", self.vmcs.phys_addr() ); @@ -205,7 +116,7 @@ impl VmxVcpu { /// Unbind this [`VmxVcpu`] from current logical processor. 
 pub fn unbind_from_current_processor(&self) -> AxResult {
-        debug!(
+        trace!(
             "VmxVcpu unbind from current processor vmcs @ {:#x}",
             self.vmcs.phys_addr()
         );
@@ -260,7 +171,7 @@ impl VmxVcpu {
 
         // Handle vm-exits
         let exit_info = self.exit_info().unwrap();
-        // debug!("VM exit: {:#x?}", exit_info);
+        trace!("VM exit: {:#x?}", exit_info);
 
         match self.builtin_vmexit_handler(&exit_info) {
             Some(result) => {
@@ -324,24 +235,9 @@ impl VmxVcpu {
         VmcsGuestNW::RSP.write(rsp).unwrap()
     }
 
-    /// Translate guest virtual addr to linear addr
-    pub fn gla2gva(&self, guest_rip: GuestVirtAddr) -> GuestVirtAddr {
-        let cpu_mode = self.get_cpu_mode();
-        let seg_base = if cpu_mode == VmCpuMode::Mode64 {
-            0
-        } else {
-            VmcsGuestNW::CS_BASE.read().unwrap()
-        };
-        // debug!(
-        //     "seg_base: {:#x}, guest_rip: {:#x} cpu mode:{:?}",
-        //     seg_base, guest_rip, cpu_mode
-        // );
-        guest_rip + seg_base
-    }
-
-    /// Get Translate guest page table info
-    pub fn get_ptw_info(&self) -> GuestPageWalkInfo {
-        let top_entry = VmcsGuestNW::CR3.read().unwrap();
+    /// Gather guest page-table walk information from the current VMCS guest state.
+    pub fn get_pagetable_walk_info(&self) -> GuestPageWalkInfo {
+        let cr3 = VmcsGuestNW::CR3.read().unwrap();
         let level = self.get_paging_level();
         let is_write_access = false;
         let is_inst_fetch = false;
@@ -368,7 +264,7 @@ impl VmxVcpu {
             width = 0;
         }
         GuestPageWalkInfo {
-            top_entry,
+            cr3,
             level,
             width,
             is_user_mode_access,
@@ -430,6 +326,60 @@ impl VmxVcpu {
         self.msr_bitmap.set_read_intercept(msr, intercept);
         self.msr_bitmap.set_write_intercept(msr, intercept);
     }
+
+    pub fn read_guest_memory(&self, gva: GuestVirtAddr, len: usize) -> AxResult<Vec<u8>> {
+        debug!("read_guest_memory @{:?} len: {}", gva, len);
+
+        let mut content = Vec::with_capacity(len);
+
+        let mut remained_size = len;
+        let mut addr = gva;
+
+        while remained_size > 0 {
+            let (gpa, _flags, page_size) = self.guest_page_table_query(addr).map_err(|e| {
+                warn!(
+                    "Failed to query guest page table, GVA {:?} err {:?}",
+                    addr, e
+                );
+                ax_err_type!(BadAddress)
+            })?;
+            let pgoff = page_size.align_offset(addr.into());
+            let read_size = (page_size as usize - pgoff).min(remained_size);
+            addr += read_size;
+            remained_size -= read_size;
+
+            if let Some((hpa, _flags, _pgsize)) = H::EPTTranslator::guest_phys_to_host_phys(gpa) {
+                let hva_ptr = H::PagingHandler::phys_to_virt(hpa).as_ptr();
+                for i in 0..read_size {
+                    content.push(unsafe { hva_ptr.add(i).read() });
+                }
+            } else {
+                return ax_err!(BadAddress);
+            }
+        }
+        debug!("read_guest_memory @{:?} content: {:x?}", gva, content);
+        Ok(content)
+    }
+
+    pub fn decode_instruction(&self, rip: GuestVirtAddr, instr_len: usize) -> AxResult {
+        use alloc::string::String;
+        use iced_x86::{Decoder, DecoderOptions, Formatter, IntelFormatter};
+
+        let bytes = self.read_guest_memory(rip, instr_len)?;
+        let mut decoder = Decoder::with_ip(
+            64,
+            bytes.as_slice(),
+            rip.as_usize() as _,
+            DecoderOptions::NONE,
+        );
+        let instr = decoder.decode();
+        let mut output = String::new();
+        let mut formatter = IntelFormatter::new();
+        formatter.format(&instr, &mut output);
+
+        debug!("Decoded instruction (Intel syntax): {}", output);
+        Ok(())
+    }
 }
 
 // Implementation of private methods
@@ -500,14 +450,31 @@ impl VmxVcpu {
         Ok(())
     }
 
-    fn setup_vmcs(&mut self, entry: GuestPhysAddr, ept_root: HostPhysAddr) -> AxResult {
+    fn setup_vmcs(
+        &mut self,
+        ept_root: HostPhysAddr,
+        entry: Option<GuestPhysAddr>,
+        ctx: Option<LinuxContext>,
+    ) -> AxResult {
+        let mut is_guest = true;
+
         let paddr = self.vmcs.phys_addr().as_usize() as u64;
         unsafe {
             vmx::vmclear(paddr).map_err(as_axerr)?;
         }
         self.bind_to_current_processor()?;
-
self.setup_vmcs_guest(entry)?; - self.setup_vmcs_control(ept_root, true)?; + + if let Some(ctx) = ctx { + is_guest = false; + self.setup_vmcs_guest_from_ctx(ctx)?; + } else { + self.setup_vmcs_guest(entry.ok_or_else(|| { + error!("VmxVcpu::setup_vmcs: entry is None"); + ax_err_type!(InvalidInput) + })?)?; + } + + self.setup_vmcs_control(ept_root, is_guest)?; self.unbind_from_current_processor()?; Ok(()) } @@ -549,6 +516,66 @@ impl VmxVcpu { Ok(()) } + /// Indeed, this function can be combined with `setup_vmcs_guest`, + /// to avoid complexity and minimize the modification, + /// we just keep them separated. + fn setup_vmcs_guest_from_ctx(&mut self, host_ctx: LinuxContext) -> AxResult { + let linux = host_ctx; + + self.set_cr(0, linux.cr0.bits()); + self.set_cr(4, linux.cr4.bits()); + self.set_cr(3, linux.cr3); + + macro_rules! set_guest_segment { + ($seg: expr, $reg: ident) => {{ + use VmcsGuest16::*; + use VmcsGuest32::*; + use VmcsGuestNW::*; + concat_idents!($reg, _SELECTOR).write($seg.selector.bits())?; + concat_idents!($reg, _BASE).write($seg.base as _)?; + concat_idents!($reg, _LIMIT).write($seg.limit)?; + concat_idents!($reg, _ACCESS_RIGHTS).write($seg.access_rights.bits())?; + }}; + } + + set_guest_segment!(linux.es, ES); + set_guest_segment!(linux.cs, CS); + set_guest_segment!(linux.ss, SS); + set_guest_segment!(linux.ds, DS); + set_guest_segment!(linux.fs, FS); + set_guest_segment!(linux.gs, GS); + set_guest_segment!(linux.tss, TR); + set_guest_segment!(Segment::invalid(), LDTR); + + VmcsGuestNW::GDTR_BASE.write(linux.gdt.base.as_u64() as _)?; + VmcsGuest32::GDTR_LIMIT.write(linux.gdt.limit as _)?; + VmcsGuestNW::IDTR_BASE.write(linux.idt.base.as_u64() as _)?; + VmcsGuest32::IDTR_LIMIT.write(linux.idt.limit as _)?; + + VmcsGuestNW::RSP.write(linux.rsp as _)?; + VmcsGuestNW::RIP.write(linux.rip as _)?; + VmcsGuestNW::RFLAGS.write(0x2)?; + + VmcsGuest32::IA32_SYSENTER_CS.write(linux.ia32_sysenter_cs as _)?; + VmcsGuestNW::IA32_SYSENTER_ESP.write(linux.ia32_sysenter_esp as _)?; + VmcsGuestNW::IA32_SYSENTER_EIP.write(linux.ia32_sysenter_eip as _)?; + + VmcsGuestNW::DR7.write(0x400)?; + VmcsGuest64::IA32_DEBUGCTL.write(0)?; + + VmcsGuest32::ACTIVITY_STATE.write(0)?; + VmcsGuest32::INTERRUPTIBILITY_STATE.write(0)?; + VmcsGuestNW::PENDING_DBG_EXCEPTIONS.write(0)?; + + VmcsGuest64::LINK_PTR.write(u64::MAX)?; + VmcsGuest32::VMX_PREEMPTION_TIMER_VALUE.write(0)?; + + VmcsGuest64::IA32_PAT.write(linux.pat)?; + VmcsGuest64::IA32_EFER.write(linux.efer.bits())?; + + Ok(()) + } + fn setup_vmcs_guest(&mut self, entry: GuestPhysAddr) -> AxResult { let cr0_val: Cr0Flags = Cr0Flags::NOT_WRITE_THROUGH | Cr0Flags::CACHE_DISABLE | Cr0Flags::EXTENSION_TYPE; @@ -637,22 +664,30 @@ impl VmxVcpu { // Enable EPT, RDTSCP, INVPCID, and unrestricted guest. 
use SecondaryControls as CpuCtrl2; - let mut val = CpuCtrl2::ENABLE_EPT | CpuCtrl2::UNRESTRICTED_GUEST; + let mut val = + CpuCtrl2::ENABLE_EPT | CpuCtrl2::UNRESTRICTED_GUEST | CpuCtrl2::ENABLE_VM_FUNCTIONS; + if let Some(features) = raw_cpuid.get_extended_processor_and_feature_identifiers() { if features.has_rdtscp() { val |= CpuCtrl2::ENABLE_RDTSCP; } } + if let Some(features) = raw_cpuid.get_extended_feature_info() { if features.has_invpcid() { val |= CpuCtrl2::ENABLE_INVPCID; } + if features.has_waitpkg() { + val |= CpuCtrl2::ENABLE_USER_WAIT_PAUSE; + } } + if let Some(features) = raw_cpuid.get_extended_state_info() { if features.has_xsaves_xrstors() { val |= CpuCtrl2::ENABLE_XSAVES_XRSTORS; } } + vmcs::set_control( VmcsControl32::SECONDARY_PROCBASED_EXEC_CONTROLS, Msr::IA32_VMX_PROCBASED_CTLS2, @@ -703,9 +738,17 @@ impl VmxVcpu { VmcsControl32::VMEXIT_MSR_LOAD_COUNT.write(0)?; VmcsControl32::VMENTRY_MSR_LOAD_COUNT.write(0)?; - // VmcsControlNW::CR4_GUEST_HOST_MASK.write(0)?; + // TODO: figure out why we mask it. + VmcsControlNW::CR4_GUEST_HOST_MASK.write(0)?; VmcsControl32::CR3_TARGET_COUNT.write(0)?; + // 25.6.14 VM-Function Controls + // Table 25-10. Definitions of VM-Function Controls + // Bit 0: EPTP switching + VmcsControl64::VM_FUNCTION_CONTROLS.write(0b1)?; + + VmcsControl64::EPTP_LIST_ADDR.write(self.eptp_list.phys_addr().as_usize() as _)?; + // Pass-through exceptions (except #UD(6)), don't use I/O bitmap, set MSR bitmaps. let exception_bitmap: u32 = 1 << 6; @@ -718,6 +761,44 @@ impl VmxVcpu { Ok(()) } + fn load_vmcs_guest(&self, linux: &mut LinuxContext) -> AxResult { + linux.rip = VmcsGuestNW::RIP.read()? as _; + linux.rsp = VmcsGuestNW::RSP.read()? as _; + linux.cr0 = Cr0Flags::from_bits_truncate(VmcsGuestNW::CR0.read()? as _); + linux.cr3 = VmcsGuestNW::CR3.read()? as _; + linux.cr4 = Cr4Flags::from_bits_truncate(VmcsGuestNW::CR4.read()? as _); + + linux.es.selector = SegmentSelector::from_raw(VmcsGuest16::ES_SELECTOR.read()?); + + linux.cs.selector = SegmentSelector::from_raw(VmcsGuest16::CS_SELECTOR.read()?); + // CS: + // If the Type is 9 or 11 (non-conforming code segment), the DPL must equal the DPL in the access-rights field for SS. + linux.cs.access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::CS_ACCESS_RIGHTS.read()?); + linux.ss.selector = SegmentSelector::from_raw(VmcsGuest16::SS_SELECTOR.read()?); + linux.ss.access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::SS_ACCESS_RIGHTS.read()?); + + linux.ds.selector = SegmentSelector::from_raw(VmcsGuest16::DS_SELECTOR.read()?); + linux.fs.selector = SegmentSelector::from_raw(VmcsGuest16::FS_SELECTOR.read()?); + linux.fs.base = VmcsGuestNW::FS_BASE.read()? as _; + linux.gs.selector = SegmentSelector::from_raw(VmcsGuest16::GS_SELECTOR.read()?); + linux.gs.base = VmcsGuestNW::GS_BASE.read()? as _; + linux.tss.selector = SegmentSelector::from_raw(VmcsGuest16::TR_SELECTOR.read()?); + + linux.gdt.base = VirtAddr::new(VmcsGuestNW::GDTR_BASE.read()? as _); + linux.gdt.limit = VmcsGuest32::GDTR_LIMIT.read()? as _; + linux.idt.base = VirtAddr::new(VmcsGuestNW::IDTR_BASE.read()? as _); + linux.idt.limit = VmcsGuest32::IDTR_LIMIT.read()? as _; + + linux.ia32_sysenter_cs = VmcsGuest32::IA32_SYSENTER_CS.read()? as _; // 0x174 + linux.ia32_sysenter_esp = VmcsGuestNW::IA32_SYSENTER_ESP.read()? as _; // 0x178 + linux.ia32_sysenter_eip = VmcsGuestNW::IA32_SYSENTER_EIP.read()? 
as _; // 0x17a + + linux.load_guest_regs(self.regs()); + Ok(()) + } + fn get_paging_level(&self) -> usize { let mut level: u32 = 0; // non-paging let cr0 = VmcsGuestNW::CR0.read().unwrap(); @@ -738,6 +819,32 @@ impl VmxVcpu { } level as usize } + + /// Translate guest virtual addr to linear addr + fn gva_to_linear_addr(&self, vaddr: GuestVirtAddr) -> GuestVirtAddr { + let cpu_mode = self.get_cpu_mode(); + let seg_base = if cpu_mode == VmCpuMode::Mode64 { + 0 + } else { + VmcsGuestNW::CS_BASE.read().unwrap() + }; + vaddr + seg_base + } + + pub fn guest_page_table_query( + &self, + gva: GuestVirtAddr, + ) -> PagingResult<(GuestPhysAddr, MappingFlags, PageSize)> { + let addr = self.gva_to_linear_addr(gva); + + // debug!("guest_page_table_query: gva {:?} linear {:?}", gva, addr); + + let guest_ptw_info = self.get_pagetable_walk_info(); + let guest_page_table: GuestPageTable64 = + GuestPageTable64::construct(&guest_ptw_info); + + guest_page_table.query(addr) + } } // Implementaton for type1.5 hypervisor @@ -901,6 +1008,7 @@ impl VmxVcpu { VmxExitReason::XSETBV => Some(self.handle_xsetbv()), VmxExitReason::CR_ACCESS => Some(self.handle_cr()), VmxExitReason::CPUID => Some(self.handle_cpuid()), + VmxExitReason::EXCEPTION_NMI => Some(self.handle_exception_nmi(exit_info)), _ => None, } } @@ -915,6 +1023,30 @@ impl VmxVcpu { Ok(()) } + fn handle_exception_nmi(&mut self, exit_info: &VmxExitInfo) -> AxResult { + let intr_info = interrupt_exit_info()?; + info!( + "VM exit: Exception or NMI @ RIP({:#x}, {}): {:#x?}", + exit_info.guest_rip, exit_info.exit_instruction_length, intr_info + ); + + self.decode_instruction( + GuestVirtAddr::from_usize(exit_info.guest_rip), + exit_info.exit_instruction_length as _, + )?; + + const NON_MASKABLE_INTERRUPT: u8 = 2; + + match intr_info.vector { + // ExceptionType::NonMaskableInterrupt + NON_MASKABLE_INTERRUPT => unsafe { + core::arch::asm!("int {}", const NON_MASKABLE_INTERRUPT) + }, + v => panic!("Unhandled Guest Exception: #{:#x}", v), + } + Ok(()) + } + #[allow(clippy::single_match)] fn handle_cr(&mut self) -> AxResult { const VM_EXIT_INSTR_LEN_MV_TO_CR: u8 = 3; @@ -994,7 +1126,6 @@ impl VmxVcpu { self.load_guest_xstate(); let res = cpuid!(regs_clone.rax, regs_clone.rcx); self.load_host_xstate(); - res } LEAF_HYPERVISOR_INFO => CpuIdResult { @@ -1088,12 +1219,21 @@ impl VmxVcpu { } } + /// Save the current host state to the vcpu, + /// restore the guest state from the vcpu into registers. + /// + /// This function is generally called before VM-entry. fn load_guest_xstate(&mut self) { - self.xstate.switch_to_guest(); + // FIXME: Linux will throw a UD exception if we save/restore xstate. + // self.xstate.switch_to_guest(); } + /// Save the current guest state to the vcpu, + /// restore the host state from the vcpu into registers. + /// + /// This function is generally called after VM-exit. 
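+    /// Note: as with `load_guest_xstate`, the actual XCR0/IA32_XSS switch is
+    /// currently commented out (see the FIXME above).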
fn load_host_xstate(&mut self) { - self.xstate.switch_to_host(); + // self.xstate.switch_to_host(); } } @@ -1123,6 +1263,33 @@ fn get_tr_base(tr: SegmentSelector, gdt: &DescriptorTablePointer) -> u64 { impl Debug for VmxVcpu { fn fmt(&self, f: &mut Formatter) -> Result { (|| -> AxResult { + let cs_selector = SegmentSelector::from_raw(VmcsGuest16::CS_SELECTOR.read()?); + let cs_access_rights_raw = VmcsGuest32::CS_ACCESS_RIGHTS.read()?; + let cs_access_rights = SegmentAccessRights::from_bits_truncate(cs_access_rights_raw); + let ss_selector = SegmentSelector::from_raw(VmcsGuest16::SS_SELECTOR.read()?); + let ss_access_rights_raw = VmcsGuest32::SS_ACCESS_RIGHTS.read()?; + let ss_access_rights = SegmentAccessRights::from_bits_truncate(ss_access_rights_raw); + let ds_selector = SegmentSelector::from_raw(VmcsGuest16::DS_SELECTOR.read()?); + let ds_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::DS_ACCESS_RIGHTS.read()?); + let fs_selector = SegmentSelector::from_raw(VmcsGuest16::FS_SELECTOR.read()?); + let fs_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::FS_ACCESS_RIGHTS.read()?); + let gs_selector = SegmentSelector::from_raw(VmcsGuest16::GS_SELECTOR.read()?); + let gs_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::GS_ACCESS_RIGHTS.read()?); + let tr_selector = SegmentSelector::from_raw(VmcsGuest16::TR_SELECTOR.read()?); + let tr_access_rights = + SegmentAccessRights::from_bits_truncate(VmcsGuest32::TR_ACCESS_RIGHTS.read()?); + let gdt_base = VirtAddr::new(VmcsGuestNW::GDTR_BASE.read()? as _); + let gdt_limit = VmcsGuest32::GDTR_LIMIT.read()?; + let idt_base = VirtAddr::new(VmcsGuestNW::IDTR_BASE.read()? as _); + let idt_limit = VmcsGuest32::IDTR_LIMIT.read()?; + + let ia32_sysenter_cs = VmcsGuest32::IA32_SYSENTER_CS.read()?; + let ia32_sysenter_esp = VmcsGuestNW::IA32_SYSENTER_ESP.read()?; + let ia32_sysenter_eip = VmcsGuestNW::IA32_SYSENTER_EIP.read()?; + Ok(f.debug_struct("VmxVcpu") .field("guest_regs", &self.guest_regs) .field("rip", &VmcsGuestNW::RIP.read()?) @@ -1131,10 +1298,32 @@ impl Debug for VmxVcpu { .field("cr0", &VmcsGuestNW::CR0.read()?) .field("cr3", &VmcsGuestNW::CR3.read()?) .field("cr4", &VmcsGuestNW::CR4.read()?) - .field("cs", &VmcsGuest16::CS_SELECTOR.read()?) + .field("cs_base", &VmcsGuestNW::CS_BASE.read()?) + .field("cs_selector", &cs_selector) + .field("cs_access_rights", &cs_access_rights) + .field("cs_access_rights_raw", &cs_access_rights_raw) + .field("ss_base", &VmcsGuestNW::SS_BASE.read()?) + .field("ss_selector", &ss_selector) + .field("ss_access_rights_raw", &ss_access_rights_raw) + .field("ss_access_rights", &ss_access_rights) + .field("ds_base", &VmcsGuestNW::DS_BASE.read()?) + .field("ds_selector", &ds_selector) + .field("ds_access_rights", &ds_access_rights) .field("fs_base", &VmcsGuestNW::FS_BASE.read()?) + .field("fs_selector", &fs_selector) + .field("fs_access_rights", &fs_access_rights) .field("gs_base", &VmcsGuestNW::GS_BASE.read()?) - .field("tss", &VmcsGuest16::TR_SELECTOR.read()?) 
+ .field("gs_selector", &gs_selector) + .field("gs_access_rights", &gs_access_rights) + .field("tr_selector", &tr_selector) + .field("tr_access_rights", &tr_access_rights) + .field("gdt_base", &gdt_base) + .field("gdt_limit", &gdt_limit) + .field("idt_base", &idt_base) + .field("idt_limit", &idt_limit) + .field("ia32_sysenter_cs", &ia32_sysenter_cs) + .field("ia32_sysenter_esp", &ia32_sysenter_esp) + .field("ia32_sysenter_eip", &ia32_sysenter_eip) .finish()) })() .unwrap() @@ -1142,12 +1331,21 @@ impl Debug for VmxVcpu { } impl AxArchVCpu for VmxVcpu { - type CreateConfig = (); + type CreateConfig = usize; type SetupConfig = (); - fn new(_config: Self::CreateConfig) -> AxResult { - Self::new() + type HostContext = crate::context::LinuxContext; + + fn new(id: Self::CreateConfig) -> AxResult { + Self::new(id) + } + + fn load_context(&self, config: &mut Self::HostContext) -> AxResult { + // info!("Loading context {:#x?}", self); + + self.load_vmcs_guest(config)?; + Ok(()) } fn set_entry(&mut self, entry: GuestPhysAddr) -> AxResult { @@ -1161,12 +1359,32 @@ impl AxArchVCpu for VmxVcpu { } fn setup(&mut self, _config: Self::SetupConfig) -> AxResult { - self.setup_vmcs(self.entry.unwrap(), self.ept_root.unwrap()) + self.setup_vmcs(self.ept_root.unwrap(), self.entry, None) + } + + fn setup_from_context(&mut self, ctx: Self::HostContext) -> AxResult { + self.guest_regs.load_from_context(&ctx); + self.setup_vmcs(self.ept_root.unwrap(), None, Some(ctx)) } fn run(&mut self) -> AxResult { match self.inner_run() { Some(exit_info) => Ok(if exit_info.entry_failure { + match exit_info.exit_reason { + VmxExitReason::INVALID_GUEST_STATE + | VmxExitReason::MCE_DURING_VMENTRY + | VmxExitReason::MSR_LOAD_FAIL => {} + _ => { + error!("Invalid exit reasion when entry failure: {:#x?}", exit_info); + } + }; + + let exit_qualification = exit_qualification()?; + + warn!("VMX entry failure: {:#x?}", exit_info); + warn!("Exit qualification: {:#x?}", exit_qualification); + warn!("VCpu {:#x?}", self); + AxVCpuExitReason::FailEntry { // Todo: get `hardware_entry_failure_reason` somehow. 
hardware_entry_failure_reason: 0, @@ -1229,6 +1447,23 @@ impl AxArchVCpu for VmxVcpu { } } } + VmxExitReason::EPT_VIOLATION => { + let ept_info = self.nested_page_fault_info()?; + self.decode_instruction( + GuestVirtAddr::from_usize(exit_info.guest_rip), + exit_info.exit_instruction_length as _, + )?; + + AxVCpuExitReason::NestedPageFault { + addr: ept_info.fault_guest_paddr, + access_flags: ept_info.access_flags, + } + } + VmxExitReason::TRIPLE_FAULT => { + error!("VMX triple fault: {:#x?}", exit_info); + error!("VCpu {:#x?}", self); + AxVCpuExitReason::Halt + } _ => { warn!("VMX unsupported VM-Exit: {:#x?}", exit_info); warn!("VCpu {:#x?}", self); @@ -1253,3 +1488,81 @@ impl AxArchVCpu for VmxVcpu { self.regs_mut().set_reg_of_index(reg as u8, val as u64); } } + +impl AxVcpuAccessGuestState for VmxVcpu { + type GeneralRegisters = GeneralRegisters; + + fn regs(&self) -> &Self::GeneralRegisters { + self.regs() + } + + fn regs_mut(&mut self) -> &mut Self::GeneralRegisters { + self.regs_mut() + } + + fn read_gpr(&self, reg: usize) -> usize { + self.regs().get_reg_of_index(reg as u8) as usize + } + + fn write_gpr(&mut self, reg: usize, val: usize) { + self.regs_mut().set_reg_of_index(reg as u8, val as u64); + } + + fn instr_pointer(&self) -> usize { + VmcsGuestNW::RIP.read().expect("Failed to read RIP") as usize + } + + fn set_instr_pointer(&mut self, val: usize) { + VmcsGuestNW::RIP.write(val as _).expect("Failed to set RIP"); + } + + fn stack_pointer(&self) -> usize { + self.stack_pointer() + } + + fn set_stack_pointer(&mut self, val: usize) { + self.set_stack_pointer(val); + } + + fn frame_pointer(&self) -> usize { + self.regs().rbp as usize + } + + fn set_frame_pointer(&mut self, val: usize) { + self.regs_mut().rbp = val as u64; + } + + fn return_value(&self) -> usize { + self.regs().rax as usize + } + + fn set_return_value(&mut self, val: usize) { + self.regs_mut().rax = val as u64; + } + + fn guest_is_privileged(&self) -> bool { + use crate::segmentation::SegmentAccessRights; + SegmentAccessRights::from_bits_truncate( + VmcsGuest32::CS_ACCESS_RIGHTS + .read() + .expect("Failed to read CS_ACCESS_RIGHTS"), + ) + .dpl() + == 0 + } + + fn guest_page_table_query( + &self, + gva: GuestVirtAddr, + ) -> Option<(GuestPhysAddr, MappingFlags, PageSize)> { + self.guest_page_table_query(gva).ok() + } + + fn current_ept_root(&self) -> HostPhysAddr { + vmcs::get_ept_pointer() + } + + fn eptp_list_region(&self) -> HostPhysAddr { + self.eptp_list.phys_addr() + } +} diff --git a/src/vmx/vmcs.rs b/src/vmx/vmcs.rs index 2b8ba99..8b6e648 100644 --- a/src/vmx/vmcs.rs +++ b/src/vmx/vmcs.rs @@ -597,7 +597,7 @@ pub fn set_control( let allowed0 = cap as u32; let allowed1 = (cap >> 32) as u32; assert_eq!(allowed0 & allowed1, allowed0); - debug!( + trace!( "set {:?}: {:#x} (+{:#x}, -{:#x})", control, old_value, set, clear ); @@ -638,6 +638,10 @@ pub fn set_ept_pointer(pml4_paddr: HostPhysAddr) -> AxResult { Ok(()) } +pub fn get_ept_pointer() -> HostPhysAddr { + HostPhysAddr::from(VmcsControl64::EPTP.read().expect("Failed to read EPTP") as usize) +} + pub fn instruction_error() -> VmxInstructionError { VmcsReadOnly32::VM_INSTRUCTION_ERROR.read().unwrap().into() } @@ -660,7 +664,7 @@ pub fn raw_interrupt_exit_info() -> AxResult { } pub fn interrupt_exit_info() -> AxResult { - // SDM Vol. 3C, Section 24.9.2 + // SDM Vol. 
3C, Section 25.9.2 let info = VmcsReadOnly32::VMEXIT_INTERRUPTION_INFO.read()?; Ok(VmxInterruptInfo { vector: info.get_bits(0..8) as u8, @@ -772,3 +776,7 @@ pub fn cr_access_info() -> AxResult { lmsw_source_data: qualification.get_bits(16..32) as u8, }) } + +pub fn exit_qualification() -> AxResult { + VmcsReadOnlyNW::EXIT_QUALIFICATION.read() +} diff --git a/src/xstate.rs b/src/xstate.rs new file mode 100644 index 0000000..becb471 --- /dev/null +++ b/src/xstate.rs @@ -0,0 +1,109 @@ +use raw_cpuid::CpuId; +use x86::controlregs::{Xcr0, xcr0 as xcr0_read, xcr0_write}; +use x86_64::registers::control::{Cr4, Cr4Flags}; + +use crate::msr::Msr; + +#[allow(unused)] +pub struct XState { + host_xcr0: u64, + pub(crate) guest_xcr0: u64, + host_xss: u64, + guest_xss: u64, + + xsave_available: bool, + xsaves_available: bool, +} + +impl XState { + /// Create a new [`XState`] instance with current host state + pub fn new() -> Self { + // Check if XSAVE is available + let xsave_available = Self::xsave_available(); + // Check if XSAVES and XRSTORS (as well as IA32_XSS) are available + let xsaves_available = if xsave_available { + Self::xsaves_available() + } else { + false + }; + + // Read XCR0 iff XSAVE is available + let xcr0 = if xsave_available { + unsafe { xcr0_read().bits() } + } else { + 0 + }; + // Read IA32_XSS iff XSAVES is available + let xss = if xsaves_available { + Msr::IA32_XSS.read() + } else { + 0 + }; + + Self { + host_xcr0: xcr0, + guest_xcr0: xcr0, + host_xss: xss, + guest_xss: xss, + xsave_available, + xsaves_available, + } + } + + /// Enable extended processor state management instructions, including XGETBV and XSAVE. + pub fn enable_xsave() { + if Self::xsave_available() { + unsafe { Cr4::write(Cr4::read() | Cr4Flags::OSXSAVE) }; + } + } + + /// Check if XSAVE is available on the current CPU. + pub fn xsave_available() -> bool { + let cpuid = CpuId::new(); + cpuid + .get_feature_info() + .map(|f| f.has_xsave()) + .unwrap_or(false) + } + + /// Check if XSAVES and XRSTORS (as well as IA32_XSS) are available on the current CPU. + pub fn xsaves_available() -> bool { + let cpuid = CpuId::new(); + cpuid + .get_extended_state_info() + .map(|f| f.has_xsaves_xrstors()) + .unwrap_or(false) + } + + /// Save the current host XCR0 and IA32_XSS values and load the guest values. + #[allow(unused)] + pub fn switch_to_guest(&mut self) { + unsafe { + if self.xsave_available { + self.host_xcr0 = xcr0_read().bits(); + xcr0_write(Xcr0::from_bits_unchecked(self.guest_xcr0)); + + if self.xsaves_available { + self.host_xss = Msr::IA32_XSS.read(); + Msr::IA32_XSS.write(self.guest_xss); + } + } + } + } + + /// Save the current guest XCR0 and IA32_XSS values and load the host values. + #[allow(unused)] + pub fn switch_to_host(&mut self) { + unsafe { + if self.xsave_available { + self.guest_xcr0 = xcr0_read().bits(); + xcr0_write(Xcr0::from_bits_unchecked(self.host_xcr0)); + + if self.xsaves_available { + self.guest_xss = Msr::IA32_XSS.read(); + Msr::IA32_XSS.write(self.host_xss); + } + } + } + } +}
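
Usage sketch (not part of the diff): how the pieces added above are meant to fit together on the type-1.5 path. Everything here that is not introduced by the patch — the wrapper name `demote_linux`, the hal parameter `H`, `linux_sp`, `ept_root`, the `set_ept_root` call from the pre-existing `AxArchVCpu` flow, and the simplified exit loop — is an illustrative assumption, not code from this change.

    use axaddrspace::HostPhysAddr;
    use axerrno::AxResult;
    use axvcpu::{AxArchVCpu, AxVCpuExitReason, AxVCpuHal};
    // `LinuxContext` and `VmxVcpu` are the types added/extended by this patch
    // (re-exported from the crate root).

    /// Hypothetical driver: demote the running Linux into a VMX guest, then return to it.
    fn demote_linux<H: AxVCpuHal>(linux_sp: usize, ept_root: HostPhysAddr) -> AxResult<()> {
        // Capture the saved Linux registers and the current system register state.
        let mut linux_ctx = LinuxContext::load_from(linux_sp);

        // Create a vCPU and initialize its VMCS guest state directly from that context.
        let mut vcpu = VmxVcpu::<H>::new(0)?;
        vcpu.set_ept_root(ept_root)?;
        vcpu.setup_from_context(linux_ctx)?;

        vcpu.bind_to_current_processor()?;
        loop {
            // Run until an exit we treat as "shut down" (greatly simplified).
            if matches!(vcpu.run()?, AxVCpuExitReason::Halt) {
                break;
            }
        }
        // Copy the guest's final VMCS state back into the Linux context ...
        vcpu.load_context(&mut linux_ctx)?;
        vcpu.unbind_from_current_processor()?;

        // ... then restore system registers and jump back into Linux.
        linux_ctx.restore();
        linux_ctx.return_to_linux(vcpu.regs())
    }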