diff --git a/executor/common_kvm_amd64.h b/executor/common_kvm_amd64.h
index e4c961962042..0c49f07a0931 100644
--- a/executor/common_kvm_amd64.h
+++ b/executor/common_kvm_amd64.h
@@ -238,6 +238,8 @@ static const struct mem_region syzos_mem_regions[] = {
     {X86_SYZOS_ADDR_SCRATCH_CODE, 1, 0},
     // CPU stack.
     {X86_SYZOS_ADDR_STACK_BOTTOM, 1, 0},
+    // Per-VCPU regions for L2 VMs.
+    {X86_SYZOS_PER_VCPU_REGIONS_BASE, (KVM_MAX_VCPU * X86_SYZOS_L1_VCPU_REGION_SIZE) / KVM_PAGE_SIZE, 0},
     // IOAPIC memory.
     {X86_SYZOS_ADDR_IOAPIC, 1, 0},
 };
@@ -397,11 +399,21 @@ static void setup_gdt_64(struct gdt_entry* gdt)
     // P=1, DPL=0, S=1, Type=Read/Write, DB=1, G=1
     gdt[X86_SYZOS_SEL_DATA >> 3] = (struct gdt_entry){
         .limit_low = 0xFFFF,
         .base_low = 0,
         .base_mid = 0,
         .access = 0x92, // Present, DPL=0, S=1, Type=Read/Write, Accessed
         .limit_high_and_flags = 0xCF, // Granularity=1, DB=1, Limit=0xF
         .base_high = 0};
+    // Entry 3 (selector 0x18): 64-bit TSS segment, based at the L1 TSS page.
+    gdt[X86_SYZOS_SEL_TSS64 >> 3] = (struct gdt_entry){
+        .limit_low = 0x67, // Minimal TSS limit
+        .base_low = (uint16)(X86_SYZOS_ADDR_VAR_TSS & 0xFFFF),
+        .base_mid = (uint8)((X86_SYZOS_ADDR_VAR_TSS >> 16) & 0xFF),
+        .access = 0x89, // Present, DPL=0, 64-bit TSS (Available)
+        .limit_high_and_flags = 0x00, // G=0, Limit High = 0
+        .base_high = (uint8)((X86_SYZOS_ADDR_VAR_TSS >> 24) & 0xFF)};
+    // NOTE: A 64-bit TSS descriptor occupies two GDT entries; the second one holds the high 32 bits
+    // of the base. X86_SYZOS_ADDR_VAR_TSS fits in 32 bits, so the second entry (index 4) can remain 0.
 }
 
 // This only sets up a 64-bit VCPU.
@@ -412,7 +424,7 @@ static void setup_gdt_ldt_pg(struct kvm_syz_vm* vm, int cpufd)
     ioctl(cpufd, KVM_GET_SREGS, &sregs);
 
     sregs.gdt.base = X86_SYZOS_ADDR_GDT;
-    sregs.gdt.limit = 3 * sizeof(struct gdt_entry) - 1;
+    sregs.gdt.limit = 5 * sizeof(struct gdt_entry) - 1;
     struct gdt_entry* gdt = (struct gdt_entry*)((uint64)vm->host_mem + sregs.gdt.base);
 
     struct kvm_segment seg_cs64;
@@ -444,6 +456,29 @@ static void setup_gdt_ldt_pg(struct kvm_syz_vm* vm, int cpufd)
     sregs.gs = seg_ds64;
     sregs.ss = seg_ds64;
 
+    // The L1 guest (the host for L2) MUST have a valid TR
+    // pointing to the 64-bit TSS in the GDT.
+    struct kvm_segment seg_tr;
+    memset(&seg_tr, 0, sizeof(seg_tr));
+    seg_tr.selector = X86_SYZOS_SEL_TSS64; // 0x18
+    seg_tr.type = 11; // 64-bit TSS (Busy)
+    seg_tr.base = X86_SYZOS_ADDR_VAR_TSS;
+    seg_tr.limit = 0x67; // Limit of the TSS descriptor
+    seg_tr.present = 1;
+    seg_tr.s = 0; // System segment
+    sregs.tr = seg_tr;
+
+    // The L1 TSS memory is at (vm->host_mem + X86_SYZOS_ADDR_VAR_TSS).
+    volatile uint8* l1_tss =
+        (volatile uint8*)((uint64)vm->host_mem + X86_SYZOS_ADDR_VAR_TSS);
+
+    // Zero out the TSS (104 bytes for 64-bit).
+    memset((void*)l1_tss, 0, 104);
+
+    // Set the critical RSP0 field to the L1 guest's main stack.
+    // RSP0 is at offset +4 bytes in a 64-bit TSS.
+    *(volatile uint64*)(l1_tss + 4) = X86_SYZOS_ADDR_STACK0;
+
     setup_gdt_64(gdt);
 
     syzos_setup_idt(vm, &sregs);
diff --git a/executor/common_kvm_amd64_syzos.h b/executor/common_kvm_amd64_syzos.h
index fd43b9daab36..26e1297a2319 100644
--- a/executor/common_kvm_amd64_syzos.h
+++ b/executor/common_kvm_amd64_syzos.h
@@ -11,21 +11,25 @@
 #include
 #include
 
-// Compilers will eagerly try to transform the switch statement in guest_main()
-// into a jump table, unless the cases are sparse enough.
-// We use prime numbers multiplied by 10 to prevent this behavior.
+// There are no particular rules to assign numbers here, but changing them will +// result in losing some existing reproducers. Therefore, we try to leave spaces +// between unrelated IDs. // Remember these constants must match those in sys/linux/dev_kvm_amd64.txt. typedef enum { SYZOS_API_UEXIT = 0, SYZOS_API_CODE = 10, - SYZOS_API_CPUID = 20, - SYZOS_API_WRMSR = 30, - SYZOS_API_RDMSR = 50, - SYZOS_API_WR_CRN = 70, - SYZOS_API_WR_DRN = 110, - SYZOS_API_IN_DX = 130, - SYZOS_API_OUT_DX = 170, - SYZOS_API_SET_IRQ_HANDLER = 190, + SYZOS_API_CPUID = 100, + SYZOS_API_WRMSR = 101, + SYZOS_API_RDMSR = 102, + SYZOS_API_WR_CRN = 103, + SYZOS_API_WR_DRN = 104, + SYZOS_API_IN_DX = 105, + SYZOS_API_OUT_DX = 106, + SYZOS_API_SET_IRQ_HANDLER = 200, + SYZOS_API_ENABLE_NESTED = 300, + SYZOS_API_NESTED_CREATE_VM = 301, + SYZOS_API_NESTED_LOAD_CODE = 302, + SYZOS_API_NESTED_VMLAUNCH = 303, SYZOS_API_STOP, // Must be the last one } syzos_api_id; @@ -44,6 +48,12 @@ struct api_call_code { uint8 insns[]; }; +struct api_call_nested_load_code { + struct api_call_header header; + uint64 vm_id; + uint8 insns[]; +}; + struct api_call_cpuid { struct api_call_header header; uint32 eax; @@ -65,10 +75,17 @@ struct api_call_3 { uint64 args[3]; }; +// This struct must match the push/pop order in nested_vm_exit_handler_intel_asm(). +struct l2_guest_regs { + uint64 rax, rbx, rcx, rdx, rsi, rdi, rbp; + uint64 r8, r9, r10, r11, r12, r13, r14, r15; +}; + #ifdef __cplusplus extern "C" { #endif GUEST_CODE static void guest_uexit(uint64 exit_code); +GUEST_CODE static void nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs); #ifdef __cplusplus } #endif @@ -81,13 +98,23 @@ GUEST_CODE static void guest_handle_wr_drn(struct api_call_2* cmd); GUEST_CODE static void guest_handle_in_dx(struct api_call_2* cmd); GUEST_CODE static void guest_handle_out_dx(struct api_call_3* cmd); GUEST_CODE static void guest_handle_set_irq_handler(struct api_call_2* cmd); +GUEST_CODE static void guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id); +GUEST_CODE static void guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id); +GUEST_CODE static void guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id); +GUEST_CODE static void guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id); typedef enum { UEXIT_END = (uint64)-1, UEXIT_IRQ = (uint64)-2, UEXIT_ASSERT = (uint64)-3, + UEXIT_STOP_L2 = (uint64)-4, } uexit_code; +typedef enum { + CPU_VENDOR_INTEL, + CPU_VENDOR_AMD, +} cpu_vendor_id; + __attribute__((naked)) GUEST_CODE static void dummy_null_handler() @@ -170,6 +197,18 @@ guest_main(uint64 size, uint64 cpu) } else if (call == SYZOS_API_SET_IRQ_HANDLER) { // Set the handler for a particular IRQ. guest_handle_set_irq_handler((struct api_call_2*)cmd); + } else if (call == SYZOS_API_ENABLE_NESTED) { + // Enable nested virtualization. + guest_handle_enable_nested((struct api_call_1*)cmd, cpu); + } else if (call == SYZOS_API_NESTED_CREATE_VM) { + // Create a nested VM. + guest_handle_nested_create_vm((struct api_call_1*)cmd, cpu); + } else if (call == SYZOS_API_NESTED_LOAD_CODE) { + // Load code into the nested VM. + guest_handle_nested_load_code((struct api_call_nested_load_code*)cmd, cpu); + } else if (call == SYZOS_API_NESTED_VMLAUNCH) { + // Launch the nested VM. 
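+			// (VMLAUNCH on Intel, VMRUN on AMD; dispatched by guest_handle_nested_vmlaunch() below.)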
+			guest_handle_nested_vmlaunch((struct api_call_1*)cmd, cpu);
 		}
 		addr += cmd->size;
 		size -= cmd->size;
 	}
@@ -398,4 +437,735 @@ GUEST_CODE static noinline void guest_handle_set_irq_handler(struct api_call_2*
 	set_idt_gate(vector, handler_addr);
 }
 
+GUEST_CODE static cpu_vendor_id get_cpu_vendor(void)
+{
+	uint32 ebx, eax = 0;
+
+	asm volatile(
+	    "cpuid"
+	    : "+a"(eax), "=b"(ebx)
+	    : // No explicit inputs, EAX is handled by +a.
+	    : "ecx", "edx");
+
+	if (ebx == 0x756e6547) { // "Genu[ineIntel]".
+		return CPU_VENDOR_INTEL;
+	} else if (ebx == 0x68747541) { // "Auth[enticAMD]".
+		return CPU_VENDOR_AMD;
+	} else {
+		// Should not happen on AMD64, but for completeness.
+		guest_uexit(UEXIT_ASSERT);
+		return CPU_VENDOR_INTEL; // Default to Intel if unknown.
+	}
+}
+
+GUEST_CODE static inline uint64 read_cr0(void)
+{
+	uint64 val;
+	asm volatile("mov %%cr0, %0" : "=r"(val));
+	return val;
+}
+
+GUEST_CODE static inline uint64 read_cr3(void)
+{
+	uint64 val;
+	asm volatile("mov %%cr3, %0" : "=r"(val));
+	return val;
+}
+
+GUEST_CODE static inline uint64 read_cr4(void)
+{
+	uint64 val;
+	asm volatile("mov %%cr4, %0" : "=r"(val));
+	return val;
+}
+
+GUEST_CODE static inline void write_cr4(uint64 val)
+{
+	asm volatile("mov %0, %%cr4" : : "r"(val));
+}
+
+GUEST_CODE static noinline void wrmsr(uint64 reg, uint64 val)
+{
+	asm volatile(
+	    "wrmsr"
+	    :
+	    : "c"(reg),
+	      "a"((uint32)val),
+	      "d"((uint32)(val >> 32))
+	    : "memory");
+}
+
+GUEST_CODE static noinline uint64 rdmsr(uint32 msr_id)
+{
+	// RDMSR returns the value split across EDX:EAX; combine it into a single 64-bit result.
+	uint32 lo, hi;
+	asm volatile("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr_id));
+	return ((uint64)hi << 32) | lo;
+}
+
+GUEST_CODE static noinline void vmwrite(uint64 field, uint64 value)
+{
+	uint8 error = 0; // nolint
+	// 'setna' sets the byte to 1 if CF=1 or ZF=1 (VMfail)
+	asm volatile("vmwrite %%rax, %%rbx; setna %0"
+		     : "=q"(error)
+		     : "a"(value), "b"(field)
+		     : "cc", "memory");
+	if (error)
+		guest_uexit(UEXIT_ASSERT);
+}
+
+GUEST_CODE static noinline uint64 vmread(uint64 field)
+{
+	uint64 value;
+	asm volatile("vmread %%rbx, %%rax"
+		     : "=a"(value)
+		     : "b"(field)
+		     : "cc");
+	return value;
+}
+
+GUEST_CODE static inline void nested_vmptrld(uint64 cpu_id, uint64 vm_id)
+{
+	uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id);
+	uint8 error = 0; // nolint
+	asm volatile("vmptrld %1; setna %0"
+		     : "=q"(error)
+		     : "m"(vmcs_addr)
+		     : "memory", "cc");
+	if (error)
+		guest_uexit(0xE2BAD2);
+}
+
+GUEST_CODE static noinline void vmcb_write16(uint64 vmcb, uint16 offset, uint16 val)
+{
+	*((volatile uint16*)(vmcb + offset)) = val;
+}
+
+GUEST_CODE static noinline void vmcb_write32(uint64 vmcb, uint16 offset, uint32 val)
+{
+	*((volatile uint32*)(vmcb + offset)) = val;
+}
+
+GUEST_CODE static noinline void vmcb_write64(uint64 vmcb, uint16 offset, uint64 val)
+{
+	*((volatile uint64*)(vmcb + offset)) = val;
+}
+
+GUEST_CODE static noinline uint64 vmcb_read64(volatile uint8* vmcb, uint16 offset)
+{
+	return *((volatile uint64*)(vmcb + offset));
+}
+
+GUEST_CODE static void guest_memset(void* s, uint8 c, int size)
+{
+	volatile uint8* p = (volatile uint8*)s;
+	for (int i = 0; i < size; i++)
+		p[i] = c;
+}
+
+GUEST_CODE static void guest_memcpy(void* dst, void* src, int size)
+{
+	volatile uint8* d = (volatile uint8*)dst;
+	volatile uint8* s = (volatile uint8*)src;
+	for (int i = 0; i < size; i++)
+		d[i] = s[i];
+}
+
+GUEST_CODE static noinline void
+nested_enable_vmx_intel(uint64 cpu_id)
+{
+	uint64 vmxon_addr = X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id);
+	uint64 cr4 = read_cr4();
+	cr4 |= X86_CR4_VMXE;
+	write_cr4(cr4);
+
+	uint64
feature_control = rdmsr(X86_MSR_IA32_FEATURE_CONTROL); + // Check if Lock bit (bit 0) is clear. + if ((feature_control & 1) == 0) { + // If unlocked, set Lock bit (bit 0) and Enable VMX outside SMX bit (bit 2). + feature_control |= 0b101; + asm volatile("wrmsr" : : "d"(0x0), "c"(X86_MSR_IA32_FEATURE_CONTROL), "A"(feature_control)); + } + + // Store revision ID at the beginning of VMXON. + *(uint32*)vmxon_addr = rdmsr(X86_MSR_IA32_VMX_BASIC); + uint8 error; + // Can't use enter_vmx_operation() yet, because VMCS is not valid. + asm volatile("vmxon %1; setna %0" + : "=q"(error) + : "m"(vmxon_addr) + : "memory", "cc"); + if (error) { + guest_uexit(0xE2BAD0); + return; + } +} + +GUEST_CODE static noinline void +nested_enable_svm_amd(uint64 cpu_id) +{ + // Get the Host Save Area (HSAVE) physical address for this CPU. + // The HSAVE area stores the host processor's state on VMRUN and is restored on VMEXIT. + uint64 hsave_addr = X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id); + + // Set the SVM Enable (SVME) bit in EFER. This enables SVM operations. + uint64 efer = rdmsr(X86_MSR_IA32_EFER); + efer |= X86_EFER_SVME; + wrmsr(X86_MSR_IA32_EFER, efer); + + // Write the physical address of the HSAVE area to the VM_HSAVE_PA MSR. + // This MSR tells the CPU where to save/restore host state during VMRUN/VMEXIT. + wrmsr(X86_MSR_VM_HSAVE_PA, hsave_addr); +} + +GUEST_CODE static noinline void +guest_handle_enable_nested(struct api_call_1* cmd, uint64 cpu_id) +{ + if (get_cpu_vendor() == CPU_VENDOR_INTEL) { + nested_enable_vmx_intel(cpu_id); + } else { + nested_enable_svm_amd(cpu_id); + } +} + +GUEST_CODE static noinline void setup_l2_page_tables(cpu_vendor_id vendor, uint64 cpu_id, uint64 vm_id) +{ + uint64 l2_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); + uint64 l2_pdpt_addr = l2_pml4_addr + KVM_PAGE_SIZE; + uint64 l2_pd_addr = l2_pml4_addr + 2 * KVM_PAGE_SIZE; + uint64 l2_pt_addr = l2_pml4_addr + 3 * KVM_PAGE_SIZE; + + volatile uint64* pml4 = (volatile uint64*)l2_pml4_addr; + volatile uint64* pdpt = (volatile uint64*)l2_pdpt_addr; + volatile uint64* pd = (volatile uint64*)l2_pd_addr; + volatile uint64* pt = (volatile uint64*)l2_pt_addr; + + guest_memset((void*)l2_pml4_addr, 0, KVM_PAGE_SIZE); + guest_memset((void*)l2_pdpt_addr, 0, KVM_PAGE_SIZE); + guest_memset((void*)l2_pd_addr, 0, KVM_PAGE_SIZE); + guest_memset((void*)l2_pt_addr, 0, KVM_PAGE_SIZE); + guest_memset((void*)X86_SYZOS_ADDR_MSR_BITMAP(cpu_id, vm_id), 0, KVM_PAGE_SIZE); + + // Intel EPT: set Read, Write, Execute. + // AMD NPT: set Present, Write, User. + uint64 flags = X86_PDE64_PRESENT | X86_PDE64_RW | X86_PDE64_USER; + // Create the 4-level page table entries using 4KB pages: + // PML4[0] -> points to PDPT + pml4[0] = l2_pdpt_addr | flags; + // PDPT[0] -> points to Page Directory (PD) + pdpt[0] = l2_pd_addr | flags; + // PD[0] -> points to Page Table (PT) (NO X86_PDE64_PS) + pd[0] = l2_pt_addr | flags; + // PT[0..511] -> maps 512 4KB pages (2MB total) identity + uint64 pt_flags = flags; + if (vendor == CPU_VENDOR_INTEL) { + pt_flags |= EPT_MEMTYPE_WB | EPT_ACCESSED | EPT_DIRTY; + } else { + pt_flags |= X86_PDE64_ACCESSED | X86_PDE64_DIRTY; + } + for (int i = 0; i < 512; i++) + pt[i] = (i * KVM_PAGE_SIZE) | pt_flags; +} + +GUEST_CODE static noinline void init_vmcs_control_fields(uint64 cpu_id, uint64 vm_id) +{ + // Read and write Pin-Based controls from TRUE MSR. 
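+	// For each IA32_VMX_TRUE_* capability MSR, a bit set in the low 32 bits means the
+	// corresponding control must be 1, and a bit set in the high 32 bits means it is
+	// allowed to be 1. Writing just the low dword therefore yields a minimal valid setting.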
+	uint64 vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS);
+	vmwrite(VMCS_PIN_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr);
+
+	// Setup Secondary Processor-Based controls: enable EPT.
+	vmx_msr = rdmsr(X86_MSR_IA32_VMX_PROCBASED_CTLS2);
+	uint32 sec_exec_ctl = (uint32)vmx_msr; // Must-be-1 bits.
+	sec_exec_ctl |= (uint32)(vmx_msr >> 32) & SECONDARY_EXEC_ENABLE_EPT; // Enable EPT if the CPU allows it.
+	vmwrite(VMCS_SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl);
+
+	// Read and write Primary Processor-Based controls from TRUE MSR.
+	// We also add the bit to enable the secondary controls.
+	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
+	vmwrite(VMCS_CPU_BASED_VM_EXEC_CONTROL, (uint32)vmx_msr | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | CPU_BASED_HLT_EXITING);
+
+	// Set up VM-Exit controls via TRUE MSR: indicate a 64-bit host.
+	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS);
+	vmwrite(VMCS_VM_EXIT_CONTROLS, (uint32)vmx_msr | VM_EXIT_HOST_ADDR_SPACE_SIZE);
+	// Read and write VM-Entry controls from TRUE MSR.
+	// We add the bit to indicate a 64-bit guest.
+	vmx_msr = rdmsr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS);
+	vmwrite(VMCS_VM_ENTRY_CONTROLS, (uint32)vmx_msr | VM_ENTRY_IA32E_MODE);
+
+	// Set up the EPT Pointer.
+	// We use the L2 PML4 address calculated in guest_handle_nested_create_vm().
+	// The EPT Pointer has:
+	// - Memory Type = 6 (Write-Back)
+	// - Page-Walk Length = 3 (meaning 4 levels: PML4, PDPT, PD, PT)
+	// - Address of the PML4 table
+	uint64 eptp = (X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id) & ~0xFFF) | (6 << 0) | (3 << 3);
+	vmwrite(VMCS_EPT_POINTER, eptp);
+
+	// Set CR0/CR4 masks and shadows.
+	// This simple setup (masks=0) means any guest CR0/CR4 write is allowed
+	// and won't cause a VM-Exit.
+	vmwrite(VMCS_CR0_GUEST_HOST_MASK, 0);
+	vmwrite(VMCS_CR4_GUEST_HOST_MASK, 0);
+	vmwrite(VMCS_CR0_READ_SHADOW, read_cr0());
+	vmwrite(VMCS_CR4_READ_SHADOW, read_cr4());
+
+	// Disable the bitmaps which we do not use.
+	vmwrite(VMCS_MSR_BITMAP, 0);
+	vmwrite(VMCS_VMREAD_BITMAP, 0);
+	vmwrite(VMCS_VMWRITE_BITMAP, 0);
+
+	// Intercept #UD (Invalid Opcode).
+	vmwrite(VMCS_EXCEPTION_BITMAP, (1 << 6));
+
+	// Clear unused/unsupported fields.
+	// TODO(glider): do we need these?
+	vmwrite(VMCS_VIRTUAL_PROCESSOR_ID, 0);
+	vmwrite(VMCS_POSTED_INTR_NV, 0);
+	vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MASK, 0);
+	vmwrite(VMCS_PAGE_FAULT_ERROR_CODE_MATCH, -1);
+	vmwrite(VMCS_CR3_TARGET_COUNT, 0);
+	vmwrite(VMCS_VM_EXIT_MSR_STORE_COUNT, 0);
+	vmwrite(VMCS_VM_EXIT_MSR_LOAD_COUNT, 0);
+	vmwrite(VMCS_VM_ENTRY_MSR_LOAD_COUNT, 0);
+	vmwrite(VMCS_VM_ENTRY_INTR_INFO_FIELD, 0);
+	vmwrite(VMCS_TPR_THRESHOLD, 0);
+}
+
+// Common L2 exit reasons for Intel and AMD.
+typedef enum {
+	SYZOS_NESTED_EXIT_REASON_HLT = 1,
+	SYZOS_NESTED_EXIT_REASON_UNKNOWN = 0xFF,
+} syz_nested_exit_reason;
+
+GUEST_CODE static void guest_uexit_l2(uint64 exit_reason, syz_nested_exit_reason mapped_reason,
+				      cpu_vendor_id vendor)
+{
+	if (mapped_reason != SYZOS_NESTED_EXIT_REASON_UNKNOWN) {
+		guest_uexit(0xe2e20000 | mapped_reason);
+	} else if (vendor == CPU_VENDOR_INTEL) {
+		guest_uexit(0xe2110000 | exit_reason);
+	} else {
+		guest_uexit(0xe2aa0000 | exit_reason);
+	}
+}
+
+GUEST_CODE static syz_nested_exit_reason map_intel_exit_reason(uint64 reason)
+{
+	volatile uint64 basic_reason = reason & 0xFFFF;
+	// EXIT_REASON_HLT.
+	if (basic_reason == 0xc)
+		return SYZOS_NESTED_EXIT_REASON_HLT;
+	return SYZOS_NESTED_EXIT_REASON_UNKNOWN;
+}
+
+// This function is called from inline assembly.
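+// The 'used' attribute keeps the compiler from discarding it: the only reference is the
+// 'call' instruction inside nested_vm_exit_handler_intel_asm(), which the compiler cannot see.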
+__attribute__((used)) +GUEST_CODE static void +nested_vm_exit_handler_intel(uint64 exit_reason, struct l2_guest_regs* regs) +{ + syz_nested_exit_reason mapped_reason = map_intel_exit_reason(exit_reason); + guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_INTEL); +} + +extern char after_vmentry_label; +__attribute__((naked)) GUEST_CODE static void nested_vm_exit_handler_intel_asm(void) +{ + asm volatile(R"( + // Save L2's GPRs. This creates the 'struct l2_guest_regs' on the stack. + // The order MUST match the struct. + push %%rax + push %%rbx + push %%rcx + push %%rdx + push %%rsi + push %%rdi + push %%rbp + push %%r8 + push %%r9 + push %%r10 + push %%r11 + push %%r12 + push %%r13 + push %%r14 + push %%r15 + + // Prepare arguments for the C handler: + // arg1 (RDI) = exit_reason + // arg2 (RSI) = pointer to the saved registers + mov %%rsp, %%rsi + mov %[vm_exit_reason], %%rbx + vmread %%rbx, %%rdi + + // Call the C handler. + call nested_vm_exit_handler_intel + + // The C handler has processed the exit. Now, return to the L1 command + // processing loop. VMX remains enabled. + add %[stack_cleanup_size], %%rsp + + // Jump to L1 main flow + jmp after_vmentry_label + )" + + : : [stack_cleanup_size] "i"(sizeof(struct l2_guest_regs)), + [vm_exit_reason] "i"(VMCS_VM_EXIT_REASON) : "memory", "cc", "rbx", "rdi", "rsi"); +} + +GUEST_CODE static syz_nested_exit_reason map_amd_exit_reason(uint64 reason) +{ + volatile uint64 basic_reason = reason & 0xFFFF; + // #VMEXIT_HLT. + if (basic_reason == 0x78) + return SYZOS_NESTED_EXIT_REASON_HLT; + return SYZOS_NESTED_EXIT_REASON_UNKNOWN; +} + +__attribute__((used)) GUEST_CODE static void +nested_vm_exit_handler_amd(uint64 exit_reason, uint64 cpu_id, uint64 vm_id) +{ + syz_nested_exit_reason mapped_reason = map_amd_exit_reason(exit_reason); + guest_uexit_l2(exit_reason, mapped_reason, CPU_VENDOR_AMD); +} + +GUEST_CODE static noinline void init_vmcs_host_state(void) +{ + // Segment Selectors. + vmwrite(VMCS_HOST_CS_SELECTOR, X86_SYZOS_SEL_CODE); + vmwrite(VMCS_HOST_DS_SELECTOR, X86_SYZOS_SEL_DATA); + vmwrite(VMCS_HOST_ES_SELECTOR, X86_SYZOS_SEL_DATA); + vmwrite(VMCS_HOST_SS_SELECTOR, X86_SYZOS_SEL_DATA); + vmwrite(VMCS_HOST_FS_SELECTOR, X86_SYZOS_SEL_DATA); + vmwrite(VMCS_HOST_GS_SELECTOR, X86_SYZOS_SEL_DATA); + vmwrite(VMCS_HOST_TR_SELECTOR, X86_SYZOS_SEL_TSS64); + + // Base addresses. + vmwrite(VMCS_HOST_TR_BASE, 0); + vmwrite(VMCS_HOST_GDTR_BASE, X86_SYZOS_ADDR_GDT); + vmwrite(VMCS_HOST_IDTR_BASE, X86_SYZOS_ADDR_VAR_IDT); + vmwrite(VMCS_HOST_FS_BASE, rdmsr(X86_MSR_FS_BASE)); + vmwrite(VMCS_HOST_GS_BASE, rdmsr(X86_MSR_GS_BASE)); + + // RIP and RSP. + uint64 tmpreg = 0; // nolint + asm volatile("mov %%rsp, %0" : "=r"(tmpreg)); + vmwrite(VMCS_HOST_RSP, tmpreg); + vmwrite(VMCS_HOST_RIP, (uintptr_t)nested_vm_exit_handler_intel_asm); + + // Control Registers. + vmwrite(VMCS_HOST_CR0, read_cr0()); + vmwrite(VMCS_HOST_CR3, read_cr3()); + vmwrite(VMCS_HOST_CR4, read_cr4()); + + // MSRs. 
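+	// Mirror L1's current MSR values. The SYSENTER fields are always restored on VM exit;
+	// PAT/EFER/PERF_GLOBAL_CTRL are restored only if the corresponding VM-exit controls are set.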
+ vmwrite(VMCS_HOST_IA32_PAT, rdmsr(X86_MSR_IA32_CR_PAT)); + vmwrite(VMCS_HOST_IA32_EFER, rdmsr(X86_MSR_IA32_EFER)); + vmwrite(VMCS_HOST_IA32_PERF_GLOBAL_CTRL, rdmsr(X86_MSR_CORE_PERF_GLOBAL_CTRL)); + vmwrite(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(X86_MSR_IA32_SYSENTER_CS)); + vmwrite(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(X86_MSR_IA32_SYSENTER_ESP)); + vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(X86_MSR_IA32_SYSENTER_EIP)); +} + +#define COPY_VMCS_FIELD(GUEST_FIELD, HOST_FIELD) \ + vmwrite(GUEST_FIELD, vmread(HOST_FIELD)) + +#define SETUP_L2_SEGMENT(SEG, SELECTOR, BASE, LIMIT, AR) \ + vmwrite(VMCS_GUEST_##SEG##_SELECTOR, SELECTOR); \ + vmwrite(VMCS_GUEST_##SEG##_BASE, BASE); \ + vmwrite(VMCS_GUEST_##SEG##_LIMIT, LIMIT); \ + vmwrite(VMCS_GUEST_##SEG##_ACCESS_RIGHTS, AR); + +GUEST_CODE static noinline void init_vmcs_guest_state(uint64 cpu_id, uint64 vm_id) +{ + uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id); + uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id); + // Segment Registers. + SETUP_L2_SEGMENT(CS, vmread(VMCS_HOST_CS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_CODE); + SETUP_L2_SEGMENT(DS, vmread(VMCS_HOST_DS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK); + SETUP_L2_SEGMENT(ES, vmread(VMCS_HOST_ES_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK); + SETUP_L2_SEGMENT(SS, vmread(VMCS_HOST_SS_SELECTOR), 0, 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK); + SETUP_L2_SEGMENT(FS, vmread(VMCS_HOST_FS_SELECTOR), vmread(VMCS_HOST_FS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK); + SETUP_L2_SEGMENT(GS, vmread(VMCS_HOST_GS_SELECTOR), vmread(VMCS_HOST_GS_BASE), 0xFFFFFFFF, VMX_AR_64BIT_DATA_STACK); + + // Task and LDT Registers. + SETUP_L2_SEGMENT(TR, vmread(VMCS_HOST_TR_SELECTOR), vmread(VMCS_HOST_TR_BASE), 0x67, VMX_AR_TSS_BUSY); + SETUP_L2_SEGMENT(LDTR, 0, 0, 0, VMX_AR_LDTR_UNUSABLE); + + // Control Registers & CPU State. + vmwrite(VMCS_GUEST_CR0, vmread(VMCS_HOST_CR0)); + vmwrite(VMCS_GUEST_CR3, vmread(VMCS_HOST_CR3)); + vmwrite(VMCS_GUEST_CR4, vmread(VMCS_HOST_CR4)); + vmwrite(VMCS_GUEST_RIP, l2_code_addr); + vmwrite(VMCS_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); + vmwrite(VMCS_GUEST_RFLAGS, RFLAGS_1_BIT); + // TODO + vmwrite(VMCS_GUEST_DR7, 0x400); + + // MSRs - Copy from host or set to default. + COPY_VMCS_FIELD(VMCS_GUEST_IA32_EFER, VMCS_HOST_IA32_EFER); + COPY_VMCS_FIELD(VMCS_GUEST_IA32_PAT, VMCS_HOST_IA32_PAT); + COPY_VMCS_FIELD(VMCS_GUEST_IA32_PERF_GLOBAL_CTRL, VMCS_HOST_IA32_PERF_GLOBAL_CTRL); + COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_CS, VMCS_HOST_IA32_SYSENTER_CS); + COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_ESP, VMCS_HOST_IA32_SYSENTER_ESP); + COPY_VMCS_FIELD(VMCS_GUEST_SYSENTER_EIP, VMCS_HOST_IA32_SYSENTER_EIP); + vmwrite(VMCS_GUEST_IA32_DEBUGCTL, 0); + + // Descriptor Tables. + vmwrite(VMCS_GUEST_GDTR_BASE, vmread(VMCS_HOST_GDTR_BASE)); + vmwrite(VMCS_GUEST_GDTR_LIMIT, 0xffff); + vmwrite(VMCS_GUEST_IDTR_BASE, vmread(VMCS_HOST_IDTR_BASE)); + vmwrite(VMCS_GUEST_IDTR_LIMIT, 0xffff); + + // Miscellaneous Fields. + vmwrite(VMCS_LINK_POINTER, 0xffffffffffffffff); + // 0 = Active. 
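+	// (Other activity-state encodings: 1 = HLT, 2 = Shutdown, 3 = Wait-for-SIPI.)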
+ vmwrite(VMCS_GUEST_ACTIVITY_STATE, 0); + vmwrite(VMCS_GUEST_INTERRUPTIBILITY_INFO, 0); + vmwrite(VMCS_GUEST_PENDING_DBG_EXCEPTIONS, 0); + vmwrite(VMCS_VMX_PREEMPTION_TIMER_VALUE, 0); + vmwrite(VMCS_GUEST_INTR_STATUS, 0); + vmwrite(VMCS_GUEST_PML_INDEX, 0); +} + +GUEST_CODE static noinline void +nested_create_vm_intel(struct api_call_1* cmd, uint64 cpu_id) +{ + uint64 vm_id = cmd->arg; + uint64 vmcs_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + uint8 error = 0; // nolint + + *(uint32*)vmcs_addr = rdmsr(X86_MSR_IA32_VMX_BASIC); + asm volatile("vmclear %1; setna %0" + : "=q"(error) + : "m"(vmcs_addr) + : "memory", "cc"); + if (error) { + guest_uexit(0xE2BAD1); + return; + } + nested_vmptrld(cpu_id, vm_id); + + setup_l2_page_tables(CPU_VENDOR_INTEL, cpu_id, vm_id); + init_vmcs_control_fields(cpu_id, vm_id); + init_vmcs_host_state(); + init_vmcs_guest_state(cpu_id, vm_id); +} + +// Helper for setting up a segment in the VMCB +#define SETUP_L2_SEGMENT_SVM(VMBC_PTR, SEG_NAME, SELECTOR, BASE, LIMIT, ATTR) \ + vmcb_write16(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_SEL, SELECTOR); \ + vmcb_write16(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_ATTR, ATTR); \ + vmcb_write32(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_LIM, LIMIT); \ + vmcb_write64(VMBC_PTR, VMCB_GUEST_##SEG_NAME##_BASE, BASE); + +GUEST_CODE static noinline void init_vmcb_guest_state(uint64 cpu_id, uint64 vm_id) +{ + uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id); + uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id); + uint64 npt_pml4_addr = X86_SYZOS_ADDR_VM_PGTABLE(cpu_id, vm_id); + // Setup Guest Segment Registers. + // We copy the L1 guest's segment setup, as it's a good 64-bit environment. + SETUP_L2_SEGMENT_SVM(vmcb_addr, CS, X86_SYZOS_SEL_CODE, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_CODE); + SETUP_L2_SEGMENT_SVM(vmcb_addr, DS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); + SETUP_L2_SEGMENT_SVM(vmcb_addr, ES, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); + SETUP_L2_SEGMENT_SVM(vmcb_addr, SS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); + SETUP_L2_SEGMENT_SVM(vmcb_addr, FS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); + SETUP_L2_SEGMENT_SVM(vmcb_addr, GS, X86_SYZOS_SEL_DATA, 0, 0xFFFFFFFF, SVM_ATTR_64BIT_DATA); + + // Task Register (TR). Must point to a valid, present, 64-bit TSS. + SETUP_L2_SEGMENT_SVM(vmcb_addr, TR, X86_SYZOS_SEL_TSS64, X86_SYZOS_ADDR_VAR_TSS, 0x67, VMX_AR_TSS_AVAILABLE); + + // LDT Register (LDTR) - Mark as unusable. + // A null selector and attribute is the correct way to disable LDTR. + SETUP_L2_SEGMENT_SVM(vmcb_addr, LDTR, 0, 0, 0, SVM_ATTR_LDTR_UNUSABLE); + + // Setup Guest Control Registers & CPU State. + uint64 efer = rdmsr(X86_MSR_IA32_EFER); + vmcb_write64(vmcb_addr, VMCB_GUEST_CR0, read_cr0() | X86_CR0_WP); + // L2 will use L1's page tables. + vmcb_write64(vmcb_addr, VMCB_GUEST_CR3, read_cr3()); + vmcb_write64(vmcb_addr, VMCB_GUEST_CR4, read_cr4()); + vmcb_write64(vmcb_addr, VMCB_GUEST_RIP, l2_code_addr); + vmcb_write64(vmcb_addr, VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); + vmcb_write64(vmcb_addr, VMCB_GUEST_RFLAGS, RFLAGS_1_BIT); + + // Setup Guest MSRs. + + // SYSCALL/SYSRET MSRs. 
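+	// Debug state is cleared; SYSCALL/SYSRET is disabled for L2 below by masking out EFER.SCE.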
+ vmcb_write64(vmcb_addr, VMCB_GUEST_DEBUGCTL, 0); + vmcb_write64(vmcb_addr, VMCB_GUEST_DR6, 0x0); + vmcb_write64(vmcb_addr, VMCB_GUEST_DR7, 0x0); + + vmcb_write64(vmcb_addr, VMCB_GUEST_EFER, efer & ~X86_EFER_SCE); + vmcb_write64(vmcb_addr, VMCB_GUEST_PAT, rdmsr(X86_MSR_IA32_CR_PAT)); + + // Setup Guest Descriptor Tables. + struct { + uint16 limit; + uint64 base; + } __attribute__((packed)) gdtr, idtr; + asm volatile("sgdt %0" : "=m"(gdtr)); + asm volatile("sidt %0" : "=m"(idtr)); + vmcb_write64(vmcb_addr, VMCB_GUEST_GDTR_BASE, gdtr.base); + vmcb_write32(vmcb_addr, VMCB_GUEST_GDTR_LIM, gdtr.limit); + vmcb_write64(vmcb_addr, VMCB_GUEST_IDTR_BASE, idtr.base); + vmcb_write32(vmcb_addr, VMCB_GUEST_IDTR_LIM, idtr.limit); + + // Setup VMCB Control Fields. + vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC3, VMCB_CTRL_INTERCEPT_HLT); + vmcb_write32(vmcb_addr, VMCB_CTRL_INTERCEPT_VEC4, VMCB_CTRL_INTERCEPT_VEC4_ALL); + + // Enable Nested Paging (NPT): + // Write '1' to the NPT Enable field (0x090). + vmcb_write64(vmcb_addr, VMCB_CTRL_NP_ENABLE, (1 << VMCB_CTRL_NPT_ENABLE_BIT)); + + // 2Write the NPT root address to N_CR3 (0x098) + // Unlike Intel's EPTP, AMD's N_CR3 field is *only* the + // 4K-aligned physical address of the PML4 table. + // It does not contain any control bits. + uint64 npt_pointer = (npt_pml4_addr & ~0xFFF); + vmcb_write64(vmcb_addr, VMCB_CTRL_N_CR3, npt_pointer); + + // Set Guest ASID. + vmcb_write32(vmcb_addr, VMCB_CTRL_ASID, 1); +} + +GUEST_CODE static noinline void +nested_create_vm_amd(struct api_call_1* cmd, uint64 cpu_id) +{ + uint64 vm_id = cmd->arg; + uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + + guest_memset((void*)vmcb_addr, 0, KVM_PAGE_SIZE); + guest_memset((void*)X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu_id), 0, KVM_PAGE_SIZE); + + // Setup NPT (Nested Page Tables) + setup_l2_page_tables(CPU_VENDOR_AMD, cpu_id, vm_id); + + // Initialize VMCB Control and Guest State + init_vmcb_guest_state(cpu_id, vm_id); +} + +GUEST_CODE static noinline void +guest_handle_nested_create_vm(struct api_call_1* cmd, uint64 cpu_id) +{ + if (get_cpu_vendor() == CPU_VENDOR_INTEL) { + nested_create_vm_intel(cmd, cpu_id); + } else { + nested_create_vm_amd(cmd, cpu_id); + } +} + +GUEST_CODE static noinline void +guest_handle_nested_load_code(struct api_call_nested_load_code* cmd, uint64 cpu_id) +{ + uint64 vm_id = cmd->vm_id; + uint64 l2_code_addr = X86_SYZOS_ADDR_VM_CODE(cpu_id, vm_id); + uint64 l2_stack_addr = X86_SYZOS_ADDR_VM_STACK(cpu_id, vm_id); + // Code size = command size - header size - vm_id size. + uint64 l2_code_size = cmd->header.size - sizeof(struct api_call_header) - sizeof(uint64); + if (l2_code_size > KVM_PAGE_SIZE) + l2_code_size = KVM_PAGE_SIZE; + guest_memcpy((void*)l2_code_addr, (void*)cmd->insns, + l2_code_size); + if (get_cpu_vendor() == CPU_VENDOR_INTEL) { + nested_vmptrld(cpu_id, vm_id); + vmwrite(VMCS_GUEST_RIP, l2_code_addr); + vmwrite(VMCS_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); + } else { + vmcb_write64(X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id), VMCB_GUEST_RIP, l2_code_addr); + vmcb_write64(X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id), VMCB_GUEST_RSP, l2_stack_addr + KVM_PAGE_SIZE - 8); + } +} + +GUEST_CODE static noinline void +guest_handle_nested_vmentry_intel(struct api_call_1* cmd, uint64 cpu_id, bool is_launch) +{ + uint64 vm_id = cmd->arg; + uint64 vmx_error_code = 0; + uint8 fail_flag = 0; // Will be 1 if EITHER CF or ZF is set + + nested_vmptrld(cpu_id, vm_id); + + if (is_launch) { + asm volatile(R"( + // Attempt to launch the L2 guest. 
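+		// On success, execution continues in L2 and returns through the VM-exit handler
+		// installed in VMCS_HOST_RIP, so the instructions below only run if VMLAUNCH itself fails.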
+ vmlaunch + // Set AL to 1 if CF=1 (VMfailValid) + setc %%al + // Set BL to 1 if ZF=1 (VMfailInvalid) + setz %%bl + or %%bl, %%al)" + : "=a"(fail_flag) + : + : "rbx", "cc", "memory"); + } else { + asm volatile(R"( + // Attempt to resume the L2 guest. + vmresume + // Set AL to 1 if CF=1 (VMfailValid) + setc %%al + // Set BL to 1 if ZF=1 (VMfailInvalid) + setz %%bl + or %%bl, %%al)" + : "=a"(fail_flag) + : + : "rbx", "cc", "memory"); + } + asm volatile(".globl after_vmentry_label\nafter_vmentry_label:"); + if (fail_flag) { + // VMLAUNCH/VMRESUME failed, so VMCS is still valid and can be read. + vmx_error_code = vmread(VMCS_VM_INSTRUCTION_ERROR); + guest_uexit(0xE2E10000 | (uint32)vmx_error_code); + } else { + // This path is only taken if VMLAUNCH/VMRESUME truly succeeded (CF=0 and ZF=0) + // and the L2 guest has run and exited. + guest_uexit(UEXIT_STOP_L2); + } +} + +GUEST_CODE static noinline void +guest_run_amd_vm(uint64 cpu_id, uint64 vm_id) +{ + uint64 vmcb_addr = X86_SYZOS_ADDR_VMCS_VMCB(cpu_id, vm_id); + volatile uint8* vmcb_ptr = (volatile uint8*)vmcb_addr; + uint8 fail_flag = 0; + + asm volatile( + "mov %1, %%rax\n\t" // Load VMCB physical address into RAX + "vmrun\n\t" // Launch or resume L2 guest + "setc %0\n\t" + : "=q"(fail_flag) + : "m"(vmcb_addr) + : "rax", "cc", "memory"); + + if (fail_flag) { + // VMRUN failed. + guest_uexit(0xE2E10000 | 0xFFFF); + return; + } + + // VMRUN succeeded and we have a VM-exit. + uint64 exit_reason = vmcb_read64(vmcb_ptr, VMCB_EXIT_CODE); + nested_vm_exit_handler_amd(exit_reason, cpu_id, vm_id); + guest_uexit(UEXIT_STOP_L2); +} + +GUEST_CODE static noinline void +guest_handle_nested_vmlaunch_amd(struct api_call_1* cmd, uint64 cpu_id, uint64 vm_id) +{ + guest_run_amd_vm(cpu_id, vm_id); +} + +GUEST_CODE static noinline void +guest_handle_nested_vmlaunch(struct api_call_1* cmd, uint64 cpu_id) +{ + uint64 vm_id = cmd->arg; + if (get_cpu_vendor() == CPU_VENDOR_INTEL) { + guest_handle_nested_vmentry_intel(cmd, cpu_id, true); + } else { + guest_handle_nested_vmlaunch_amd(cmd, cpu_id, vm_id); + } +} + #endif // EXECUTOR_COMMON_KVM_AMD64_SYZOS_H diff --git a/executor/kvm.h b/executor/kvm.h index fb7b5b49dc7c..53ba0088854f 100644 --- a/executor/kvm.h +++ b/executor/kvm.h @@ -48,6 +48,7 @@ // Pool of 32 pages for dynamic PT/PD allocations. #define X86_SYZOS_ADDR_PT_POOL 0x5000 #define X86_SYZOS_ADDR_VAR_IDT 0x25000 +#define X86_SYZOS_ADDR_VAR_TSS 0x26000 #define X86_SYZOS_ADDR_SMRAM 0x30000 // Write to this page to trigger a page fault and stop KVM_RUN. @@ -59,14 +60,72 @@ // Location of the SYZOS guest code. Name shared with ARM64 SYZOS. #define SYZOS_ADDR_EXECUTOR_CODE 0x54000 #define X86_SYZOS_ADDR_SCRATCH_CODE 0x58000 -#define X86_SYZOS_ADDR_STACK_BOTTOM 0x90000 -#define X86_SYZOS_ADDR_STACK0 0x90f80 +#define X86_SYZOS_ADDR_STACK_BOTTOM 0x60000 +#define X86_SYZOS_ADDR_STACK0 0x60f80 + +// Base address for all per-L1-VCPU regions. +#define X86_SYZOS_PER_VCPU_REGIONS_BASE 0x70000 +// Size of the entire memory block allocated for a single L1 VCPU to manage its L2 VMs. +// We need space for 1 VMXON page + 4 L2 VMs. Let's allocate 256KB per L1 VCPU for ample space. +#define X86_SYZOS_L1_VCPU_REGION_SIZE 0x40000 + +// Offsets within a single L1 VCPU's region. + +// Shared data for the L1 VCPU itself: 1 page for VMXON/HSAVE +#define X86_SYZOS_L1_VCPU_OFFSET_VM_ARCH_SPECIFIC 0x0000 +// Base offset for the area containing the 4 L2 VM slots. +#define X86_SYZOS_L1_VCPU_OFFSET_L2_VMS_AREA 0x1000 + +// Layout of a single L2 VM's data block. 
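+// For reference, each L2 VM slot uses the offsets defined below:
+//   +0x0000 VMCS (Intel) / VMCB (AMD) page
+//   +0x1000 L2 stack
+//   +0x2000 L2 code
+//   +0x3000 L2 EPT/NPT page tables (4 pages)
+//   +0x7000 MSR bitmap
+// e.g. X86_SYZOS_ADDR_VM_CODE(1, 2) = 0x70000 + 0x40000 + 0x1000 + 2 * 0x8000 + 0x2000 = 0xc3000.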
+ +// Size of the memory block for a single L2 VM. +#define X86_SYZOS_L2_VM_REGION_SIZE 0x8000 + +// Offsets within a single L2 VM's region. +#define X86_SYZOS_L2_VM_OFFSET_VMCS_VMCB 0x0000 +#define X86_SYZOS_L2_VM_OFFSET_VM_STACK 0x1000 +#define X86_SYZOS_L2_VM_OFFSET_VM_CODE 0x2000 +// 4 pages for L2 EPT/NPT. +#define X86_SYZOS_L2_VM_OFFSET_VM_PGTABLE 0x3000 +#define X86_SYZOS_L2_VM_OFFSET_MSR_BITMAP 0x7000 + +// Subsequent addresses are shifted to accommodate all L1 VCPU regions. #define X86_SYZOS_ADDR_UNUSED 0x200000 #define X86_SYZOS_ADDR_IOAPIC 0xfec00000 +#define X86_SYZOS_ADDR_VMCS_VMCB(cpu, vm) \ + (X86_SYZOS_PER_VCPU_REGIONS_BASE + (cpu) * X86_SYZOS_L1_VCPU_REGION_SIZE + \ + X86_SYZOS_L1_VCPU_OFFSET_L2_VMS_AREA + (vm) * X86_SYZOS_L2_VM_REGION_SIZE + \ + X86_SYZOS_L2_VM_OFFSET_VMCS_VMCB) + +#define X86_SYZOS_ADDR_VM_CODE(cpu, vm) \ + (X86_SYZOS_PER_VCPU_REGIONS_BASE + (cpu) * X86_SYZOS_L1_VCPU_REGION_SIZE + \ + X86_SYZOS_L1_VCPU_OFFSET_L2_VMS_AREA + (vm) * X86_SYZOS_L2_VM_REGION_SIZE + \ + X86_SYZOS_L2_VM_OFFSET_VM_CODE) + +#define X86_SYZOS_ADDR_VM_STACK(cpu, vm) \ + (X86_SYZOS_PER_VCPU_REGIONS_BASE + (cpu) * X86_SYZOS_L1_VCPU_REGION_SIZE + \ + X86_SYZOS_L1_VCPU_OFFSET_L2_VMS_AREA + (vm) * X86_SYZOS_L2_VM_REGION_SIZE + \ + X86_SYZOS_L2_VM_OFFSET_VM_STACK) + +#define X86_SYZOS_ADDR_VM_PGTABLE(cpu, vm) \ + (X86_SYZOS_PER_VCPU_REGIONS_BASE + (cpu) * X86_SYZOS_L1_VCPU_REGION_SIZE + \ + X86_SYZOS_L1_VCPU_OFFSET_L2_VMS_AREA + (vm) * X86_SYZOS_L2_VM_REGION_SIZE + \ + X86_SYZOS_L2_VM_OFFSET_VM_PGTABLE) + +#define X86_SYZOS_ADDR_MSR_BITMAP(cpu, vm) \ + (X86_SYZOS_PER_VCPU_REGIONS_BASE + (cpu) * X86_SYZOS_L1_VCPU_REGION_SIZE + \ + X86_SYZOS_L1_VCPU_OFFSET_L2_VMS_AREA + (vm) * X86_SYZOS_L2_VM_REGION_SIZE + \ + X86_SYZOS_L2_VM_OFFSET_MSR_BITMAP) + +#define X86_SYZOS_ADDR_VM_ARCH_SPECIFIC(cpu) \ + (X86_SYZOS_PER_VCPU_REGIONS_BASE + (cpu) * X86_SYZOS_L1_VCPU_REGION_SIZE + \ + X86_SYZOS_L1_VCPU_OFFSET_VM_ARCH_SPECIFIC) + // SYZOS segment selectors #define X86_SYZOS_SEL_CODE 0x8 #define X86_SYZOS_SEL_DATA 0x10 +#define X86_SYZOS_SEL_TSS64 0x18 #define X86_CR0_PE 1ULL #define X86_CR0_MP (1ULL << 1) @@ -125,6 +184,11 @@ #define X86_PDE64_PS (1ULL << 7) #define X86_PDE64_G (1ULL << 8) +// Intel-specific EPT Flags. +#define EPT_MEMTYPE_WB (6ULL << 3) +#define EPT_ACCESSED (1ULL << 8) +#define EPT_DIRTY (1ULL << 9) + #define X86_SEL_LDT (1 << 3) #define X86_SEL_CS16 (2 << 3) #define X86_SEL_DS16 (3 << 3) @@ -156,16 +220,269 @@ #define X86_SEL_TSS64_CPL3 ((29 << 3) + 3) #define X86_SEL_TSS64_CPL3_HI (30 << 3) +// Model-Specific Registers (MSRs). 
#define X86_MSR_IA32_FEATURE_CONTROL 0x3a #define X86_MSR_IA32_VMX_BASIC 0x480 #define X86_MSR_IA32_SMBASE 0x9e #define X86_MSR_IA32_SYSENTER_CS 0x174 #define X86_MSR_IA32_SYSENTER_ESP 0x175 #define X86_MSR_IA32_SYSENTER_EIP 0x176 +#define X86_MSR_IA32_CR_PAT 0x277 +#define X86_MSR_CORE_PERF_GLOBAL_CTRL 0x38f +#define X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS 0x48d +#define X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x48e +#define X86_MSR_IA32_VMX_TRUE_EXIT_CTLS 0x48f +#define X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS 0x490 +#define X86_MSR_IA32_EFER 0xc0000080 #define X86_MSR_IA32_STAR 0xC0000081 #define X86_MSR_IA32_LSTAR 0xC0000082 +#define X86_MSR_FS_BASE 0xc0000100 +#define X86_MSR_GS_BASE 0xc0000101 +#define X86_MSR_VM_HSAVE_PA 0xc0010117 #define X86_MSR_IA32_VMX_PROCBASED_CTLS2 0x48B +// VMX control bits +#define RFLAGS_1_BIT (1ULL << 1) +#define CPU_BASED_HLT_EXITING (1U << 7) +#define AR_TSS_AVAILABLE 0x0089 +#define SVM_ATTR_LDTR_UNUSABLE 0x0000 +#define VMX_AR_TSS_BUSY 0x008b +#define VMX_AR_TSS_AVAILABLE 0x0089 +#define VMX_AR_LDTR_UNUSABLE 0x10000 +#define VM_ENTRY_IA32E_MODE (1U << 9) +#define SECONDARY_EXEC_ENABLE_EPT (1U << 1) +#define VM_EXIT_HOST_ADDR_SPACE_SIZE (1U << 9) +#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS (1U << 31) + +#define VMX_ACCESS_RIGHTS_P (1 << 7) +#define VMX_ACCESS_RIGHTS_S (1 << 4) +#define VMX_ACCESS_RIGHTS_TYPE_A (1 << 0) +#define VMX_ACCESS_RIGHTS_TYPE_RW (1 << 1) +#define VMX_ACCESS_RIGHTS_TYPE_E (1 << 3) +#define VMX_ACCESS_RIGHTS_G (1 << 15) +#define VMX_ACCESS_RIGHTS_DB (1 << 14) +#define VMX_ACCESS_RIGHTS_L (1 << 13) + +// This is a 64-bit data/stack segment: +// P=1, S=1, Type=3 (RW+Accessed), G=1, DB=1, L=0 +#define VMX_AR_64BIT_DATA_STACK (VMX_ACCESS_RIGHTS_P | VMX_ACCESS_RIGHTS_S | \ + VMX_ACCESS_RIGHTS_TYPE_RW | VMX_ACCESS_RIGHTS_TYPE_A | \ + VMX_ACCESS_RIGHTS_G | VMX_ACCESS_RIGHTS_DB) + +// This is a 64-bit code segment: +// P=1, S=1, Type=11 (Exec/Read+Accessed), G=1, DB=0, L=1 +#define VMX_AR_64BIT_CODE (VMX_ACCESS_RIGHTS_P | VMX_ACCESS_RIGHTS_S | \ + VMX_ACCESS_RIGHTS_TYPE_E | VMX_ACCESS_RIGHTS_TYPE_RW | \ + VMX_ACCESS_RIGHTS_TYPE_A | VMX_ACCESS_RIGHTS_G | \ + VMX_ACCESS_RIGHTS_L) + +// VMCS Control Fields. +#define VMCS_VIRTUAL_PROCESSOR_ID 0x00000000 +#define VMCS_POSTED_INTR_NV 0x00000002 +#define VMCS_MSR_BITMAP 0x00002004 +#define VMCS_VMREAD_BITMAP 0x00002006 +#define VMCS_VMWRITE_BITMAP 0x00002008 +#define VMCS_EPT_POINTER 0x0000201a +#define VMCS_LINK_POINTER 0x00002800 +#define VMCS_PIN_BASED_VM_EXEC_CONTROL 0x00004000 +#define VMCS_CPU_BASED_VM_EXEC_CONTROL 0x00004002 +#define VMCS_EXCEPTION_BITMAP 0x00004004 +#define VMCS_PAGE_FAULT_ERROR_CODE_MASK 0x00004006 +#define VMCS_PAGE_FAULT_ERROR_CODE_MATCH 0x00004008 +#define VMCS_CR3_TARGET_COUNT 0x0000400a +#define VMCS_VM_EXIT_CONTROLS 0x0000400c +#define VMCS_VM_EXIT_MSR_STORE_COUNT 0x0000400e +#define VMCS_VM_EXIT_MSR_LOAD_COUNT 0x00004010 +#define VMCS_VM_ENTRY_CONTROLS 0x00004012 +#define VMCS_VM_ENTRY_MSR_LOAD_COUNT 0x00004014 +#define VMCS_VM_ENTRY_INTR_INFO_FIELD 0x00004016 +#define VMCS_TPR_THRESHOLD 0x0000401c +#define VMCS_SECONDARY_VM_EXEC_CONTROL 0x0000401e +#define VMCS_VM_INSTRUCTION_ERROR 0x00004400 +#define VMCS_VM_EXIT_REASON 0x00004402 +#define VMCS_VMX_PREEMPTION_TIMER_VALUE 0x0000482e +#define VMCS_CR0_GUEST_HOST_MASK 0x00006000 +#define VMCS_CR4_GUEST_HOST_MASK 0x00006002 +#define VMCS_CR0_READ_SHADOW 0x00006004 +#define VMCS_CR4_READ_SHADOW 0x00006006 + +// VMCS Host State Fields. 
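+// (Field encodings below follow the Intel SDM, Vol. 3, Appendix B.)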
+#define VMCS_HOST_ES_SELECTOR 0x00000c00 +#define VMCS_HOST_CS_SELECTOR 0x00000c02 +#define VMCS_HOST_SS_SELECTOR 0x00000c04 +#define VMCS_HOST_DS_SELECTOR 0x00000c06 +#define VMCS_HOST_FS_SELECTOR 0x00000c08 +#define VMCS_HOST_GS_SELECTOR 0x00000c0a +#define VMCS_HOST_TR_SELECTOR 0x00000c0c +#define VMCS_HOST_IA32_PAT 0x00002c00 +#define VMCS_HOST_IA32_EFER 0x00002c02 +#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002c04 +#define VMCS_HOST_IA32_SYSENTER_CS 0x00004c00 +#define VMCS_HOST_CR0 0x00006c00 +#define VMCS_HOST_CR3 0x00006c02 +#define VMCS_HOST_CR4 0x00006c04 +#define VMCS_HOST_FS_BASE 0x00006c06 +#define VMCS_HOST_GS_BASE 0x00006c08 +#define VMCS_HOST_TR_BASE 0x00006c0a +#define VMCS_HOST_GDTR_BASE 0x00006c0c +#define VMCS_HOST_IDTR_BASE 0x00006c0e +#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006c10 +#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006c12 +#define VMCS_HOST_RSP 0x00006c14 +#define VMCS_HOST_RIP 0x00006c16 + +// VMCS Guest State Fields. +#define VMCS_GUEST_INTR_STATUS 0x00000810 +#define VMCS_GUEST_PML_INDEX 0x00000812 +#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802 +#define VMCS_GUEST_IA32_PAT 0x00002804 +#define VMCS_GUEST_IA32_EFER 0x00002806 +#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808 +#define VMCS_GUEST_ES_SELECTOR 0x00000800 +#define VMCS_GUEST_CS_SELECTOR 0x00000802 +#define VMCS_GUEST_SS_SELECTOR 0x00000804 +#define VMCS_GUEST_DS_SELECTOR 0x00000806 +#define VMCS_GUEST_FS_SELECTOR 0x00000808 +#define VMCS_GUEST_GS_SELECTOR 0x0000080a +#define VMCS_GUEST_LDTR_SELECTOR 0x0000080c +#define VMCS_GUEST_TR_SELECTOR 0x0000080e +#define VMCS_GUEST_ES_LIMIT 0x00004800 +#define VMCS_GUEST_CS_LIMIT 0x00004802 +#define VMCS_GUEST_SS_LIMIT 0x00004804 +#define VMCS_GUEST_DS_LIMIT 0x00004806 +#define VMCS_GUEST_FS_LIMIT 0x00004808 +#define VMCS_GUEST_GS_LIMIT 0x0000480a +#define VMCS_GUEST_LDTR_LIMIT 0x0000480c +#define VMCS_GUEST_TR_LIMIT 0x0000480e +#define VMCS_GUEST_GDTR_LIMIT 0x00004810 +#define VMCS_GUEST_IDTR_LIMIT 0x00004812 +#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814 +#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816 +#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818 +#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481a +#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481c +#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481e +#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820 +#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822 +#define VMCS_GUEST_ACTIVITY_STATE 0x00004824 +#define VMCS_GUEST_INTERRUPTIBILITY_INFO 0x00004826 +#define VMCS_GUEST_SYSENTER_CS 0x0000482a +#define VMCS_GUEST_CR0 0x00006800 +#define VMCS_GUEST_CR3 0x00006802 +#define VMCS_GUEST_CR4 0x00006804 +#define VMCS_GUEST_ES_BASE 0x00006806 +#define VMCS_GUEST_CS_BASE 0x00006808 +#define VMCS_GUEST_SS_BASE 0x0000680a +#define VMCS_GUEST_DS_BASE 0x0000680c +#define VMCS_GUEST_FS_BASE 0x0000680e +#define VMCS_GUEST_GS_BASE 0x00006810 +#define VMCS_GUEST_LDTR_BASE 0x00006812 +#define VMCS_GUEST_TR_BASE 0x00006814 +#define VMCS_GUEST_GDTR_BASE 0x00006816 +#define VMCS_GUEST_IDTR_BASE 0x00006818 +#define VMCS_GUEST_DR7 0x0000681a +#define VMCS_GUEST_RSP 0x0000681c +#define VMCS_GUEST_RIP 0x0000681e +#define VMCS_GUEST_RFLAGS 0x00006820 +#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822 +#define VMCS_GUEST_SYSENTER_ESP 0x00006824 +#define VMCS_GUEST_SYSENTER_EIP 0x00006826 + +// VMCB (Virtual Machine Control Block) Field Offsets +// (From AMD64 Programmer's Manual Vol 2, Appendix B) + +// Control Area +#define VMCB_CTRL_INTERCEPT_VEC3 0x0c +#define VMCB_CTRL_INTERCEPT_HLT (1 << 24) // Bit 24 in VEC3 +#define 
VMCB_CTRL_INTERCEPT_VEC4 0x10 +// Bits 0-9: intercept VMRUN, VMMCALL, VMLOAD, VMSAVE, STGI, CLGI, SKINIT, RDTSCP, ICEBP, WBINVD. +#define VMCB_CTRL_INTERCEPT_VEC4_ALL (0x3ff) + +#define VMCB_CTRL_ASID 0x058 +#define VMCB_EXIT_CODE 0x070 + +// NP_ENABLE is actually 1 byte, but the 7 following bytes are reserved, so it's okay +#define VMCB_CTRL_NP_ENABLE 0x090 +#define VMCB_CTRL_NPT_ENABLE_BIT 0 + +#define VMCB_CTRL_N_CR3 0x0b0 + +// Guest State Area (starts at 0x400) +#define VMCB_GUEST_ES_SEL 0x400 +#define VMCB_GUEST_ES_ATTR 0x402 +#define VMCB_GUEST_ES_LIM 0x404 +#define VMCB_GUEST_ES_BASE 0x408 +#define VMCB_GUEST_CS_SEL 0x410 +#define VMCB_GUEST_CS_ATTR 0x412 +#define VMCB_GUEST_CS_LIM 0x414 +#define VMCB_GUEST_CS_BASE 0x418 +#define VMCB_GUEST_SS_SEL 0x420 +#define VMCB_GUEST_SS_ATTR 0x422 +#define VMCB_GUEST_SS_LIM 0x424 +#define VMCB_GUEST_SS_BASE 0x428 +#define VMCB_GUEST_DS_SEL 0x430 +#define VMCB_GUEST_DS_ATTR 0x432 +#define VMCB_GUEST_DS_LIM 0x434 +#define VMCB_GUEST_DS_BASE 0x438 +#define VMCB_GUEST_FS_SEL 0x440 +#define VMCB_GUEST_FS_ATTR 0x442 +#define VMCB_GUEST_FS_LIM 0x444 +#define VMCB_GUEST_FS_BASE 0x448 +#define VMCB_GUEST_GS_SEL 0x450 +#define VMCB_GUEST_GS_ATTR 0x452 +#define VMCB_GUEST_GS_LIM 0x454 +#define VMCB_GUEST_GS_BASE 0x458 + +#define VMCB_GUEST_IDTR_SEL 0x480 +#define VMCB_GUEST_IDTR_ATTR 0x482 +#define VMCB_GUEST_IDTR_LIM 0x484 +#define VMCB_GUEST_IDTR_BASE 0x488 +#define VMCB_GUEST_GDTR_SEL 0x460 +#define VMCB_GUEST_GDTR_ATTR 0x462 +#define VMCB_GUEST_GDTR_LIM 0x464 +#define VMCB_GUEST_GDTR_BASE 0x468 +#define VMCB_GUEST_LDTR_SEL 0x470 +#define VMCB_GUEST_LDTR_ATTR 0x472 +#define VMCB_GUEST_LDTR_LIM 0x474 +#define VMCB_GUEST_LDTR_BASE 0x478 +#define VMCB_GUEST_TR_SEL 0x490 +#define VMCB_GUEST_TR_ATTR 0x492 +#define VMCB_GUEST_TR_LIM 0x494 +#define VMCB_GUEST_TR_BASE 0x498 + +#define VMCB_GUEST_EFER 0x4d0 +#define VMCB_GUEST_CR4 0x548 +#define VMCB_GUEST_CR3 0x550 +#define VMCB_GUEST_CR0 0x558 +#define VMCB_GUEST_DR7 0x560 +#define VMCB_GUEST_DR6 0x568 +#define VMCB_GUEST_RFLAGS 0x570 +#define VMCB_GUEST_RIP 0x578 +#define VMCB_GUEST_RSP 0x5d8 +#define VMCB_GUEST_PAT 0x668 +#define VMCB_GUEST_DEBUGCTL 0x670 + +// SVM Segment Attribute Defines +#define SVM_ATTR_G (1 << 15) +#define SVM_ATTR_DB (1 << 14) +#define SVM_ATTR_L (1 << 13) +#define SVM_ATTR_P (1 << 7) +#define SVM_ATTR_S (1 << 4) +// Type bits. +#define SVM_ATTR_TYPE_A (1 << 0) +#define SVM_ATTR_TYPE_RW (1 << 1) +#define SVM_ATTR_TYPE_E (1 << 3) + +// 64-bit Code Segment: P=1, S=1, Type=11 (E/R/A), L=1, G=1 +#define SVM_ATTR_64BIT_CODE \ + (SVM_ATTR_P | SVM_ATTR_S | SVM_ATTR_TYPE_E | SVM_ATTR_TYPE_RW | \ + SVM_ATTR_TYPE_A | SVM_ATTR_L | SVM_ATTR_G) + +// 64-bit Data Segment: P=1, S=1, Type=3 (RW/A), D/B=1, G=1 +#define SVM_ATTR_64BIT_DATA \ + (SVM_ATTR_P | SVM_ATTR_S | SVM_ATTR_TYPE_RW | SVM_ATTR_TYPE_A | \ + SVM_ATTR_DB | SVM_ATTR_G) + #define X86_NEXT_INSN $0xbadc0de #define X86_PREFIX_SIZE 0xba1d #endif // x86-specific definitions. 
@@ -213,4 +530,4 @@ #endif // ARM64 SYZOS definitions -#endif // EXECUTOR_KVM_H \ No newline at end of file +#endif // EXECUTOR_KVM_H diff --git a/sys/linux/dev_kvm_amd64.txt b/sys/linux/dev_kvm_amd64.txt index ed2880063d2f..cb116574dd0d 100644 --- a/sys/linux/dev_kvm_amd64.txt +++ b/sys/linux/dev_kvm_amd64.txt @@ -94,17 +94,29 @@ syzos_api_set_irq_handler { arg_handler_type int64[0:2] } +type syzos_api_vm_id int64[0:3] + +syzos_api_nested_load_code { + vm_id syzos_api_vm_id + insns text[x86_64] +} [packed] + +# IDs here must match those in executor/common_kvm_amd64_syzos.h. syzos_api_call$x86 [ - uexit syzos_api$x86[0, intptr] - code syzos_api$x86[10, syzos_api_code$x86] - cpuid syzos_api$x86[20, syzos_api_cpuid] - wrmsr syzos_api$x86[30, syzos_api_wrmsr] - rdmsr syzos_api$x86[50, syzos_api_rdmsr] - wr_crn syzos_api$x86[70, syzos_api_wr_crn] - wr_drn syzos_api$x86[110, syzos_api_wr_drn] - in_dx syzos_api$x86[130, syzos_api_in_dx] - out_dx syzos_api$x86[170, syzos_api_out_dx] - set_irq_handler syzos_api$x86[190, syzos_api_set_irq_handler] + uexit syzos_api$x86[0, intptr] + code syzos_api$x86[10, syzos_api_code$x86] + cpuid syzos_api$x86[100, syzos_api_cpuid] + wrmsr syzos_api$x86[101, syzos_api_wrmsr] + rdmsr syzos_api$x86[102, syzos_api_rdmsr] + wr_crn syzos_api$x86[103, syzos_api_wr_crn] + wr_drn syzos_api$x86[104, syzos_api_wr_drn] + in_dx syzos_api$x86[105, syzos_api_in_dx] + out_dx syzos_api$x86[106, syzos_api_out_dx] + set_irq_handler syzos_api$x86[200, syzos_api_set_irq_handler] + enable_nested syzos_api$x86[300, const[0, intptr]] + nested_create_vm syzos_api$x86[301, syzos_api_vm_id] + nested_load_code syzos_api$x86[302, syzos_api_nested_load_code] + nested_vmlaunch syzos_api$x86[303, syzos_api_vm_id] ] [varlen] kvm_text_x86 [