diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 6aa40ee05a4ae..39f99b52c188f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7245,6 +7245,41 @@ exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, it will enter with output fields already valid; in the common case, the ``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``. Userspace need not do anything if it does not wish to support a TDVMCALL. + +:: + + /* KVM_EXIT_ARM_SEA */ + struct { + #define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 0) + __u64 flags; + __u64 esr; + __u64 gva; + __u64 gpa; + } arm_sea; + +Used on arm64 systems. When the VM capability ``KVM_CAP_ARM_SEA_TO_USER`` is +enabled, KVM exits to userspace if a guest access causes a synchronous +external abort (SEA) and the host APEI fails to handle the SEA. + +``esr`` is set to a sanitized value of ESR_EL2 from the exception taken to KVM, +consisting of the following fields: + + - ``ESR_EL2.EC`` + - ``ESR_EL2.IL`` + - ``ESR_EL2.FnV`` + - ``ESR_EL2.EA`` + - ``ESR_EL2.CM`` + - ``ESR_EL2.WNR`` + - ``ESR_EL2.FSC`` + - ``ESR_EL2.SET`` (when FEAT_RAS is implemented for the VM) + +``gva`` is set to the value of FAR_EL2 from the exception taken to KVM when +``ESR_EL2.FnV == 0``. Otherwise, the value of ``gva`` is unknown. + +``gpa`` is set to the faulting IPA from the exception taken to KVM when +the ``KVM_EXIT_ARM_SEA_FLAG_GPA_VALID`` flag is set. Otherwise, the value of +``gpa`` is unknown. + :: /* Fix the size of the union. */ @@ -8662,6 +8697,18 @@ This capability indicate to the userspace whether a PFNMAP memory region can be safely mapped as cacheable. This relies on the presence of force write back (FWB) feature support on the hardware. +7.45 KVM_CAP_ARM_SEA_TO_USER +---------------------------- + +:Architecture: arm64 +:Target: VM +:Parameters: none +:Returns: 0 on success, -EINVAL if unsupported. + +When this capability is enabled, KVM may exit to userspace for SEAs taken to +EL2 resulting from a guest access. See ``KVM_EXIT_ARM_SEA`` for more +information. + 8. Other capabilities. 
====================== diff --git a/MAINTAINERS b/MAINTAINERS index 85bd4a19a6c42..9a83ca44c7d29 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11394,6 +11394,7 @@ M: Miaohe Lin R: Naoya Horiguchi L: linux-mm@kvack.org S: Maintained +F: include/linux/memory-failure.h F: mm/hwpoison-inject.c F: mm/memory-failure.c diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 4d8c720c881f4..3ae36da9b0986 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1813,9 +1813,7 @@ CONFIG_MEMTEST=y CONFIG_NVGRACE_GPU_VFIO_PCI=m CONFIG_NVGRACE_EGM=m CONFIG_VFIO_DEVICE_CDEV=y -# CONFIG_VFIO_CONTAINER is not set CONFIG_FAULT_INJECTION=y CONFIG_IOMMUFD_DRIVER=y CONFIG_IOMMUFD=y CONFIG_IOMMUFD_TEST=y -CONFIG_IOMMUFD_VFIO_CONTAINER=y diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0ee4f6fa3a172..ca550e369b59a 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -349,6 +349,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_GUEST_HAS_SVE 9 /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */ #define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS 10 + /* Unhandled SEAs are taken to userspace */ +#define KVM_ARCH_FLAG_EXIT_SEA 11 unsigned long flags; /* VM-wide vCPU feature set */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3036df0cc2013..034b5cecaaa75 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -133,6 +133,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->lock); break; + case KVM_CAP_ARM_SEA_TO_USER: + r = 0; + set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags); + break; default: break; } @@ -322,6 +326,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_COUNTER_OFFSET: case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS: + case KVM_CAP_ARM_SEA_TO_USER: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 70eb0b5a71bc5..35a3176772c87 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1493,7 +1493,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, bool s2_force_noncacheable = false, vfio_allow_any_uc = false; unsigned long mmu_seq; phys_addr_t ipa = fault_ipa; - unsigned long mt; struct kvm *kvm = vcpu->kvm; struct vm_area_struct *vma; short vma_shift; @@ -1613,8 +1612,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vma_pagesize = min(vma_pagesize, (long)max_map_size); } - mt = FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot)); - /* * Both the canonical IPA and fault IPA must be hugepage-aligned to * ensure we find the right PFN and lay down the mapping in the right @@ -1698,7 +1695,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, writable = false; } - if (exec_fault && s2_force_noncacheable && mt != MT_NORMAL) + if (exec_fault && s2_force_noncacheable) ret = -ENOEXEC; if (ret) { @@ -1819,8 +1816,48 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) read_unlock(&vcpu->kvm->mmu_lock); } +/* + * Returns true if the SEA should be handled locally within KVM if the abort + * is caused by a kernel memory allocation (e.g. stage-2 table memory). + */ +static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr) +{ + /* + * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort + * taken from a guest EL to EL2 is due to a host-imposed access (e.g. + * stage-2 PTW). 
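+	 * KVM handles such aborts itself (by injecting an SError into the +	 * guest) rather than exiting to userspace.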
+ */ + if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) + return true; + + /* KVM owns the VNCR when the vCPU isn't in a nested context. */ + if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR)) + return true; + + /* + * Determining if an external abort during a table walk happened at + * stage-2 is only possible with S1PTW is set. Otherwise, since KVM + * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the + * PA of the stage-1 descriptor) can reach here and are reported + * with a TTW ESR value. + */ + return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW)); +} + int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvm_run *run = vcpu->run; + u64 esr = kvm_vcpu_get_esr(vcpu); + u64 esr_mask = ESR_ELx_EC_MASK | + ESR_ELx_IL | + ESR_ELx_FnV | + ESR_ELx_EA | + ESR_ELx_CM | + ESR_ELx_WNR | + ESR_ELx_FSC; + u64 ipa; + /* * Give APEI the opportunity to claim the abort before handling it * within KVM. apei_claim_sea() expects to be called with IRQs enabled. @@ -1829,7 +1866,33 @@ int kvm_handle_guest_sea(struct kvm_vcpu *vcpu) if (apei_claim_sea(NULL) == 0) return 1; - return kvm_inject_serror(vcpu); + if (host_owns_sea(vcpu, esr) || + !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags)) + return kvm_inject_serror(vcpu); + + /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */ + if (kvm_has_ras(kvm)) + esr_mask |= ESR_ELx_SET_MASK; + + /* + * Exit to userspace, and provide faulting guest virtual and physical + * addresses in case userspace wants to emulate SEA to guest by + * writing to FAR_ELx and HPFAR_ELx registers. + */ + memset(&run->arm_sea, 0, sizeof(run->arm_sea)); + run->exit_reason = KVM_EXIT_ARM_SEA; + run->arm_sea.esr = esr & esr_mask; + + if (!(esr & ESR_ELx_FnV)) + run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu); + + ipa = kvm_vcpu_get_fault_ipa(vcpu); + if (ipa != INVALID_GPA) { + run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID; + run->arm_sea.gpa = ipa; + } + + return 0; } /** diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c index 25b0ecc76c098..f6a2c7053068c 100644 --- a/drivers/vfio/pci/nvgrace-gpu/egm.c +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c @@ -8,11 +8,7 @@ #include #include #include - -#ifdef CONFIG_MEMORY_FAILURE -#include #include -#endif #define MAX_EGM_NODES 256 @@ -31,10 +27,7 @@ struct egm_region { struct cdev cdev; struct list_head gpus; DECLARE_HASHTABLE(htbl, 0x10); -#ifdef CONFIG_MEMORY_FAILURE struct pfn_address_space pfn_address_space; - bool pfn_space_registered; -#endif }; struct h_node { @@ -46,77 +39,97 @@ static dev_t dev; static struct class *class; static struct list_head egm_list; -#ifdef CONFIG_MEMORY_FAILURE -static void -nvgrace_egm_pfn_memory_failure(struct pfn_address_space *pfn_space, - unsigned long pfn) +static int pfn_memregion_offset(struct egm_region *region, + unsigned long pfn, + pgoff_t *pfn_offset_in_region) { - struct egm_region *region = - container_of(pfn_space, struct egm_region, pfn_address_space); - unsigned long mem_offset = PFN_PHYS(pfn - pfn_space->node.start); - struct h_node *ecc; + unsigned long start_pfn, num_pages; - if (mem_offset >= region->egmlength) - return; + start_pfn = PHYS_PFN(region->egmphys); + num_pages = region->egmlength >> PAGE_SHIFT; - /* - * MM has called to notify a poisoned page. Track that in the hastable. 
- */ - ecc = (struct h_node *)(vzalloc(sizeof(struct h_node))); - if (!ecc) - return; /* Silently fail on allocation error */ - ecc->mem_offset = mem_offset; - hash_add(region->htbl, &ecc->node, ecc->mem_offset); -} + if (pfn < start_pfn || pfn >= start_pfn + num_pages) + return -EFAULT; -struct pfn_address_space_ops nvgrace_egm_pas_ops = { - .failure = nvgrace_egm_pfn_memory_failure, -}; + *pfn_offset_in_region = pfn - start_pfn; -static int -nvgrace_egm_register_pfn_range(struct egm_region *region, - struct vm_area_struct *vma) + return 0; +} + +static int track_ecc_offset(struct egm_region *region, + unsigned long mem_offset) { - unsigned long nr_pages = region->egmlength >> PAGE_SHIFT; + struct h_node *cur_page, *ecc_page; + unsigned long bkt; + + hash_for_each(region->htbl, bkt, cur_page, node) { + if (cur_page->mem_offset == mem_offset) + return 0; + } + + ecc_page = (struct h_node *)(vzalloc(sizeof(struct h_node))); + if (!ecc_page) + return -ENOMEM; + + ecc_page->mem_offset = mem_offset; - region->pfn_address_space.node.start = vma->vm_pgoff; - region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; - region->pfn_address_space.ops = &nvgrace_egm_pas_ops; - region->pfn_address_space.mapping = vma->vm_file->f_mapping; + hash_add(region->htbl, &ecc_page->node, ecc_page->mem_offset); - return register_pfn_address_space(®ion->pfn_address_space); + return 0; } -static vm_fault_t nvgrace_egm_fault(struct vm_fault *vmf) +static int nvgrace_egm_pfn_to_vma_pgoff(struct vm_area_struct *vma, + unsigned long pfn, + pgoff_t *pgoff) { - unsigned long mem_offset = PFN_PHYS(vmf->pgoff - vmf->vma->vm_pgoff); - struct egm_region *region = vmf->vma->vm_file->private_data; - struct h_node *cur; + struct egm_region *region = vma->vm_file->private_data; + pgoff_t vma_offset_in_region = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + pgoff_t pfn_offset_in_region; + int ret; - /* - * Check if the page is poisoned. 
- */ - if (mem_offset < region->egmlength) { - hash_for_each_possible(region->htbl, cur, node, mem_offset) { - if (cur->mem_offset == mem_offset) - return VM_FAULT_HWPOISON; - } - } + ret = pfn_memregion_offset(region, pfn, &pfn_offset_in_region); + if (ret) + return ret; - return VM_FAULT_ERROR; + /* Ensure PFN is not before VMA's start within the region */ + if (pfn_offset_in_region < vma_offset_in_region) + return -EFAULT; + + /* Calculate offset from VMA start */ + *pgoff = vma->vm_pgoff + + (pfn_offset_in_region - vma_offset_in_region); + + /* Track and save the poisoned offset */ + return track_ecc_offset(region, *pgoff << PAGE_SHIFT); } -static const struct vm_operations_struct nvgrace_egm_mmap_ops = { - .fault = nvgrace_egm_fault, -}; +static int +nvgrace_egm_vfio_pci_register_pfn_range(struct inode *inode, + struct egm_region *region) +{ + int ret; + unsigned long pfn, nr_pages; -#endif + pfn = PHYS_PFN(region->egmphys); + nr_pages = region->egmlength >> PAGE_SHIFT; + + region->pfn_address_space.node.start = pfn; + region->pfn_address_space.node.last = pfn + nr_pages - 1; + region->pfn_address_space.mapping = inode->i_mapping; + region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_egm_pfn_to_vma_pgoff; + + ret = register_pfn_address_space(®ion->pfn_address_space); + + return ret; +} static int nvgrace_egm_open(struct inode *inode, struct file *file) { void *memaddr; struct egm_region *region = container_of(inode->i_cdev, struct egm_region, cdev); + int ret; if (atomic_inc_return(®ion->open_count) > 1) return 0; @@ -131,6 +144,12 @@ static int nvgrace_egm_open(struct inode *inode, struct file *file) memunmap(memaddr); file->private_data = region; + ret = nvgrace_egm_vfio_pci_register_pfn_range(inode, region); + if (ret && ret != -EOPNOTSUPP) { + file->private_data = NULL; + return ret; + } + return 0; } @@ -140,12 +159,8 @@ static int nvgrace_egm_release(struct inode *inode, struct file *file) struct egm_region, cdev); if (atomic_dec_and_test(®ion->open_count)) { -#ifdef CONFIG_MEMORY_FAILURE - if (region->pfn_space_registered) { - unregister_pfn_address_space(®ion->pfn_address_space); - region->pfn_space_registered = false; - } -#endif + unregister_pfn_address_space(®ion->pfn_address_space); + file->private_data = NULL; } @@ -164,19 +179,6 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma) PHYS_PFN(region->egmphys), (vma->vm_end - vma->vm_start), vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = PHYS_PFN(region->egmphys); - -#ifdef CONFIG_MEMORY_FAILURE - vma->vm_ops = &nvgrace_egm_mmap_ops; - - ret = nvgrace_egm_register_pfn_range(region, vma); - if (ret == 0) - region->pfn_space_registered = true; -#endif - return ret; } @@ -465,9 +467,6 @@ int register_egm_node(struct pci_dev *pdev) region->egmpxm = egmpxm; hash_init(region->htbl); -#ifdef CONFIG_MEMORY_FAILURE - region->pfn_space_registered = false; -#endif INIT_LIST_HEAD(®ion->gpus); atomic_set(®ion->open_count, 0); diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a03998f27475d..7a4b46d972fe1 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,19 +7,9 @@ #include #include #include -#include #include - -#ifdef CONFIG_MEMORY_FAILURE -#include +#include #include -#include -#endif - -struct h_node { - unsigned long mem_offset; - struct hlist_node node; -}; /* * The device memory usable to the workloads running in the VM is cached @@ -60,10 +50,7 @@ struct mem_region { void *memaddr; void __iomem 
*ioaddr; }; /* Base virtual address of the region */ -#ifdef CONFIG_MEMORY_FAILURE struct pfn_address_space pfn_address_space; - DECLARE_HASHTABLE(htbl, 8); -#endif }; struct nvgrace_gpu_pci_core_device { @@ -75,124 +62,109 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug; + /* GPU has just been reset */ + bool reset_done; int egm_node; }; static bool egm_enabled; -#ifdef CONFIG_MEMORY_FAILURE -static void -nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space, - unsigned long pfn) +static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) { - struct mem_region *region = container_of(pfn_space, - struct mem_region, pfn_address_space); - unsigned long mem_offset = pfn - pfn_space->node.start; - struct h_node *ecc; - - if (mem_offset >= region->memlength) - return; + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); - /* - * MM has called to notify a poisoned page. Track that in the hastable. - */ - ecc = (struct h_node *)(vzalloc(sizeof(struct h_node))); - ecc->mem_offset = mem_offset; - hash_add(region->htbl, &ecc->node, ecc->mem_offset); + nvdev->resmem.bar_val = 0; + nvdev->usemem.bar_val = 0; } -struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = { - .failure = nvgrace_gpu_vfio_pci_pfn_memory_failure, -}; +/* Choose the structure corresponding to the fake BAR with a given index. */ +static struct mem_region * +nvgrace_gpu_memregion(int index, + struct nvgrace_gpu_pci_core_device *nvdev) +{ + if (index == USEMEM_REGION_INDEX) + return &nvdev->usemem; -static int -nvgrace_gpu_vfio_pci_register_pfn_range(struct mem_region *region, - struct vm_area_struct *vma) + if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) + return &nvdev->resmem; + + return NULL; +} + +static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev, + unsigned int index, + unsigned long pfn, + pgoff_t *pfn_offset_in_region) { - unsigned long nr_pages; - int ret = 0; + struct mem_region *region; + unsigned long start_pfn, num_pages; - nr_pages = region->memlength >> PAGE_SHIFT; + region = nvgrace_gpu_memregion(index, nvdev); + if (!region) + return -EINVAL; - region->pfn_address_space.node.start = vma->vm_pgoff; - region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; - region->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops; - region->pfn_address_space.mapping = vma->vm_file->f_mapping; + start_pfn = PHYS_PFN(region->memphys); + num_pages = region->memlength >> PAGE_SHIFT; - ret = register_pfn_address_space(®ion->pfn_address_space); + if (pfn < start_pfn || pfn >= start_pfn + num_pages) + return -EFAULT; - return ret; + *pfn_offset_in_region = pfn - start_pfn; + + return 0; } -extern struct vfio_device *vfio_device_from_file(struct file *file); +static inline +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma); -static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma, + unsigned long pfn, + pgoff_t *pgoff) { - unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff; - struct vfio_device *core_vdev; struct nvgrace_gpu_pci_core_device *nvdev; - struct h_node *cur; - - if (!(vmf->vma->vm_file)) - goto error_exit; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + pgoff_t vma_offset_in_region = vma->vm_pgoff & + ((1U << 
(VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + pgoff_t pfn_offset_in_region; + int ret; - core_vdev = vfio_device_from_file(vmf->vma->vm_file); + nvdev = vma_to_nvdev(vma); + if (!nvdev) + return -ENOENT; - if (!core_vdev) - goto error_exit; - - nvdev = container_of(core_vdev, - struct nvgrace_gpu_pci_core_device, - core_device.vdev); + ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region); + if (ret) + return ret; - /* - * Check if the page is poisoned. - */ - if (mem_offset < (nvdev->resmem.memlength >> PAGE_SHIFT)) { - hash_for_each_possible(nvdev->resmem.htbl, cur, node, mem_offset) { - if (cur->mem_offset == mem_offset) - return VM_FAULT_HWPOISON; - } - } + /* Ensure PFN is not before VMA's start within the region */ + if (pfn_offset_in_region < vma_offset_in_region) + return -EFAULT; - if (mem_offset < (nvdev->usemem.memlength >> PAGE_SHIFT)) { - hash_for_each_possible(nvdev->usemem.htbl, cur, node, mem_offset) { - if (cur->mem_offset == mem_offset) - return VM_FAULT_HWPOISON; - } - } + /* Calculate offset from VMA start */ + *pgoff = vma->vm_pgoff + + (pfn_offset_in_region - vma_offset_in_region); -error_exit: - return VM_FAULT_ERROR; + return 0; } -static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { - .fault = nvgrace_gpu_vfio_pci_fault, -}; -#endif - -static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) +static int +nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev, + struct mem_region *region) { - struct nvgrace_gpu_pci_core_device *nvdev = - container_of(core_vdev, struct nvgrace_gpu_pci_core_device, - core_device.vdev); + unsigned long pfn, nr_pages; - nvdev->resmem.bar_val = 0; - nvdev->usemem.bar_val = 0; -} - -/* Choose the structure corresponding to the fake BAR with a given index. */ -static struct mem_region * -nvgrace_gpu_memregion(int index, - struct nvgrace_gpu_pci_core_device *nvdev) -{ - if (index == USEMEM_REGION_INDEX) - return &nvdev->usemem; + pfn = PHYS_PFN(region->memphys); + nr_pages = region->memlength >> PAGE_SHIFT; - if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) - return &nvdev->resmem; + region->pfn_address_space.node.start = pfn; + region->pfn_address_space.node.last = pfn + nr_pages - 1; + region->pfn_address_space.mapping = core_vdev->inode->i_mapping; + region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff; - return NULL; + return register_pfn_address_space(®ion->pfn_address_space); } static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) @@ -213,9 +185,36 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) mutex_init(&nvdev->remap_lock); } - vfio_pci_core_finish_enable(vdev); + /* + * GPU readiness is checked by reading the BAR0 registers. + * + * ioremap BAR0 to ensure that the BAR0 mapping is present before + * register reads on first fault before establishing any GPU + * memory mapping. 
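+	 * +	 * vfio_pci_core_setup_barmap() requests the BAR0 region and ioremaps +	 * it if that has not already been done, leaving the mapping in +	 * vdev->barmap[0] for the readiness polls.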
+ */ + ret = vfio_pci_core_setup_barmap(vdev, 0); + if (ret) + goto error_exit; + + if (nvdev->resmem.memlength) { + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem); + if (ret && ret != -EOPNOTSUPP) + goto error_exit; + } + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem); + if (ret && ret != -EOPNOTSUPP) + goto register_mem_failed; + + vfio_pci_core_finish_enable(vdev); return 0; + +register_mem_failed: + if (nvdev->resmem.memlength) + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); +error_exit: + vfio_pci_core_disable(vdev); + return ret; } static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) @@ -224,6 +223,11 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + if (nvdev->resmem.memlength) + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); + + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); + /* Unmap the mapping to the device memory cached region */ if (nvdev->usemem.memaddr) { memunmap(nvdev->usemem.memaddr); @@ -238,14 +242,119 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->remap_lock); -#ifdef CONFIG_MEMORY_FAILURE - if (nvdev->resmem.memlength) - unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); - unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); -#endif vfio_pci_core_close_device(core_vdev); } +static int nvgrace_gpu_wait_device_ready(void __iomem *io) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) + return 0; + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + + return -ETIME; +} + +/* + * If the GPU memory is accessed by the CPU while the GPU is not ready + * after reset, it can cause harmless corrected RAS events to be logged. + * Make sure the GPU is ready before establishing the mappings. 
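+ * + * Readiness is only rechecked on the first access after a reset, tracked + * by nvdev->reset_done; once the GPU reports ready the flag is cleared and + * later accesses skip the BAR0 polling.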
+ */ +static int +nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; + + lockdep_assert_held_read(&vdev->memory_lock); + + if (!nvdev->reset_done) + return 0; + + if (!__vfio_pci_memory_enabled(vdev)) + return -EIO; + + ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (ret) + return ret; + + nvdev->reset_done = false; + + return 0; +} + +static unsigned long addr_to_pgoff(struct vm_area_struct *vma, + unsigned long addr) +{ + u64 pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data; + struct vfio_pci_core_device *vdev = &nvdev->core_device; + unsigned int index = + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + vm_fault_t ret = VM_FAULT_FALLBACK; + struct mem_region *memregion; + unsigned long pfn, addr; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return VM_FAULT_SIGBUS; + + addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); + pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) { + if (vdev->pm_runtime_engaged || + nvgrace_gpu_check_device_ready(nvdev)) + return VM_FAULT_SIGBUS; + + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); + } + } + + dev_dbg_ratelimited(&vdev->pdev->dev, + "%s order = %d pfn 0x%lx: 0x%x\n", + __func__, order, pfn, + (unsigned int)ret); + + return ret; +} + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0); +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP + .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, +#endif +}; + +static inline +struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma) +{ + /* Check if this VMA belongs to us */ + if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops) + return NULL; + + return vma->vm_private_data; +} + static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { @@ -253,10 +362,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); struct mem_region *memregion; - unsigned long start_pfn; u64 req_len, pgoff, end; unsigned int index; - int ret = 0; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -273,17 +380,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || - check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) return -EOVERFLOW; /* - * Check that the mapping request does not go beyond available device - * memory size + * Check that the mapping request does not go beyond the exposed + * device memory size. */ if (end > memregion->memlength) return -EINVAL; + vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP); + /* * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. 
@@ -300,35 +408,10 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); } - /* - * Perform a PFN map to the memory and back the device BAR by the - * GPU memory. - * - * The available GPU memory size may not be power-of-2 aligned. The - * remainder is only backed by vfio_device_ops read/write handlers. - * - * During device reset, the GPU is safely disconnected to the CPU - * and access to the BAR will be immediately returned preventing - * machine check. - */ - ret = remap_pfn_range(vma, vma->vm_start, start_pfn, - req_len, vma->vm_page_prot); - if (ret) - return ret; - - vma->vm_pgoff = start_pfn; - -#ifdef CONFIG_MEMORY_FAILURE vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + vma->vm_private_data = nvdev; - if (index == VFIO_PCI_BAR2_REGION_INDEX) { - WARN_ON_ONCE(!nvdev->has_mig_hw_bug); - ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma); - } else - ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma); -#endif - - return ret; + return 0; } static long @@ -636,6 +719,7 @@ static ssize_t nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); struct mem_region *memregion; @@ -662,9 +746,15 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, else mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } /* * Only the device memory present on the hardware is mapped, which may @@ -689,9 +779,16 @@ nvgrace_gpu_read(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); @@ -753,6 +850,7 @@ static ssize_t nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, size_t count, loff_t *ppos, const char __user *buf) { + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; struct mem_region *memregion; @@ -782,9 +880,15 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, */ mem_count = min(count, memregion->memlength - (size_t)offset); - ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); - if (ret) - return ret; + scoped_guard(rwsem_read, &vdev->memory_lock) { + ret = nvgrace_gpu_check_device_ready(nvdev); + if (ret) + return ret; + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + } exitfn: *ppos += count; @@ -798,10 +902,17 @@ nvgrace_gpu_write(struct vfio_device *core_vdev, struct nvgrace_gpu_pci_core_device *nvdev 
= container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + struct vfio_pci_core_device *vdev = &nvdev->core_device; unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; - if (nvgrace_gpu_memregion(index, nvdev)) - return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + if (nvgrace_gpu_memregion(index, nvdev)) { + if (pm_runtime_resume_and_get(&vdev->pdev->dev)) + return -EIO; + ret = nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + pm_runtime_put(&vdev->pdev->dev); + return ret; + } if (index == VFIO_PCI_CONFIG_REGION_INDEX) return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); @@ -1026,11 +1137,10 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) * Ensure that the BAR0 region is enabled before accessing the * registers. */ -static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) { - unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); void __iomem *io; - int ret = -ETIME; + int ret; ret = pci_enable_device(pdev); if (ret) @@ -1046,16 +1156,8 @@ static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) goto iomap_exit; } - do { - if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && - (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { - ret = 0; - goto reg_check_exit; - } - msleep(POLL_QUANTUM_MS); - } while (!time_after(jiffies, timeout)); + ret = nvgrace_gpu_wait_device_ready(io); -reg_check_exit: pci_iounmap(pdev, io); iomap_exit: pci_release_selected_regions(pdev, 1 << 0); @@ -1073,7 +1175,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 egmpxm; int ret; - ret = nvgrace_gpu_wait_device_ready(pdev); + ret = nvgrace_gpu_probe_check_device_ready(pdev); if (ret) return ret; @@ -1119,14 +1221,6 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, if (ret) goto out_egm_unreg; -#ifdef CONFIG_MEMORY_FAILURE - /* - * Initialize the hashtable tracking the poisoned pages. - */ - hash_init(nvdev->resmem.htbl); - hash_init(nvdev->usemem.htbl); -#endif - return ret; out_egm_unreg: @@ -1140,25 +1234,6 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, static void nvgrace_gpu_remove(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); - struct nvgrace_gpu_pci_core_device *nvdev = - container_of(core_device, struct nvgrace_gpu_pci_core_device, - core_device); - -#ifdef CONFIG_MEMORY_FAILURE - struct h_node *cur; - unsigned long bkt; - struct hlist_node *tmp_node; - - hash_for_each_safe(nvdev->resmem.htbl, bkt, tmp_node, cur, node) { - hash_del(&cur->node); - vfree(cur); - } - - hash_for_each_safe(nvdev->usemem.htbl, bkt, tmp_node, cur, node) { - hash_del(&cur->node); - vfree(cur); - } -#endif if (egm_enabled) unregister_egm_node(pdev); @@ -1183,12 +1258,38 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); +/* + * The GPU reset is required to be serialized against the *first* mapping + * faults and read/writes accesses to prevent potential RAS events logging. + * + * First fault or access after a reset needs to poll device readiness, + * flag that a reset has occurred. The readiness test is done by holding + * the memory_lock read lock and we expect all vfio-pci initiated resets to + * hold the memory_lock write lock to avoid races. However, .reset_done + * extends beyond the scope of vfio-pci initiated resets therefore we + * cannot assert this behavior and use lockdep_assert_held_write. 
+ */ +static void nvgrace_gpu_vfio_pci_reset_done(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + + nvdev->reset_done = true; +} + +static const struct pci_error_handlers nvgrace_gpu_vfio_pci_err_handlers = { + .reset_done = nvgrace_gpu_vfio_pci_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + static struct pci_driver nvgrace_gpu_vfio_pci_driver = { .name = KBUILD_MODNAME, .id_table = nvgrace_gpu_vfio_pci_table, .probe = nvgrace_gpu_probe, .remove = nvgrace_gpu_remove, - .err_handler = &vfio_pci_core_err_handlers, + .err_handler = &nvgrace_gpu_vfio_pci_err_handlers, .driver_managed_dma = true, }; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 333fd149c21a5..4abd4f2719958 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -95,7 +95,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ - [PCI_EXT_CAP_ID_PASID] = PCI_EXT_CAP_PASID_SIZEOF, /* not yet */ + [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ [PCI_EXT_CAP_ID_DVSEC] = 0xFF, }; @@ -416,6 +416,7 @@ bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev) return pdev->current_state < PCI_D3hot && (pdev->no_command_memory || (cmd & PCI_COMMAND_MEMORY)); } +EXPORT_SYMBOL_GPL(__vfio_pci_memory_enabled); /* * Restore the *real* BARs after we detect a FLR or backdoor reset. diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 7dcf5439dedc9..54c2133501718 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1640,49 +1640,49 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma) return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff; } -static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, - unsigned int order) +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, + unsigned long pfn, + unsigned int order) { - struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_core_device *vdev = vma->vm_private_data; - unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); - unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - unsigned long pfn = vma_to_pfn(vma) + pgoff; - vm_fault_t ret = VM_FAULT_SIGBUS; - - if (order && (addr < vma->vm_start || - addr + (PAGE_SIZE << order) > vma->vm_end || - pfn & ((1 << order) - 1))) { - ret = VM_FAULT_FALLBACK; - goto out; - } - - down_read(&vdev->memory_lock); + lockdep_assert_held_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) - goto out_unlock; + return VM_FAULT_SIGBUS; switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn); - break; + return vmf_insert_pfn(vmf->vma, vmf->address, pfn); #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, pfn, false); - break; + return vmf_insert_pfn_pmd(vmf, pfn, false); #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, pfn, false); + return vmf_insert_pfn_pud(vmf, pfn, false); break; #endif default: - ret = VM_FAULT_FALLBACK; + return VM_FAULT_FALLBACK; + } +} +EXPORT_SYMBOL_GPL(vfio_pci_vmf_insert_pfn); + +static vm_fault_t 
vfio_pci_mmap_huge_fault(struct vm_fault *vmf, + unsigned int order) +{ + struct vm_area_struct *vma = vmf->vma; + struct vfio_pci_core_device *vdev = vma->vm_private_data; + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; + vm_fault_t ret = VM_FAULT_FALLBACK; + + if (is_aligned_for_order(vma, addr, pfn, order)) { + scoped_guard(rwsem_read, &vdev->memory_lock) + ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); } -out_unlock: - up_read(&vdev->memory_lock); -out: dev_dbg_ratelimited(&vdev->pdev->dev, "%s(,order = %d) BAR %ld page offset 0x%lx: 0x%x\n", __func__, order, @@ -1749,18 +1749,9 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. */ - if (!vdev->barmap[index]) { - ret = pci_request_selected_regions(pdev, - 1 << index, "vfio-pci"); - if (ret) - return ret; - - vdev->barmap[index] = pci_iomap(pdev, index, 0); - if (!vdev->barmap[index]) { - pci_release_selected_regions(pdev, 1 << index); - return -ENOMEM; - } - } + ret = vfio_pci_core_setup_barmap(vdev, index); + if (ret) + return ret; vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index a9972eacb2936..7b1776bae8026 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -60,7 +60,6 @@ void vfio_config_free(struct vfio_pci_core_device *vdev); int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state); -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f389dcfb230ad..715368076a1fe 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1366,7 +1366,7 @@ const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -struct vfio_device *vfio_device_from_file(struct file *file) +static struct vfio_device *vfio_device_from_file(struct file *file) { struct vfio_device_file *df = file->private_data; @@ -1374,7 +1374,6 @@ struct vfio_device *vfio_device_from_file(struct file *file) return NULL; return df->device; } -EXPORT_SYMBOL_GPL(vfio_device_from_file); /** * vfio_file_is_valid - True if the file is valid vfio file diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h index 9a579960972aa..d333dcdbeae70 100644 --- a/include/linux/memory-failure.h +++ b/include/linux/memory-failure.h @@ -4,19 +4,25 @@ #include -struct pfn_address_space; - -struct pfn_address_space_ops { - void (*failure)(struct pfn_address_space *pfn_space, unsigned long pfn); -}; - struct pfn_address_space { struct interval_tree_node node; - const struct pfn_address_space_ops *ops; struct address_space *mapping; + int (*pfn_to_vma_pgoff)(struct vm_area_struct *vma, + unsigned long pfn, pgoff_t *pgoff); }; +#ifdef CONFIG_MEMORY_FAILURE int register_pfn_address_space(struct pfn_address_space *pfn_space); void unregister_pfn_address_space(struct pfn_address_space *pfn_space); +#else +static inline int register_pfn_address_space(struct pfn_address_space 
*pfn_space) +{ + return -EOPNOTSUPP; +} + +static inline void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ +} +#endif /* CONFIG_MEMORY_FAILURE */ #endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2a..6db13f66b5e4b 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -119,6 +119,9 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, unsigned long pfn, + unsigned int order); int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); @@ -134,6 +137,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite); +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t reg_start, size_t reg_cnt, loff_t *buf_offset, @@ -161,4 +165,14 @@ VFIO_IOREAD_DECLARATION(32) VFIO_IOREAD_DECLARATION(64) #endif +static inline bool is_aligned_for_order(struct vm_area_struct *vma, + unsigned long addr, + unsigned long pfn, + unsigned int order) +{ + return !(order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + !IS_ALIGNED(pfn, 1 << order))); +} + #endif /* VFIO_PCI_CORE_H */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 989c869685765..fecfeb7c8be7f 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -375,7 +375,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ - EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f0f0d49d25443..80b60ae15ae8b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -179,6 +179,7 @@ struct kvm_xen_exit { #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 +#define KVM_EXIT_ARM_SEA 41 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -473,6 +474,14 @@ struct kvm_run { } setup_event_notify; }; } tdx; + /* KVM_EXIT_ARM_SEA */ + struct { +#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 0) + __u64 flags; + __u64 esr; + __u64 gva; + __u64 gpa; + } arm_sea; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -962,6 +971,8 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 +#define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_ARM_SEA_TO_USER 245 struct kvm_irq_routing_irqchip { __u32 irqchip; diff --git a/mm/Kconfig b/mm/Kconfig index 0b07219390b98..4402a404691ee 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -777,8 +777,8 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select MEMORY_ISOLATION - select INTERVAL_TREE select RAS + select INTERVAL_TREE help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running diff --git a/mm/gup.c b/mm/gup.c index 2ad7a852ea57f..0bc4d140fc07f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1619,7 +1619,7 @@ int fixup_user_fault(struct mm_struct *mm, } if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, FOLL_HWPOISON); + int err = vm_fault_to_errno(ret, 0); if (err) return err; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ad79e76980184..8f95ca4125f9b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -446,22 +446,13 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ -#define FSDAX_INVALID_PGOFF ULONG_MAX - /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. - * - * Notice: @pgoff is used when: - * a. @p is a fsdax page and a filesystem with a memory failure handler - * has claimed the memory_failure event. - * b. pgoff is not backed by struct page. - * In all other cases, page->index and page->mapping are sufficient - * for mapping the page back to its corresponding user virtual address. */ static void __add_to_kill(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long ksm_addr, pgoff_t pgoff) + unsigned long addr) { struct to_kill *tk; @@ -471,20 +462,11 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p, return; } - /* Check for pgoff not backed by struct page */ - if (!(pfn_valid(pgoff)) && (vma->vm_flags & VM_PFNMAP)) { - tk->addr = vma_address(vma, pgoff, 1); - tk->size_shift = PAGE_SHIFT; - } else { - tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(page_folio(p), p, vma); - if (is_zone_device_page(p)) { - if (pgoff != FSDAX_INVALID_PGOFF) - tk->addr = vma_address(vma, pgoff, 1); - tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - } else { - tk->size_shift = folio_shift(page_folio(p)); - } - } + tk->addr = addr; + if (is_zone_device_page(p)) + tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); + else + tk->size_shift = folio_shift(page_folio(p)); /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -497,8 +479,8 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p, * has a mapping for the page. */ if (tk->addr == -EFAULT) { - pr_info("Unable to find address %lx in %s\n", - pfn_valid(pgoff) ? 
page_to_pfn(p) : pgoff, tsk->comm); + pr_info("Unable to find user space address %lx in %s\n", + page_to_pfn(p), tsk->comm); } else if (tk->size_shift == 0) { kfree(tk); return; @@ -515,7 +497,7 @@ static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p, { if (addr == -EFAULT) return; - __add_to_kill(tsk, p, vma, to_kill, addr, FSDAX_INVALID_PGOFF); + __add_to_kill(tsk, p, vma, to_kill, addr); } #ifdef CONFIG_KSM @@ -537,7 +519,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, unsigned long addr) { if (!task_in_to_kill_list(to_kill, tsk)) - __add_to_kill(tsk, p, vma, to_kill, addr, FSDAX_INVALID_PGOFF); + __add_to_kill(tsk, p, vma, to_kill, addr); } #endif /* @@ -704,21 +686,21 @@ static void collect_procs_file(const struct folio *folio, i_mmap_unlock_read(mapping); } -static void add_to_kill_pgoff(struct task_struct *tsk, const struct page *p, +#ifdef CONFIG_FS_DAX +static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { unsigned long addr = vma_address(vma, pgoff, 1); - __add_to_kill(tsk, p, vma, to_kill, addr, pgoff); + __add_to_kill(tsk, p, vma, to_kill, addr); } /* - * Collect processes when the error hit a fsdax page or a PFN not backed by - * struct page. + * Collect processes when the error hit a fsdax page. */ -static void collect_procs_pgoff(const struct page *page, - struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill, bool pre_remove) +static void collect_procs_fsdax(const struct page *page, + struct address_space *mapping, pgoff_t pgoff, + struct list_head *to_kill, bool pre_remove) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -739,12 +721,13 @@ static void collect_procs_pgoff(const struct page *page, continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_mm == t->mm) - add_to_kill_pgoff(t, page, vma, to_kill, pgoff); + add_to_kill_fsdax(t, page, vma, to_kill, pgoff); } } rcu_read_unlock(); i_mmap_unlock_read(mapping); } +#endif /* CONFIG_FS_DAX */ /* * Collect the processes who have the corrupted page mapped to kill. @@ -979,7 +962,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned page", - [MF_MSG_PFN_MAP] = "non struct page pfn", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1372,10 +1355,9 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - if (type != MF_MSG_ALREADY_POISONED) { + if (type != MF_MSG_ALREADY_POISONED && type != MF_MSG_PFN_MAP) { num_poisoned_pages_inc(pfn); - if (type != MF_MSG_PFN_MAP) - update_per_node_mf_stats(pfn, result); + update_per_node_mf_stats(pfn, result); } pr_err("%#lx: recovery action for %s: %s\n", @@ -1889,7 +1871,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, * The pre_remove case is revoking access, the memory is still * good and could theoretically be put back into service. 
*/ - collect_procs_pgoff(page, mapping, index, &to_kill, pre_remove); + collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: @@ -2242,16 +2224,17 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, int register_pfn_address_space(struct pfn_address_space *pfn_space) { - if (!pfn_space) + guard(mutex)(&pfn_space_lock); + + if (!pfn_space->pfn_to_vma_pgoff) return -EINVAL; - if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT, - (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, "")) + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) return -EBUSY; - mutex_lock(&pfn_space_lock); interval_tree_insert(&pfn_space->node, &pfn_space_itree); - mutex_unlock(&pfn_space_lock); return 0; } @@ -2259,51 +2242,108 @@ EXPORT_SYMBOL_GPL(register_pfn_address_space); void unregister_pfn_address_space(struct pfn_address_space *pfn_space) { - if (!pfn_space) - return; + guard(mutex)(&pfn_space_lock); - mutex_lock(&pfn_space_lock); - interval_tree_remove(&pfn_space->node, &pfn_space_itree); - mutex_unlock(&pfn_space_lock); - release_mem_region(pfn_space->node.start << PAGE_SHIFT, - (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT); + if (interval_tree_iter_first(&pfn_space_itree, + pfn_space->node.start, + pfn_space->node.last)) + interval_tree_remove(&pfn_space->node, &pfn_space_itree); } EXPORT_SYMBOL_GPL(unregister_pfn_address_space); +static void add_to_kill_pgoff(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + pgoff_t pgoff) +{ + struct to_kill *tk; + + tk = kmalloc(sizeof(*tk), GFP_ATOMIC); + if (!tk) { + pr_info("Unable to kill proc %d\n", tsk->pid); + return; + } + + /* Check for pgoff not backed by struct page */ + tk->addr = vma_address(vma, pgoff, 1); + tk->size_shift = PAGE_SHIFT; + + if (tk->addr == -EFAULT) + pr_info("Unable to find address %lx in %s\n", + pgoff, tsk->comm); + + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Collect processes when the error hit a PFN not backed by struct page. + */ +static void collect_procs_pfn(struct pfn_address_space *pfn_space, + unsigned long pfn, struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct address_space *mapping = pfn_space->mapping; + + i_mmap_lock_read(mapping); + rcu_read_lock(); + for_each_process(tsk) { + struct task_struct *t = tsk; + + t = task_early_kill(tsk, true); + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) { + pgoff_t pgoff; + + if (vma->vm_mm == t->mm && + !pfn_space->pfn_to_vma_pgoff(vma, pfn, &pgoff)) + add_to_kill_pgoff(t, vma, to_kill, pgoff); + } + } + rcu_read_unlock(); + i_mmap_unlock_read(mapping); +} + +/** + * memory_failure_pfn - Handle memory failure on a page not backed by + * struct page. + * @pfn: Page Number of the corrupted page + * @flags: fine tune action taken + * + * Return: + * 0 - success, + * -EBUSY - Page PFN does not belong to any address space mapping. + */ static int memory_failure_pfn(unsigned long pfn, int flags) { struct interval_tree_node *node; - int res = MF_FAILED; LIST_HEAD(tokill); - mutex_lock(&pfn_space_lock); - /* - * Modules registers with MM the address space mapping to the device memory they - * manage. Iterate to identify exactly which address space has mapped to this - * failing PFN. 
- */ - for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; - node = interval_tree_iter_next(node, pfn, pfn)) { - struct pfn_address_space *pfn_space = - container_of(node, struct pfn_address_space, node); + scoped_guard(mutex, &pfn_space_lock) { + bool mf_handled = false; + /* - * Modules managing the device memory need to be conveyed about the - * memory failure so that the poisoned PFN can be tracked. + * Modules registers with MM the address space mapping to + * the device memory they manage. Iterate to identify + * exactly which address space has mapped to this failing + * PFN. */ - if (pfn_space->ops) - pfn_space->ops->failure(pfn_space, pfn); + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); - collect_procs_pgoff(NULL, pfn_space->mapping, pfn, &tokill, false); + collect_procs_pfn(pfn_space, pfn, &tokill); - unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT, - PAGE_SIZE, 0); + mf_handled = true; + } - res = MF_RECOVERED; + if (!mf_handled) + return action_result(pfn, MF_MSG_PFN_MAP, MF_IGNORED); } - mutex_unlock(&pfn_space_lock); - - if (res == MF_FAILED) - return action_result(pfn, MF_MSG_PFN_MAP, res); /* * Unlike System-RAM there is no possibility to swap in a different @@ -2312,6 +2352,7 @@ static int memory_failure_pfn(unsigned long pfn, int flags) * MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + kill_procs(&tokill, true, pfn, flags); return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); @@ -2360,17 +2401,20 @@ int memory_failure(unsigned long pfn, int flags) if (!(flags & MF_SW_SIMULATED)) hw_memory_failure = true; - if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { - res = memory_failure_pfn(pfn, flags); - goto unlock_mutex; - } - p = pfn_to_online_page(pfn); if (!p) { res = arch_memory_failure(pfn, flags); if (res == 0) goto unlock_mutex; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + /* + * The PFN is not backed by struct page. 
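+			 * Hand it to memory_failure_pfn(), which looks up any +			 * driver that registered this PFN range and kills the +			 * processes that have the poisoned PFN mapped.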
+ */ + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); put_ref_page(pfn, flags); diff --git a/tools/arch/arm64/include/asm/esr.h b/tools/arch/arm64/include/asm/esr.h index bd592ca815711..0fa17b3af1f78 100644 --- a/tools/arch/arm64/include/asm/esr.h +++ b/tools/arch/arm64/include/asm/esr.h @@ -141,6 +141,8 @@ #define ESR_ELx_SF (UL(1) << ESR_ELx_SF_SHIFT) #define ESR_ELx_AR_SHIFT (14) #define ESR_ELx_AR (UL(1) << ESR_ELx_AR_SHIFT) +#define ESR_ELx_VNCR_SHIFT (13) +#define ESR_ELx_VNCR (UL(1) << ESR_ELx_VNCR_SHIFT) #define ESR_ELx_CM_SHIFT (8) #define ESR_ELx_CM (UL(1) << ESR_ELx_CM_SHIFT) diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 41b40c676d7f3..f1d6617f99c7b 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -161,6 +161,7 @@ TEST_GEN_PROGS_arm64 += arm64/hypercalls TEST_GEN_PROGS_arm64 += arm64/external_aborts TEST_GEN_PROGS_arm64 += arm64/page_fault_test TEST_GEN_PROGS_arm64 += arm64/psci_test +TEST_GEN_PROGS_arm64 += arm64/sea_to_user TEST_GEN_PROGS_arm64 += arm64/set_id_regs TEST_GEN_PROGS_arm64 += arm64/smccc_filter TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config diff --git a/tools/testing/selftests/kvm/arm64/sea_to_user.c b/tools/testing/selftests/kvm/arm64/sea_to_user.c new file mode 100644 index 0000000000000..573dd790aeb8e --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/sea_to_user.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test KVM returns to userspace with KVM_EXIT_ARM_SEA if host APEI fails + * to handle SEA and userspace has opt-ed in KVM_CAP_ARM_SEA_TO_USER. + * + * After reaching userspace with expected arm_sea info, also test userspace + * injecting a synchronous external data abort into the guest. + * + * This test utilizes EINJ to generate a REAL synchronous external data + * abort by consuming a recoverable uncorrectable memory error. Therefore + * the device under test must support EINJ in both firmware and host kernel, + * including the notrigger feature. Otherwise the test will be skipped. + * The under-test platform's APEI should be unable to claim SEA. Otherwise + * the test will also be skipped. + */ + +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "guest_modes.h" + +#define PAGE_PRESENT (1ULL << 63) +#define PAGE_PHYSICAL 0x007fffffffffffffULL +#define PAGE_ADDR_MASK (~(0xfffULL)) + +/* Group ISV and ISS[23:14]. */ +#define ESR_ELx_INST_SYNDROME ((ESR_ELx_ISV) | (ESR_ELx_SAS) | \ + (ESR_ELx_SSE) | (ESR_ELx_SRT_MASK) | \ + (ESR_ELx_SF) | (ESR_ELx_AR)) + +#define EINJ_ETYPE "/sys/kernel/debug/apei/einj/error_type" +#define EINJ_ADDR "/sys/kernel/debug/apei/einj/param1" +#define EINJ_MASK "/sys/kernel/debug/apei/einj/param2" +#define EINJ_FLAGS "/sys/kernel/debug/apei/einj/flags" +#define EINJ_NOTRIGGER "/sys/kernel/debug/apei/einj/notrigger" +#define EINJ_DOIT "/sys/kernel/debug/apei/einj/error_inject" +/* Memory Uncorrectable non-fatal. */ +#define ERROR_TYPE_MEMORY_UER 0x10 +/* Memory address and mask valid (param1 and param2). */ +#define MASK_MEMORY_UER 0b10 + +/* Guest virtual address region = [2G, 3G). */ +#define START_GVA 0x80000000UL +#define VM_MEM_SIZE 0x40000000UL +/* Note: EINJ_OFFSET must < VM_MEM_SIZE. 
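The injected GVA (START_GVA + EINJ_OFFSET) must land inside the guest memory region.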
+
+static uint64_t translate_to_host_paddr(unsigned long vaddr)
+{
+	uint64_t pinfo;
+	int64_t offset = vaddr / getpagesize() * sizeof(pinfo);
+	int fd;
+	uint64_t page_addr;
+	uint64_t paddr;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0)
+		ksft_exit_fail_perror("Failed to open /proc/self/pagemap");
+
+	if (pread(fd, &pinfo, sizeof(pinfo), offset) != sizeof(pinfo)) {
+		close(fd);
+		ksft_exit_fail_perror("Failed to read /proc/self/pagemap");
+	}
+
+	close(fd);
+
+	if ((pinfo & PAGE_PRESENT) == 0)
+		ksft_exit_fail_perror("Page not present");
+
+	page_addr = (pinfo & PAGE_PHYSICAL) << MIN_PAGE_SHIFT;
+	paddr = page_addr + (vaddr & (getpagesize() - 1));
+
+	return paddr;
+}
+
+static void write_einj_entry(const char *einj_path, uint64_t val)
+{
+	char cmd[256] = {0};
+	FILE *cmdfile = NULL;
+
+	sprintf(cmd, "echo %#lx > %s", val, einj_path);
+	cmdfile = popen(cmd, "r");
+
+	if (pclose(cmdfile) == 0)
+		ksft_print_msg("echo %#lx > %s - done\n", val, einj_path);
+	else
+		ksft_exit_fail_perror("Failed to write EINJ entry");
+}
+
+static void inject_uer(uint64_t paddr)
+{
+	if (access("/sys/firmware/acpi/tables/EINJ", R_OK) == -1)
+		ksft_test_result_skip("EINJ table not available in firmware");
+
+	if (access(EINJ_ETYPE, R_OK | W_OK) == -1)
+		ksft_test_result_skip("EINJ module probably not loaded?");
+
+	write_einj_entry(EINJ_ETYPE, ERROR_TYPE_MEMORY_UER);
+	write_einj_entry(EINJ_FLAGS, MASK_MEMORY_UER);
+	write_einj_entry(EINJ_ADDR, paddr);
+	write_einj_entry(EINJ_MASK, ~0x0UL);
+	write_einj_entry(EINJ_NOTRIGGER, 1);
+	write_einj_entry(EINJ_DOIT, 1);
+}
+
+/*
+ * When the host APEI successfully claims the SEA caused by guest_code, the
+ * kernel sends a SIGBUS signal with BUS_MCEERR_AR to the test thread.
+ *
+ * We set up this SIGBUS handler to skip the test for that case.
+ */
+static void sigbus_signal_handler(int sig, siginfo_t *si, void *v)
+{
+	ksft_print_msg("SIGBUS (%d) received, dumping siginfo...\n", sig);
+	ksft_print_msg("si_signo=%d, si_errno=%d, si_code=%d, si_addr=%p\n",
+		       si->si_signo, si->si_errno, si->si_code, si->si_addr);
+	if (si->si_code == BUS_MCEERR_AR)
+		ksft_test_result_skip("SEA is claimed by host APEI\n");
+	else
+		ksft_test_result_fail("Exit with signal unhandled\n");
+
+	exit(0);
+}
+
+static void setup_sigbus_handler(void)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_sigaction = sigbus_signal_handler;
+	act.sa_flags = SA_SIGINFO;
+	TEST_ASSERT(sigaction(SIGBUS, &act, NULL) == 0,
+		    "Failed to setup SIGBUS handler");
+}
+
+static void guest_code(void)
+{
+	uint64_t guest_data;
+
+	/* Consuming the error will cause an SEA. */
+	guest_data = *(uint64_t *)EINJ_GVA;
+
+	GUEST_FAIL("Poison not protected by SEA: gva=%#lx, guest_data=%#lx\n",
+		   EINJ_GVA, guest_data);
+}
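The SIGBUS handler above deliberately skips the test when the host claims the error. For reference, a real VMM would more likely record the poisoned range from the ``BUS_MCEERR_*`` siginfo and forward it to the guest afterwards; the sketch below is illustrative only (the globals and the follow-up action are assumptions, not part of this selftest)::

	#include <signal.h>
	#include <stdint.h>

	static volatile uint64_t poisoned_hva;
	static volatile uint64_t poisoned_size;

	/* Record the poisoned range; a VMM would act on it after sigreturn. */
	static void vmm_sigbus_handler(int sig, siginfo_t *si, void *ctx)
	{
		if (si->si_code != BUS_MCEERR_AR && si->si_code != BUS_MCEERR_AO)
			return;

		poisoned_hva  = (uint64_t)si->si_addr;
		poisoned_size = 1ULL << si->si_addr_lsb;
	}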
+
+static void expect_sea_handler(struct ex_regs *regs)
+{
+	u64 esr = read_sysreg(esr_el1);
+	u64 far = read_sysreg(far_el1);
+	bool expect_far_invalid = far_invalid;
+
+	GUEST_PRINTF("Handling Guest SEA\n");
+	GUEST_PRINTF("ESR_EL1=%#lx, FAR_EL1=%#lx\n", esr, far);
+
+	GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR);
+	GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT);
+
+	if (expect_far_invalid) {
+		GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, ESR_ELx_FnV);
+		GUEST_PRINTF("Guest observed garbage value in FAR\n");
+	} else {
+		GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, 0);
+		GUEST_ASSERT_EQ(far, EINJ_GVA);
+	}
+
+	GUEST_DONE();
+}
+
+static void vcpu_inject_sea(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_events events = {};
+
+	events.exception.ext_dabt_pending = true;
+	vcpu_events_set(vcpu, &events);
+}
+
+static void run_vm(struct kvm_vm *vm, struct kvm_vcpu *vcpu)
+{
+	struct ucall uc;
+	bool guest_done = false;
+	struct kvm_run *run = vcpu->run;
+	u64 esr;
+
+	/* Resume the vCPU after error injection to consume the error. */
+	vcpu_run(vcpu);
+
+	ksft_print_msg("Dump kvm_run info about KVM_EXIT_%s\n",
+		       exit_reason_str(run->exit_reason));
+	ksft_print_msg("kvm_run.arm_sea: esr=%#llx, flags=%#llx\n",
+		       run->arm_sea.esr, run->arm_sea.flags);
+	ksft_print_msg("kvm_run.arm_sea: gva=%#llx, gpa=%#llx\n",
+		       run->arm_sea.gva, run->arm_sea.gpa);
+
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_SEA);
+
+	esr = run->arm_sea.esr;
+	TEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_LOW);
+	TEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT);
+	TEST_ASSERT_EQ(ESR_ELx_ISS2(esr), 0);
+	TEST_ASSERT_EQ((esr & ESR_ELx_INST_SYNDROME), 0);
+	TEST_ASSERT_EQ(esr & ESR_ELx_VNCR, 0);
+
+	if (!(esr & ESR_ELx_FnV)) {
+		ksft_print_msg("Expect gva to match since the FnV bit is 0\n");
+		TEST_ASSERT_EQ(run->arm_sea.gva, EINJ_GVA);
+	}
+
+	if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) {
+		ksft_print_msg("Expect gpa to match given KVM_EXIT_ARM_SEA_FLAG_GPA_VALID is set\n");
+		TEST_ASSERT_EQ(run->arm_sea.gpa, einj_gpa & PAGE_ADDR_MASK);
+	}
+
+	far_invalid = esr & ESR_ELx_FnV;
+
+	/* Inject an SEA into the guest and expect it to be handled by the guest's SEA handler. */
+	vcpu_inject_sea(vcpu);
+
+	/* Expect the guest to reach GUEST_DONE gracefully. */
+	do {
+		vcpu_run(vcpu);
+		switch (get_ucall(vcpu, &uc)) {
+		case UCALL_PRINTF:
+			ksft_print_msg("From guest: %s", uc.buffer);
+			break;
+		case UCALL_DONE:
+			ksft_print_msg("Guest done gracefully!\n");
+			guest_done = true;
+			break;
+		case UCALL_ABORT:
+			ksft_print_msg("Guest aborted!\n");
+			guest_done = true;
+			REPORT_GUEST_ASSERT(uc);
+			break;
+		default:
+			TEST_FAIL("Unexpected ucall: %lu\n", uc.cmd);
+		}
+	} while (!guest_done);
+}
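``run_vm()`` shows the selftest-framework version of the userspace flow. Sketched as a plain VMM exit handler using raw ioctls, the same sequence is roughly the following (this assumes uapi headers that already carry the ``KVM_EXIT_ARM_SEA`` layout; error handling and the VMM's own bookkeeping are trimmed)::

	#include <linux/kvm.h>
	#include <stdio.h>
	#include <sys/ioctl.h>

	/* Sketch only: vcpu_fd and run come from the VMM's own setup code. */
	static void handle_arm_sea_exit(int vcpu_fd, struct kvm_run *run)
	{
		struct kvm_vcpu_events events = {};

		if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) {
			/* The faulting GPA is known; a VMM could unmap or
			 * offline the backing page here. */
		}

		/* Forward the error to the guest as a synchronous external
		 * data abort, mirroring vcpu_inject_sea() above. */
		events.exception.ext_dabt_pending = 1;
		if (ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events))
			perror("KVM_SET_VCPU_EVENTS");
	}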
+
+static struct kvm_vm *vm_create_with_sea_handler(struct kvm_vcpu **vcpu)
+{
+	size_t backing_page_size;
+	size_t guest_page_size;
+	size_t alignment;
+	uint64_t num_guest_pages;
+	vm_paddr_t start_gpa;
+	enum vm_mem_backing_src_type src_type = VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB;
+	struct kvm_vm *vm;
+
+	backing_page_size = get_backing_src_pagesz(src_type);
+	guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size;
+	alignment = max(backing_page_size, guest_page_size);
+	num_guest_pages = VM_MEM_SIZE / guest_page_size;
+
+	vm = __vm_create_with_one_vcpu(vcpu, num_guest_pages, guest_code);
+	vm_init_descriptor_tables(vm);
+	vcpu_init_descriptor_tables(*vcpu);
+
+	vm_install_sync_handler(vm,
+		/*vector=*/VECTOR_SYNC_CURRENT,
+		/*ec=*/ESR_ELx_EC_DABT_CUR,
+		/*handler=*/expect_sea_handler);
+
+	start_gpa = (vm->max_gfn - num_guest_pages) * guest_page_size;
+	start_gpa = align_down(start_gpa, alignment);
+
+	vm_userspace_mem_region_add(
+		/*vm=*/vm,
+		/*src_type=*/src_type,
+		/*guest_paddr=*/start_gpa,
+		/*slot=*/1,
+		/*npages=*/num_guest_pages,
+		/*flags=*/0);
+
+	virt_map(vm, START_GVA, start_gpa, num_guest_pages);
+
+	ksft_print_msg("Mapped %#lx pages: gva=%#lx to gpa=%#lx\n",
+		       num_guest_pages, START_GVA, start_gpa);
+
+	return vm;
+}
+
+static void vm_inject_memory_uer(struct kvm_vm *vm)
+{
+	uint64_t guest_data;
+
+	einj_gpa = addr_gva2gpa(vm, EINJ_GVA);
+	einj_hva = addr_gva2hva(vm, EINJ_GVA);
+
+	/* Populate certain data before injecting UER. */
+	*(uint64_t *)einj_hva = 0xBAADCAFE;
+	guest_data = *(uint64_t *)einj_hva;
+	ksft_print_msg("Before EINJect: data=%#lx\n", guest_data);
+
+	einj_hpa = translate_to_host_paddr((unsigned long)einj_hva);
+
+	ksft_print_msg("EINJ_GVA=%#lx, einj_gpa=%#lx, einj_hva=%p, einj_hpa=%#lx\n",
+		       EINJ_GVA, einj_gpa, einj_hva, einj_hpa);
+
+	inject_uer(einj_hpa);
+	ksft_print_msg("Memory UER EINJected\n");
+}
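``inject_uer()`` assumes the platform advertises the "Memory Uncorrectable non-fatal" error type. A possible pre-check reads the EINJ ``available_error_type`` file described in ``Documentation/firmware-guide/acpi/apei/einj.rst``; treat the parser below as a sketch rather than part of the selftest::

	#include <stdio.h>
	#include <string.h>

	/* Return 1 if EINJ advertises "Memory Uncorrectable non-fatal" (0x10). */
	static int einj_supports_memory_uer(void)
	{
		char line[256];
		int found = 0;
		FILE *f = fopen("/sys/kernel/debug/apei/einj/available_error_type", "r");

		if (!f)
			return 0;
		while (fgets(line, sizeof(line), f)) {
			if (strstr(line, "0x00000010")) {
				found = 1;
				break;
			}
		}
		fclose(f);
		return found;
	}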
+
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	struct kvm_vcpu *vcpu;
+
+	TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SEA_TO_USER));
+
+	setup_sigbus_handler();
+
+	vm = vm_create_with_sea_handler(&vcpu);
+	vm_enable_cap(vm, KVM_CAP_ARM_SEA_TO_USER, 0);
+	vm_inject_memory_uer(vm);
+	run_vm(vm, vcpu);
+	kvm_vm_free(vm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index c3f5142b0a541..4aec84acc42f1 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -2066,6 +2066,7 @@ static struct exit_reason {
 	KVM_EXIT_STRING(NOTIFY),
 	KVM_EXIT_STRING(LOONGARCH_IOCSR),
 	KVM_EXIT_STRING(MEMORY_FAULT),
+	KVM_EXIT_STRING(ARM_SEA),
 };
 
 /*
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c2de2931c0f44..6c07dd423458c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3011,11 +3011,8 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp)
 		r = hva_to_pfn_remapped(vma, kfp, &pfn);
 		if (r == -EAGAIN)
 			goto retry;
-		if (r < 0) {
+		if (r < 0)
 			pfn = KVM_PFN_ERR_FAULT;
-			if (r == -EHWPOISON)
-				pfn = KVM_PFN_ERR_HWPOISON;
-		}
 	} else {
 		if ((kfp->flags & FOLL_NOWAIT) &&
 		    vma_is_valid(vma, kfp->flags & FOLL_WRITE))
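Outside the selftest framework, where ``vm_enable_cap()`` is not available, userspace opts in with a plain ``KVM_ENABLE_CAP`` ioctl on the VM file descriptor. A minimal sketch, assuming ``<linux/kvm.h>`` from a kernel that carries this series and therefore defines ``KVM_CAP_ARM_SEA_TO_USER``::

	#include <linux/kvm.h>
	#include <stdio.h>
	#include <sys/ioctl.h>

	/* Opt in to SEA exits on a VM file descriptor; returns 0 on success. */
	static int enable_sea_to_user(int vm_fd)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_ARM_SEA_TO_USER,
		};

		if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap)) {
			perror("KVM_ENABLE_CAP(KVM_CAP_ARM_SEA_TO_USER)");
			return -1;
		}
		return 0;
	}

After this call, unhandled SEAs from guest accesses surface as ``KVM_EXIT_ARM_SEA`` instead of being injected by KVM on its own.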