From 766877d881506d35a3bddbc80d8b5a43ae406b86 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:31 +0200 Subject: [PATCH 001/147] Revert "x86/module: prepare module loading for ROX allocations of text" The module code does not create a writable copy of the executable memory anymore so there is no need to handle it in module relocation and alternatives patching. This reverts commit 9bfc4824fd4836c16bb44f922bfaffba5da3e4f3. Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-8-rppt@kernel.org (cherry picked from commit 1d7e707af446134dd272ea8a89018c63cc17bb6a) Signed-off-by: Nirmoy Das --- arch/um/kernel/um_arch.c | 11 +- arch/x86/entry/vdso/vma.c | 3 +- arch/x86/include/asm/alternative.h | 14 +-- arch/x86/kernel/alternative.c | 181 ++++++++++++----------------- arch/x86/kernel/ftrace.c | 30 +++-- arch/x86/kernel/module.c | 45 +++---- 6 files changed, 117 insertions(+), 167 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797ec..8be91974e786d 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -440,25 +440,24 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void apply_seal_endbr(s32 *start, s32 *end) { } -void apply_retpolines(s32 *start, s32 *end, struct module *mod) +void apply_retpolines(s32 *start, s32 *end) { } -void apply_returns(s32 *start, s32 *end, struct module *mod) +void apply_returns(s32 *start, s32 *end) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index aa62949335ece..77777fc78d0b4 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -48,8 +48,7 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len), - NULL); + image->alt_len)); return 0; } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e3903b731305c..a2141665239b5 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -87,16 +87,16 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; -struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod); -extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); -extern void apply_returns(s32 *start, s32 *end, struct module *mod); -extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); +extern void apply_returns(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi, struct module *mod); + s32 *start_cfi, s32 *end_cfi); + +struct module; struct callthunk_sites { s32 
*call_start, *call_end; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c71b575bf2292..8b66a555d2f03 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,10 +392,8 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, - struct module *mod) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { - u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -405,14 +403,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, } if (a->instrlen != 6 || - wr_instr[0] != CALL_RIP_REL_OPCODE || - wr_instr[1] != CALL_RIP_REL_MODRM) { + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(wr_instr + 2); + disp = *(s32 *)(instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -450,8 +448,7 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end, - struct module *mod) + struct alt_instr *end) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -480,7 +477,6 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; - u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -494,11 +490,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); - wr_instr = module_writable_address(mod, instr); - replacement = (u8 *)&a->repl_offset + a->repl_offset; - wr_replacement = module_writable_address(mod, replacement); - BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -509,9 +501,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. 
*/ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, wr_instr, a->instrlen); + memcpy(insn_buff, instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(wr_instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); continue; } @@ -521,12 +513,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, wr_replacement, a->replacementlen); + memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a, - mod); + insn_buff_sz = alt_replace_call(instr, insn_buff, a); if (insn_buff_sz < 0) continue; } @@ -536,11 +527,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(wr_instr, insn_buff, insn_buff_sz); + text_poke_early(instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -731,20 +722,18 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -772,9 +761,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } @@ -810,8 +799,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; @@ -820,13 +808,12 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -846,35 +833,32 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } 
#else -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr); +static void poison_cfi(void *addr); -static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) +static void __init_or_module poison_endbr(void *addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); - if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) return; if (!is_endbr(endbr)) { @@ -889,7 +873,7 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(wr_addr, &poison, 4); + text_poke_early(addr, &poison, 4); } /* @@ -898,23 +882,22 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, wr_addr, true); + poison_endbr(addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16, wr_addr - 16); + poison_cfi(addr - 16); } } #else -void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1136,7 +1119,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1148,23 +1131,20 @@ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); - + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, jmp, 2); + text_poke_early(addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. 
@@ -1174,115 +1154,106 @@ static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, mov, 2); + text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(wr_addr + 1, &hash, 4); + text_poke_early(addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) +static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr + 16, wr_addr + 16, false); + poison_endbr(addr+16, false); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(wr_addr + 2, &hash, 4); + text_poke_early(addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { - text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ 
-1291,9 +1262,8 @@ static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { - bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1311,7 +1281,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ - ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1322,11 +1292,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi, mod); + ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } @@ -1338,7 +1308,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1348,17 +1318,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); + ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi, mod); + cfi_rewrite_endbr(start_cfi, end_cfi); if (builtin) pr_info("Using FineIBT CFI\n"); @@ -1377,7 +1347,7 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr, void *wr_addr) +static void poison_cfi(void *addr) { switch (cfi_mode) { case CFI_FINEIBT: @@ -1389,8 +1359,8 @@ static void poison_cfi(void *addr, void *wr_addr) * ud2 * 1: nop */ - poison_endbr(addr, wr_addr, false); - poison_hash(wr_addr + fineibt_preamble_hash); + poison_endbr(addr, false); + poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: @@ -1399,7 +1369,7 @@ static void poison_cfi(void *addr, void *wr_addr) * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(wr_addr + 1); + poison_hash(addr + 1); break; default: @@ -1410,21 +1380,22 @@ static void poison_cfi(void *addr, void *wr_addr) #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr) { } +static void poison_cfi(void *addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, mod); + start_cfi, end_cfi, + /* .builtin = */ false); } #ifdef CONFIG_SMP @@ -1721,16 +1692,16 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, 
__cfi_sites_end, NULL); + __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ - apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); - apply_returns(__return_sites, __return_sites_end, NULL); + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); - apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); + apply_alternatives(__alt_instructions, __alt_instructions_end); /* * Now all calls are established. Apply the call thunks if @@ -1741,7 +1712,7 @@ void __init alternative_instructions(void) /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 166bc0ea3bdff..cace6e8d7cc77 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,13 +118,10 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) { + if (ftrace_poke_late) text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - } else { - mutex_lock(&text_mutex); - text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); - mutex_unlock(&text_mutex); - } + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; } @@ -321,7 +318,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - void *ret; + int ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -352,15 +349,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = text_poke_copy(trampoline, (void *)start_offset, size); - if (WARN_ON(!ret)) + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - text_poke_copy(ip, retq, sizeof(retq)); + memcpy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -368,7 +365,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - if (!text_poke_copy(ip, x86_nops[2], 2)) + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); + if (ret < 0) goto fail; } @@ -381,7 +379,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - text_poke_copy(ptr, &ops, sizeof(unsigned long)); + *ptr = (unsigned long)ops; op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -397,7 +395,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to 
the function */ mutex_lock(&text_mutex); @@ -407,9 +405,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - text_poke_copy_locked(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE, false); + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8984abd91c001..837450b6e882f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -146,21 +146,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - void *wr_loc = module_writable_address(me, loc); - - if (memcmp(wr_loc, &zero, size)) { + if (memcmp(loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(wr_loc, &val, size); + write(loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -227,7 +224,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -236,6 +233,8 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -266,20 +265,20 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size, me); + apply_retpolines(rseg, rseg + retpolines->sh_size); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size, me); + apply_returns(rseg, rseg + returns->sh_size); } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size, me); + apply_alternatives(aseg, aseg + alt->sh_size); } if (calls || alt) { struct callthunk_sites cs = {}; @@ -298,28 +297,8 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); } - - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - - return 0; -} - -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *locks = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; 
s++) { - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - } - if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -329,6 +308,10 @@ int module_post_finalize(const Elf_Ehdr *hdr, text, text_end); } + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } From 1b96a80d833a432d4160d0ab729de38cd0590950 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Feb 2025 13:15:31 +0100 Subject: [PATCH 002/147] x86/ibt: Clean up is_endbr() Pretty much every caller of is_endbr() actually wants to test something at an address and ends up doing get_kernel_nofault(). Fold the lot into a more convenient helper. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Acked-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: "Masami Hiramatsu (Google)" Link: https://lore.kernel.org/r/20250207122546.181367417@infradead.org (cherry picked from commit 72e213a7ccf9dc78a85eecee8dc8170762ed876c) Signed-off-by: Nirmoy Das --- arch/x86/events/core.c | 2 +- arch/x86/include/asm/ftrace.h | 16 ++-------------- arch/x86/include/asm/ibt.h | 5 +++-- arch/x86/kernel/alternative.c | 20 ++++++++++++++------ arch/x86/kernel/kprobes/core.c | 11 +---------- arch/x86/net/bpf_jit_comp.c | 4 ++-- kernel/trace/bpf_trace.c | 21 ++++----------------- 7 files changed, 27 insertions(+), 52 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a46b792a171cb..099b63eb178d7 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2845,7 +2845,7 @@ static bool is_uprobe_at_func_entry(struct pt_regs *regs) return true; /* endbr64 (64-bit only) */ - if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn)) + if (user_64bit_mode(regs) && is_endbr((u32 *)auprobe->insn)) return true; return false; diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f9cb4d07df58f..f2265246249ad 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -36,21 +36,9 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) static inline unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) { -#ifdef CONFIG_X86_KERNEL_IBT - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. 
- */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) + if (is_endbr((void*)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; -#endif + return fentry_ip; } #define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip) diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index 1e59581d500ca..d955e0d1cbf15 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -65,7 +65,7 @@ static inline __attribute_const__ u32 gen_endbr_poison(void) return 0x001f0f66; /* osp nopl (%rax) */ } -static inline bool is_endbr(u32 val) +static inline bool __is_endbr(u32 val) { if (val == gen_endbr_poison()) return true; @@ -74,6 +74,7 @@ static inline bool is_endbr(u32 val) return val == gen_endbr(); } +extern __noendbr bool is_endbr(u32 *val); extern __noendbr u64 ibt_save(bool disable); extern __noendbr void ibt_restore(u64 save); @@ -98,7 +99,7 @@ extern __noendbr void ibt_restore(u64 save); #define __noendbr -static inline bool is_endbr(u32 val) { return false; } +static inline bool is_endbr(u32 *val) { return false; } static inline u64 ibt_save(bool disable) { return 0; } static inline void ibt_restore(u64 save) { } diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 8b66a555d2f03..9a252bb0d34b8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -852,16 +852,24 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT +__noendbr bool is_endbr(u32 *val) +{ + u32 endbr; + + __get_kernel_nofault(&endbr, val, u32, Efault); + return __is_endbr(endbr); + +Efault: + return false; +} + static void poison_cfi(void *addr); static void __init_or_module poison_endbr(void *addr, bool warn) { - u32 endbr, poison = gen_endbr_poison(); - - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) - return; + u32 poison = gen_endbr_poison(); - if (!is_endbr(endbr)) { + if (!is_endbr(addr)) { WARN_ON_ONCE(warn); return; } @@ -988,7 +996,7 @@ static u32 cfi_seed __ro_after_init; static u32 cfi_rehash(u32 hash) { hash ^= cfi_seed; - while (unlikely(is_endbr(hash) || is_endbr(-hash))) { + while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { bool lsb = hash & 1; hash >>= 1; if (lsb) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 72e6a45e7ec24..09608fd936876 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -373,16 +373,7 @@ static bool can_probe(unsigned long paddr) kprobe_opcode_t *arch_adjust_kprobe_addr(unsigned long addr, unsigned long offset, bool *on_func_entry) { - u32 insn; - - /* - * Since 'addr' is not guaranteed to be safe to access, use - * copy_from_kernel_nofault() to read the instruction: - */ - if (copy_from_kernel_nofault(&insn, (void *)addr, sizeof(u32))) - return NULL; - - if (is_endbr(insn)) { + if (is_endbr((u32 *)addr)) { *on_func_entry = !offset || offset == 4; if (*on_func_entry) offset = 4; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a43fc5af973d2..f36508b67278a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -641,7 +641,7 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, * See emit_prologue(), for IBT builds the trampoline hook is preceded * with an ENDBR instruction. 
*/ - if (is_endbr(*(u32 *)ip)) + if (is_endbr(ip)) ip += ENDBR_INSN_SIZE; return __bpf_arch_text_poke(ip, t, old_addr, new_addr); @@ -3036,7 +3036,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im /* skip patched call instruction and point orig_call to actual * body of the kernel function. */ - if (is_endbr(*(u32 *)orig_call)) + if (is_endbr(orig_call)) orig_call += ENDBR_INSN_SIZE; orig_call += X86_PATCH_SIZE; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 13bef2462e94b..05bedb0f1919d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1039,27 +1039,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { .arg1_type = ARG_PTR_TO_CTX, }; -#ifdef CONFIG_X86_KERNEL_IBT -static unsigned long get_entry_ip(unsigned long fentry_ip) +static inline unsigned long get_entry_ip(unsigned long fentry_ip) { - u32 instr; - - /* We want to be extra safe in case entry ip is on the page edge, - * but otherwise we need to avoid get_kernel_nofault()'s overhead. - */ - if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) { - if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE))) - return fentry_ip; - } else { - instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE); - } - if (is_endbr(instr)) +#ifdef CONFIG_X86_KERNEL_IBT + if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE))) fentry_ip -= ENDBR_INSN_SIZE; +#endif return fentry_ip; } -#else -#define get_entry_ip(fentry_ip) fentry_ip -#endif BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs) { From 301e3ae188f73304fb30e1a17038b32cb13dd696 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 11 Jul 2025 07:18:00 -0700 Subject: [PATCH 003/147] Revert "NVIDIA: SAUCE: iommu/arm-smmu-v3: Implement arm_smmu_get_msi_mapping_domain" This reverts commit 78480b2f86e004a5c3cf2dc772bf37e950fdb935. Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index b425baef968b9..34a0be59cd919 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -30,15 +30,6 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) return info; } -static struct iommu_domain * -arm_smmu_get_msi_mapping_domain(struct iommu_domain *domain) -{ - struct arm_smmu_nested_domain *nested_domain = - container_of(domain, struct arm_smmu_nested_domain, domain); - - return &nested_domain->vsmmu->s2_parent->domain; -} - static void arm_smmu_make_nested_cd_table_ste( struct arm_smmu_ste *target, struct arm_smmu_master *master, struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) @@ -145,7 +136,6 @@ static void arm_smmu_domain_nested_free(struct iommu_domain *domain) } static const struct iommu_domain_ops arm_smmu_nested_ops = { - .get_msi_mapping_domain = arm_smmu_get_msi_mapping_domain, .attach_dev = arm_smmu_attach_dev_nested, .free = arm_smmu_domain_nested_free, }; From fc78ff3ac6dbf8a9e22dab165a7feef1d3001d93 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 3 Feb 2025 21:00:55 -0800 Subject: [PATCH 004/147] iommufd/fault: Remove iommufd_fault_domain_attach/detach/replace_dev() There are new attach/detach/replace helpers in device.c taking care of both the attach_handle and the fault specific routines for iopf_enable/disable() and auto response. Clean up these redundant functions in the fault.c file. 
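For reference, the removed helpers enforced a strict ordering that the consolidated device.c helpers must preserve: enable IOPF before attaching the domain, and roll IOPF back if the attach fails. A minimal sketch of that contract (illustrative only, using the function names deleted below, not the actual consolidated device.c code):

	static int fault_attach_sketch(struct iommufd_hw_pagetable *hwpt,
				       struct iommufd_device *idev,
				       struct iommu_attach_handle *handle)
	{
		int ret;

		if (!hwpt->fault)
			return -EINVAL;

		ret = iommufd_fault_iopf_enable(idev);	/* 1: IOPF on first */
		if (ret)
			return ret;

		ret = iommu_attach_group_handle(hwpt->domain,
						idev->igroup->group, handle);
		if (ret)
			iommufd_fault_iopf_disable(idev); /* 2: roll back */
		return ret;
	}

The consolidated helpers additionally manage the attach_handle allocation and the fault auto response on detach.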
Link: https://patch.msgid.link/r/3ca94625e9d78270d9a715fa0809414fddd57e58.1738645017.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit dc10ba25d43f433ad5d9e8e6be4f4d2bb3cd9ddb) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/fault.c | 120 ------------------------ drivers/iommu/iommufd/iommufd_private.h | 8 -- 2 files changed, 128 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index cb844e6799d4f..931a3fbe6e32c 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -60,44 +60,6 @@ void iommufd_fault_iopf_disable(struct iommufd_device *idev) mutex_unlock(&idev->iopf_lock); } -static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - int ret; - - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, - &handle->handle); - if (ret) - kfree(handle); - - return ret; -} - -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - int ret; - - if (!hwpt->fault) - return -EINVAL; - - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - - ret = __fault_domain_attach_dev(hwpt, idev); - if (ret) - iommufd_fault_iopf_disable(idev); - - return ret; -} - void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle) { @@ -135,88 +97,6 @@ void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&fault->mutex); } -static struct iommufd_attach_handle * -iommufd_device_get_attach_handle(struct iommufd_device *idev) -{ - struct iommu_attach_handle *handle; - - handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); - if (IS_ERR(handle)) - return NULL; - - return to_iommufd_handle(handle); -} - -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) -{ - struct iommufd_attach_handle *handle; - - handle = iommufd_device_get_attach_handle(idev); - iommu_detach_group_handle(hwpt->domain, idev->igroup->group); - iommufd_auto_response_faults(hwpt, handle); - iommufd_fault_iopf_disable(idev); - kfree(handle); -} - -static int __fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - struct iommufd_attach_handle *handle, *curr = NULL; - int ret; - - if (old->fault) - curr = iommufd_device_get_attach_handle(idev); - - if (hwpt->fault) { - handle = kzalloc(sizeof(*handle), GFP_KERNEL); - if (!handle) - return -ENOMEM; - - handle->idev = idev; - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, &handle->handle); - } else { - ret = iommu_replace_group_handle(idev->igroup->group, - hwpt->domain, NULL); - } - - if (!ret && curr) { - iommufd_auto_response_faults(old, curr); - kfree(curr); - } - - return ret; -} - -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) -{ - bool iopf_off = !hwpt->fault && old->fault; - bool iopf_on = hwpt->fault && !old->fault; - int ret; - - if (iopf_on) { - ret = iommufd_fault_iopf_enable(idev); - if (ret) - return ret; - } - - ret = __fault_domain_replace_dev(idev, hwpt, old); - if (ret) { - if (iopf_on) - iommufd_fault_iopf_disable(idev); - return ret; - } - - if 
(iopf_off) - iommufd_fault_iopf_disable(idev); - - return 0; -} - void iommufd_fault_destroy(struct iommufd_object *obj) { struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 02fe1ada97cc7..8e0e3ab647476 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -496,14 +496,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); int iommufd_fault_iopf_handler(struct iopf_group *group); -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old); - int iommufd_fault_iopf_enable(struct iommufd_device *idev); void iommufd_fault_iopf_disable(struct iommufd_device *idev); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, From 1d46c11beef39c7ce6e39dc62307cc9aab936712 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Fri, 11 Jul 2025 03:47:32 -0700 Subject: [PATCH 005/147] Revert "NVIDIA: SAUCE: iommu/dma: Support MSIs through nested domains" This reverts commit 8aced5e9f03459e87ad621c6ca69e413fdfa1ee4. Signed-off-by: Nirmoy Das --- drivers/iommu/dma-iommu.c | 18 ++---------------- include/linux/iommu.h | 4 ---- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index f1580bd8a42a2..2a9fa0c8cc00f 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1800,20 +1800,6 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return NULL; } -/* - * Nested domains may not have an MSI cookie or accept mappings, but they may - * be related to a domain which does, so we let them tell us what they need. 
- */ -static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev) -{ - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - - if (domain && domain->type == IOMMU_DOMAIN_NESTED && - domain->ops && domain->ops->get_msi_mapping_domain) - domain = domain->ops->get_msi_mapping_domain(domain); - return domain; -} - /** * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain * @desc: MSI descriptor, will store the MSI page @@ -1824,7 +1810,7 @@ static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev) int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct iommu_dma_msi_page *msi_page; static DEFINE_MUTEX(msi_prepare_lock); /* see below */ @@ -1857,7 +1843,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); + const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); const struct iommu_dma_msi_page *msi_page; msi_page = msi_desc_get_iommu_cookie(desc); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9058e0ce5a190..87cbe47b323e6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -696,8 +696,6 @@ struct iommu_ops { * specific mechanisms. * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. - * @get_msi_mapping_domain: Return the related iommu_domain that should hold the - * MSI cookie and accept mapping(s). */ struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); @@ -727,8 +725,6 @@ struct iommu_domain_ops { unsigned long quirks); void (*free)(struct iommu_domain *domain); - struct iommu_domain * - (*get_msi_mapping_domain)(struct iommu_domain *domain); }; /** From 0cce0aea030f957370bb2cce71b2680d409c46fb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:36 -0800 Subject: [PATCH 006/147] genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie The IOMMU translation for MSI message addresses has been a two-step process, separated in time: 1) iommu_dma_prepare_msi(): A cookie pointer containing the IOVA address is stored in the MSI descriptor when an MSI interrupt is allocated. 2) iommu_dma_compose_msi_msg(): this cookie pointer is used to compute a translated message address. This has an inherent lifetime problem for the pointer stored in the cookie that must remain valid between the two steps. However, there is no locking at the irq layer that helps protect the lifetime. Today, this works under the assumption that the iommu domain is not changed while MSI interrupts are being programmed. This is true for normal DMA API users within the kernel, as the iommu domain is attached before the driver is probed and cannot be changed while a driver is attached. Classic VFIO type1 also prevented changing the iommu domain while VFIO was running as it does not support changing the "container" after starting up. However, iommufd has improved this so that the iommu domain can be changed during VFIO operation.
This potentially allows userspace to directly race VFIO_DEVICE_ATTACH_IOMMUFD_PT (which calls iommu_attach_group()) and VFIO_DEVICE_SET_IRQS (which calls into iommu_dma_compose_msi_msg()). This potentially causes both the cookie pointer and the unlocked call to iommu_get_domain_for_dev() on the MSI translation path to become UAFs. Fix the MSI cookie UAF by removing the cookie pointer. The translated IOVA address is already known during iommu_dma_prepare_msi() and cannot change. Thus, it can simply be stored as an integer in the MSI descriptor. The other UAF related to iommu_get_domain_for_dev() will be addressed in patch "iommu: Make iommu_dma_prepare_msi() into a generic operation" by using the IOMMU group mutex. Link: https://patch.msgid.link/r/a4f2cd76b9dc1833ee6c1cf325cba57def22231c.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe (cherry picked from commit 1f7df3a691740a7736bbc99dc4ed536120eb4746) Signed-off-by: Nirmoy Das --- drivers/iommu/dma-iommu.c | 28 +++++++++++++--------------- include/linux/msi.h | 33 ++++++++++++--------------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 2a9fa0c8cc00f..0f0caf59023c7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1815,7 +1815,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) static DEFINE_MUTEX(msi_prepare_lock); /* see below */ if (!domain || !domain->iova_cookie) { - desc->iommu_cookie = NULL; + msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } @@ -1827,11 +1827,12 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) mutex_lock(&msi_prepare_lock); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); mutex_unlock(&msi_prepare_lock); - - msi_desc_set_iommu_cookie(desc, msi_page); - if (!msi_page) return -ENOMEM; + + msi_desc_set_iommu_msi_iova( + desc, msi_page->iova, + ilog2(cookie_msi_granule(domain->iova_cookie))); return 0; } @@ -1842,18 +1843,15 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) */ void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { - struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - const struct iommu_dma_msi_page *msi_page; +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - msi_page = msi_desc_get_iommu_cookie(desc); - - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) - return; - - msg->address_hi = upper_32_bits(msi_page->iova); - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; - msg->address_lo += lower_32_bits(msi_page->iova); + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); + } +#endif } static int iommu_dma_init(void) diff --git a/include/linux/msi.h b/include/linux/msi.h index 59a421fc42bf0..63d0e51f7a801 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -165,6 +165,10 @@ struct msi_desc_data { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. 
+ * Only used if iommu_msi_shift != 0 + * @iommu_msi_shift: Indicates how many bits of the original address should be + * preserved when using iommu_msi_iova. * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message @@ -183,7 +187,8 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - const void *iommu_cookie; + u64 iommu_msi_iova : 58; + u64 iommu_msi_shift : 6; #endif #ifdef CONFIG_SYSFS struct device_attribute *sysfs_attrs; @@ -284,28 +289,14 @@ struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, #define msi_desc_to_dev(desc) ((desc)->dev) -#ifdef CONFIG_IRQ_MSI_IOMMU -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return desc->iommu_cookie; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ - desc->iommu_cookie = iommu_cookie; -} -#else -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, + unsigned int msi_shift) { - return NULL; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ -} +#ifdef CONFIG_IRQ_MSI_IOMMU + desc->iommu_msi_iova = msi_iova >> msi_shift; + desc->iommu_msi_shift = msi_shift; #endif +} int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); From f5af33dc94524b3b60a8d252bb9cfdb7b1a061e5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:37 -0800 Subject: [PATCH 007/147] genirq/msi: Refactor iommu_dma_compose_msi_msg() The two-step process to translate the MSI address involves two functions, iommu_dma_prepare_msi() and iommu_dma_compose_msi_msg(). Previously iommu_dma_compose_msi_msg() needed to be in the iommu layer as it had to dereference the opaque cookie pointer. Now, the previous patch changed the cookie pointer into an integer, so there is no longer any need for the iommu layer to be involved. Further, the call sites of iommu_dma_compose_msi_msg() all follow the same pattern of setting an MSI message's address_hi/lo to the non-translated address and then immediately calling iommu_dma_compose_msi_msg(). Refactor iommu_dma_compose_msi_msg() into msi_msg_set_addr() that directly accepts the u64 version of the address and simplifies all the callers. Move the new helper to linux/msi.h since it has nothing to do with iommu. Aside from refactoring, this logically prepares for the next patch, which allows multiple implementation options for iommu_dma_prepare_msi(). So, it does not make sense to keep iommu_dma_compose_msi_msg() in dma-iommu.c as it no longer provides the only iommu_dma_prepare_msi() implementation.
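As a concrete illustration of the call-site simplification (a hedged sketch of the common pattern rather than a verbatim quote of any one driver; "d" stands for the driver's irq_data pointer and "addr" for its doorbell address):

	/* Before: write the untranslated address, then let the IOMMU
	 * layer rewrite it in place. */
	msg->address_hi = upper_32_bits(addr);
	msg->address_lo = lower_32_bits(addr);
	iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg);

	/* After: one helper takes the u64 address and applies any IOVA
	 * override recorded in the descriptor. */
	msi_msg_set_addr(irq_data_get_msi_desc(d), msg, addr);

The gicv2m and GICv3 ITS hunks below follow exactly this shape.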
Link: https://patch.msgid.link/r/eda62a9bafa825e9cdabd7ddc61ad5a21c32af24.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe (cherry picked from commit 9349887e93009331e751854843f73a086bef4018) Signed-off-by: Nirmoy Das --- drivers/iommu/dma-iommu.c | 18 ------------------ drivers/irqchip/irq-gic-v2m.c | 5 +---- drivers/irqchip/irq-gic-v3-its.c | 13 +++---------- drivers/irqchip/irq-gic-v3-mbi.c | 12 ++++-------- drivers/irqchip/irq-ls-scfg-msi.c | 5 ++--- include/linux/iommu.h | 6 ------ include/linux/msi.h | 28 ++++++++++++++++++++++++++++ 7 files changed, 38 insertions(+), 49 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 0f0caf59023c7..bf91e014d1791 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1836,24 +1836,6 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return 0; } -/** - * iommu_dma_compose_msi_msg() - Apply translation to an MSI message - * @desc: MSI descriptor prepared by iommu_dma_prepare_msi() - * @msg: MSI message containing target physical address - */ -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -#ifdef CONFIG_IRQ_MSI_IOMMU - if (desc->iommu_msi_shift) { - u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - - msg->address_hi = upper_32_bits(msi_iova); - msg->address_lo = lower_32_bits(msi_iova) | - (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); - } -#endif -} - static int iommu_dma_init(void) { if (is_kdump_kernel()) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index a1e370d0200f1..34f437207adf7 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -87,9 +87,6 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct v2m_data *v2m = irq_data_get_irq_chip_data(data); phys_addr_t addr = gicv2m_get_msi_addr(v2m, data->hwirq); - msg->address_hi = upper_32_bits(addr); - msg->address_lo = lower_32_bits(addr); - if (v2m->flags & GICV2M_GRAVITON_ADDRESS_ONLY) msg->data = 0; else @@ -97,7 +94,7 @@ static void gicv2m_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (v2m->flags & GICV2M_NEEDS_SPI_OFFSET) msg->data -= v2m->spi_offset; - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, addr); } static struct irq_chip gicv2m_irq_chip = { diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index f30ed281882ff..0115ad6c82593 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1811,17 +1811,10 @@ static u64 its_irq_get_msi_base(struct its_device *its_dev) static void its_irq_compose_msi_msg(struct irq_data *d, struct msi_msg *msg) { struct its_device *its_dev = irq_data_get_irq_chip_data(d); - struct its_node *its; - u64 addr; - - its = its_dev->its; - addr = its->get_msi_base(its_dev); - - msg->address_lo = lower_32_bits(addr); - msg->address_hi = upper_32_bits(addr); - msg->data = its_get_event_id(d); - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(d), msg); + msg->data = its_get_event_id(d); + msi_msg_set_addr(irq_data_get_msi_desc(d), msg, + its_dev->its->get_msi_base(its_dev)); } static int its_irq_set_irqchip_state(struct irq_data *d, diff --git a/drivers/irqchip/irq-gic-v3-mbi.c b/drivers/irqchip/irq-gic-v3-mbi.c index 3fe870f8ee174..a6510128611e0 100644 --- a/drivers/irqchip/irq-gic-v3-mbi.c +++ b/drivers/irqchip/irq-gic-v3-mbi.c @@ 
-147,22 +147,18 @@ static const struct irq_domain_ops mbi_domain_ops = { static void mbi_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) { - msg[0].address_hi = upper_32_bits(mbi_phys_base + GICD_SETSPI_NSR); - msg[0].address_lo = lower_32_bits(mbi_phys_base + GICD_SETSPI_NSR); msg[0].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[0], + mbi_phys_base + GICD_SETSPI_NSR); } static void mbi_compose_mbi_msg(struct irq_data *data, struct msi_msg *msg) { mbi_compose_msi_msg(data, msg); - msg[1].address_hi = upper_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); - msg[1].address_lo = lower_32_bits(mbi_phys_base + GICD_CLRSPI_NSR); msg[1].data = data->parent_data->hwirq; - - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), &msg[1]); + msi_msg_set_addr(irq_data_get_msi_desc(data), &msg[1], + mbi_phys_base + GICD_CLRSPI_NSR); } static bool mbi_init_dev_msi_info(struct device *dev, struct irq_domain *domain, diff --git a/drivers/irqchip/irq-ls-scfg-msi.c b/drivers/irqchip/irq-ls-scfg-msi.c index c0e1aafe468c3..3cb80796cc7cc 100644 --- a/drivers/irqchip/irq-ls-scfg-msi.c +++ b/drivers/irqchip/irq-ls-scfg-msi.c @@ -87,8 +87,6 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) { struct ls_scfg_msi *msi_data = irq_data_get_irq_chip_data(data); - msg->address_hi = upper_32_bits(msi_data->msiir_addr); - msg->address_lo = lower_32_bits(msi_data->msiir_addr); msg->data = data->hwirq; if (msi_affinity_flag) { @@ -98,7 +96,8 @@ static void ls_scfg_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) msg->data |= cpumask_first(mask); } - iommu_dma_compose_msi_msg(irq_data_get_msi_desc(data), msg); + msi_msg_set_addr(irq_data_get_msi_desc(data), msg, + msi_data->msiir_addr); } static int ls_scfg_msi_set_affinity(struct irq_data *irq_data, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 87cbe47b323e6..319a85d86695e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1508,7 +1508,6 @@ static inline void iommu_debugfs_setup(void) {} int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); -void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg); #else /* CONFIG_IOMMU_DMA */ @@ -1524,11 +1523,6 @@ static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_a { return 0; } - -static inline void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) -{ -} - #endif /* CONFIG_IOMMU_DMA */ /* diff --git a/include/linux/msi.h b/include/linux/msi.h index 63d0e51f7a801..862cfec024763 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -298,6 +298,34 @@ static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_io #endif } +/** + * msi_msg_set_addr() - Set MSI address in an MSI message + * + * @desc: MSI descriptor that may carry an IOVA base address for MSI via @iommu_msi_iova/shift + * @msg: Target MSI message to set its address_hi and address_lo + * @msi_addr: Physical address to set the MSI message + * + * Notes: + * - Override @msi_addr using the IOVA base address in the @desc if @iommu_msi_shift is set + * - Otherwise, simply set @msi_addr to @msg + */ +static inline void msi_msg_set_addr(struct msi_desc *desc, struct msi_msg *msg, + phys_addr_t msi_addr) +{ +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << 
desc->iommu_msi_shift; + + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msi_addr & ((1 << desc->iommu_msi_shift) - 1)); + return; + } +#endif + msg->address_hi = upper_32_bits(msi_addr); + msg->address_lo = lower_32_bits(msi_addr); +} + int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, struct msi_desc *init_desc); /** From 642d1986fc486025f50a2eddc98e37cacff56d44 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:38 -0800 Subject: [PATCH 008/147] iommu: Make iommu_dma_prepare_msi() into a generic operation SW_MSI allows the IOMMU to translate an MSI message before the MSI message is delivered to the interrupt controller. On such systems, an iommu_domain must have a translation for the MSI message for interrupts to work. The IRQ subsystem will call into the IOMMU to request that a physical page be set up to receive MSI messages, and the IOMMU then sets an IOVA that maps to that physical page. Ultimately the IOVA is programmed into the device via the msi_msg. Generalize this by allowing iommu_domain owners to provide implementations of this mapping. Add a function pointer in struct iommu_domain to allow a domain owner to provide its own implementation. Have dma-iommu supply its implementation for IOMMU_DOMAIN_DMA types during the iommu_get_dma_cookie() path. For IOMMU_DOMAIN_UNMANAGED types used by VFIO (and iommufd for now), have the same iommu_dma_sw_msi set as well in the iommu_get_msi_cookie() path. Hold the group mutex while in iommu_dma_prepare_msi() to ensure the domain doesn't change or become freed while running. Races with IRQ operations from VFIO and domain changes from iommufd are possible here. Replace the msi_prepare_lock with a lockdep assertion for the group mutex as documentation. For dma-iommu.c, each iommu_domain is unique to a group.
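As an illustration (not part of the patch), a minimal sketch of how a domain owner could wire up the new hook; my_sw_msi(), my_alloc_domain() and MY_MSI_IOVA_BASE are hypothetical names, and error handling is elided:

#include <linux/iommu.h>
#include <linux/msi.h>

/* Hypothetical fixed window; a real owner would manage an allocator. */
#define MY_MSI_IOVA_BASE	0x8000000UL

static int my_sw_msi(struct iommu_domain *domain, struct msi_desc *desc,
		     phys_addr_t msi_addr)
{
	unsigned long iova = MY_MSI_IOVA_BASE;
	int rc;

	/* Map the doorbell page into the domain... */
	rc = iommu_map(domain, iova, msi_addr & PAGE_MASK, PAGE_SIZE,
		       IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, GFP_KERNEL);
	if (rc)
		return rc;

	/* ...and record the IOVA so msi_msg_set_addr() can use it later. */
	msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT);
	return 0;
}

static struct iommu_domain *my_alloc_domain(struct device *dev)
{
	struct iommu_domain *domain = iommu_paging_domain_alloc(dev);

	/* Register the owner's sw_msi implementation with the domain. */
	if (!IS_ERR(domain))
		iommu_domain_set_sw_msi(domain, my_sw_msi);
	return domain;
}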
Link: https://patch.msgid.link/r/4ca696150d2baee03af27c4ddefdb7b0b0280e7b.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 288683c92b1abc32277c83819bea287af614d239) Signed-off-by: Nirmoy Das --- drivers/iommu/dma-iommu.c | 33 +++++++++++++---------------- drivers/iommu/iommu.c | 29 ++++++++++++++++++++++++++ include/linux/iommu.h | 44 ++++++++++++++++++++++++++------------- 3 files changed, 73 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index bf91e014d1791..3b58244e6344a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,9 @@ static int __init iommu_dma_forcedac_setup(char *str) } early_param("iommu.forcedac", iommu_dma_forcedac_setup); +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + /* Number of entries per flush queue */ #define IOVA_DEFAULT_FQ_SIZE 256 #define IOVA_SINGLE_FQ_SIZE 32768 @@ -398,6 +402,7 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) return -ENOMEM; mutex_init(&domain->iova_cookie->mutex); + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } @@ -429,6 +434,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) cookie->msi_iova = base; domain->iova_cookie = cookie; + iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); return 0; } EXPORT_SYMBOL(iommu_get_msi_cookie); @@ -443,6 +449,9 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; + if (domain->sw_msi != iommu_dma_sw_msi) + return; + if (!cookie) return; @@ -1800,33 +1809,19 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return NULL; } -/** - * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain - * @desc: MSI descriptor, will store the MSI page - * @msi_addr: MSI target address to be mapped - * - * Return: 0 on success or negative error code if the mapping failed. - */ -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_dma_msi_page *msi_page; - static DEFINE_MUTEX(msi_prepare_lock); /* see below */ + const struct iommu_dma_msi_page *msi_page; - if (!domain || !domain->iova_cookie) { + if (!domain->iova_cookie) { msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } - /* - * In fact the whole prepare operation should already be serialised by - * irq_domain_mutex further up the callchain, but that's pretty subtle - * on its own, so consider this locking as failsafe documentation... 
- */ - mutex_lock(&msi_prepare_lock); + iommu_group_mutex_assert(dev); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); - mutex_unlock(&msi_prepare_lock); if (!msi_page) return -ENOMEM; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b3235cb3220ce..b1437a8dfe0ae 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3789,3 +3789,32 @@ int iommu_replace_group_handle(struct iommu_group *group, return ret; } EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, "IOMMUFD_INTERNAL"); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +/** + * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain + * @desc: MSI descriptor, will store the MSI page + * @msi_addr: MSI target address to be mapped + * + * The implementation of sw_msi() should take msi_addr and map it to + * an IOVA in the domain and call msi_desc_set_iommu_msi_iova() with the + * mapping information. + * + * Return: 0 on success or negative error code if the mapping failed. + */ +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommu_group *group = dev->iommu_group; + int ret = 0; + + if (!group) + return 0; + + mutex_lock(&group->mutex); + if (group->domain && group->domain->sw_msi) + ret = group->domain->sw_msi(group->domain, desc, msi_addr); + mutex_unlock(&group->mutex); + return ret; +} +#endif /* CONFIG_IRQ_MSI_IOMMU */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 319a85d86695e..20a133d2dba0a 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -44,6 +44,8 @@ struct iommu_dma_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; +struct msi_desc; +struct msi_msg; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -216,6 +218,12 @@ struct iommu_domain { struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); + +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#endif + void *fault_data; union { struct { @@ -234,6 +242,16 @@ struct iommu_domain { }; }; +static inline void iommu_domain_set_sw_msi( + struct iommu_domain *domain, + int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr)) +{ +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) + domain->sw_msi = sw_msi; +#endif +} + static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; @@ -1470,6 +1488,18 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#ifdef CONFIG_IRQ_MSI_IOMMU +#ifdef CONFIG_IOMMU_API +int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); +#else +static inline int iommu_dma_prepare_msi(struct msi_desc *desc, + phys_addr_t msi_addr) +{ + return 0; +} +#endif /* CONFIG_IOMMU_API */ +#endif /* CONFIG_IRQ_MSI_IOMMU */ + #if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) void iommu_group_mutex_assert(struct device *dev); #else @@ -1503,26 +1533,12 @@ static inline void iommu_debugfs_setup(void) {} #endif #ifdef CONFIG_IOMMU_DMA -#include - int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base); - -int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr); - #else /* CONFIG_IOMMU_DMA */ - -struct msi_desc; -struct msi_msg; - static inline int 
iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { return -ENODEV; } - -static inline int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) -{ - return 0; -} #endif /* CONFIG_IOMMU_DMA */ /* From d31ea235ef20b7b1afaa446b1f92b727b7464ec1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:46 -0800 Subject: [PATCH 009/147] iommu: Make @handle mandatory in iommu_{attach|replace}_group_handle() Callers of the two APIs always provide a valid handle, so make @handle a mandatory parameter. Take this chance to incorporate setting handle->domain under the protection of group->mutex in iommu_attach_group_handle(). Link: https://patch.msgid.link/r/20250226011849.5102-2-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 237603a46abf9637f9b71c6225293fac2b4d6ef7) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b1437a8dfe0ae..7575208293bdf 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3704,10 +3704,11 @@ int iommu_attach_group_handle(struct iommu_domain *domain, { int ret; - if (handle) - handle->domain = domain; + if (!handle) + return -EINVAL; mutex_lock(&group->mutex); + handle->domain = domain; ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); if (ret) goto err_unlock; @@ -3761,16 +3762,14 @@ int iommu_replace_group_handle(struct iommu_group *group, void *curr; int ret; - if (!new_domain) + if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - if (handle) { - ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); - if (ret) - goto err_unlock; - handle->domain = new_domain; - } + handle->domain = new_domain; + ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); + if (ret) + goto err_unlock; ret = __iommu_group_set_domain(group, new_domain); if (ret) From ce8b472b99556de520fb4cba68c244f825d2599d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:47 -0800 Subject: [PATCH 010/147] iommu: Drop iommu_group_replace_domain() iommufd does not use it now, so drop it. Link: https://patch.msgid.link/r/20250226011849.5102-3-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit 473ec072a63370e37dddbadb2a7cc2419a0fdb28) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu-priv.h | 3 --- drivers/iommu/iommu.c | 35 ++++++----------------------------- 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index de5b54eaa8bf1..b4508423e13ba 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,9 +24,6 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ?
fwspec->iommu_fwnode : NULL); } -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain); - int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7575208293bdf..21bdae13d9636 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2364,32 +2364,6 @@ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_attach_group); -/** - * iommu_group_replace_domain - replace the domain that a group is attached to - * @group: IOMMU group that will be attached to the new domain - * @new_domain: new IOMMU domain to replace with - * - * This API allows the group to switch domains without being forced to go to - * the blocking domain in-between. - * - * If the currently attached domain is a core domain (e.g. a default_domain), - * it will act just like the iommu_attach_group(). - */ -int iommu_group_replace_domain(struct iommu_group *group, - struct iommu_domain *new_domain) -{ - int ret; - - if (!new_domain) - return -EINVAL; - - mutex_lock(&group->mutex); - ret = __iommu_group_set_domain(group, new_domain); - mutex_unlock(&group->mutex); - return ret; -} -EXPORT_SYMBOL_NS_GPL(iommu_group_replace_domain, "IOMMUFD_INTERNAL"); - static int __iommu_device_set_domain(struct iommu_group *group, struct device *dev, struct iommu_domain *new_domain, @@ -3751,9 +3725,12 @@ EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL"); * @new_domain: new IOMMU domain to replace with * @handle: attach handle * - * This is a variant of iommu_group_replace_domain(). It allows the caller to - * provide an attach handle for the new domain and use it when the domain is - * attached. + * This API allows the group to switch domains without being forced to go to + * the blocking domain in-between. It allows the caller to provide an attach + * handle for the new domain and use it when the domain is attached. + * + * If the currently attached domain is a core domain (e.g. a default_domain), + * it will act just like the iommu_attach_group_handle(). */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, From 1d6c10df43c8ad01cf6a563aa0cfa2f4b4ce42f8 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:48 -0800 Subject: [PATCH 011/147] iommu: Store either domain or handle in group->pasid_array iommu_attach_device_pasid() only stores the handle in group->pasid_array when a valid handle is supplied. However, this makes iommu_attach_device_pasid() unable to detect whether the pasid has previously been attached. To be complete, let iommu_attach_device_pasid() store the domain in group->pasid_array when no valid handle is provided. The other users of group->pasid_array are updated to be consistent, e.g. iommu_attach_group_handle() and iommu_replace_group_handle().
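A hedged sketch of the resulting tagged-pointer scheme; the reader-side helper below is hypothetical and only mirrors the tags defined in the diff that follows:

#include <linux/iommu.h>
#include <linux/xarray.h>

/*
 * Hypothetical helper: recover the iommu_domain from a group->pasid_array
 * entry regardless of which kind of entry was stored.
 */
static struct iommu_domain *pasid_entry_to_domain(void *entry)
{
	struct iommu_attach_handle *handle;

	if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_HANDLE) {
		handle = xa_untag_pointer(entry);
		return handle->domain;
	}
	/* IOMMU_PASID_ARRAY_DOMAIN: the domain pointer itself was stored. */
	return xa_untag_pointer(entry);
}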
Link: https://patch.msgid.link/r/20250226011849.5102-4-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit e1ea9d30d84c65e96eba27b240be5b6798350490) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 21bdae13d9636..d0bb013743913 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -45,6 +45,9 @@ static unsigned int iommu_def_domain_type __read_mostly; static bool iommu_dma_strict __read_mostly = IS_ENABLED(CONFIG_IOMMU_DEFAULT_DMA_STRICT); static u32 iommu_cmd_line __read_mostly; +/* Tags used with xa_tag_pointer() in group->pasid_array */ +enum { IOMMU_PASID_ARRAY_DOMAIN = 0, IOMMU_PASID_ARRAY_HANDLE = 1 }; + struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; @@ -2324,6 +2327,17 @@ struct iommu_domain *iommu_get_dma_domain(struct device *dev) return dev->iommu_group->default_domain; } +static void *iommu_make_pasid_array_entry(struct iommu_domain *domain, + struct iommu_attach_handle *handle) +{ + if (handle) { + handle->domain = domain; + return xa_tag_pointer(handle, IOMMU_PASID_ARRAY_HANDLE); + } + + return xa_tag_pointer(domain, IOMMU_PASID_ARRAY_DOMAIN); +} + static int __iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { @@ -3535,6 +3549,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, struct iommu_group *group = dev->iommu_group; struct group_device *device; const struct iommu_ops *ops; + void *entry; int ret; if (!group) @@ -3564,10 +3579,9 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } } - if (handle) - handle->domain = domain; + entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, handle, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); if (ret) goto out_unlock; @@ -3647,13 +3661,17 @@ struct iommu_attach_handle * iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) { struct iommu_attach_handle *handle; + void *entry; xa_lock(&group->pasid_array); - handle = xa_load(&group->pasid_array, pasid); - if (!handle) + entry = xa_load(&group->pasid_array, pasid); + if (!entry || xa_pointer_tag(entry) != IOMMU_PASID_ARRAY_HANDLE) { handle = ERR_PTR(-ENOENT); - else if (type && handle->domain->type != type) - handle = ERR_PTR(-EBUSY); + } else { + handle = xa_untag_pointer(entry); + if (type && handle->domain->type != type) + handle = ERR_PTR(-EBUSY); + } xa_unlock(&group->pasid_array); return handle; @@ -3676,14 +3694,15 @@ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, struct iommu_attach_handle *handle) { + void *entry; int ret; if (!handle) return -EINVAL; mutex_lock(&group->mutex); - handle->domain = domain; - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + entry = iommu_make_pasid_array_entry(domain, handle); + ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); if (ret) goto err_unlock; @@ -3736,14 +3755,14 @@ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle) { - void *curr; + void *curr, *entry; int ret; if (!new_domain || !handle) return -EINVAL; mutex_lock(&group->mutex); - 
handle->domain = new_domain; + entry = iommu_make_pasid_array_entry(new_domain, handle); ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); if (ret) goto err_unlock; @@ -3752,7 +3771,7 @@ int iommu_replace_group_handle(struct iommu_group *group, if (ret) goto err_release; - curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); WARN_ON(xa_is_err(curr)); mutex_unlock(&group->mutex); From 11c146fe0ee81c7f15be15613aa27b8e4f5f5bba Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 25 Feb 2025 17:18:49 -0800 Subject: [PATCH 012/147] iommu: Swap the order of setting group->pasid_array and calling attach op of iommu drivers The current implementation stores the entry in group->pasid_array before the underlying iommu driver has successfully set the new domain. This can lead to issues where PRIs are received on the new domain before the attach operation is completed. This patch swaps the order of operations to ensure that the domain is set in the underlying iommu driver before updating the group->pasid_array. Link: https://patch.msgid.link/r/20250226011849.5102-5-yi.l.liu@intel.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 5e9f822c9c683ae884fa5e71df41d1647b2876c6) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 48 ++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index d0bb013743913..1fc751263898e 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3581,13 +3581,29 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, pasid, entry, GFP_KERNEL); + /* + * Entry present is a failure case. Use xa_insert() instead of + * xa_reserve(). + */ + ret = xa_insert(&group->pasid_array, pasid, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) goto out_unlock; ret = __iommu_set_group_pasid(domain, group, pasid); - if (ret) - xa_erase(&group->pasid_array, pasid); + if (ret) { + xa_release(&group->pasid_array, pasid); + goto out_unlock; + } + + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail. The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + pasid, entry, GFP_KERNEL))); + out_unlock: mutex_unlock(&group->mutex); return ret; @@ -3702,19 +3718,27 @@ int iommu_attach_group_handle(struct iommu_domain *domain, mutex_lock(&group->mutex); entry = iommu_make_pasid_array_entry(domain, handle); - ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, entry, GFP_KERNEL); + ret = xa_insert(&group->pasid_array, + IOMMU_NO_PASID, XA_ZERO_ENTRY, GFP_KERNEL); if (ret) - goto err_unlock; + goto out_unlock; ret = __iommu_attach_group(domain, group); - if (ret) - goto err_erase; - mutex_unlock(&group->mutex); + if (ret) { + xa_release(&group->pasid_array, IOMMU_NO_PASID); + goto out_unlock; + } - return 0; -err_erase: - xa_erase(&group->pasid_array, IOMMU_NO_PASID); -err_unlock: + /* + * The xa_insert() above reserved the memory, and the group->mutex is + * held, this cannot fail.
The new domain cannot be visible until the + * operation succeeds as we cannot tolerate PRIs becoming concurrently + * queued and then failing attach. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + IOMMU_NO_PASID, entry, GFP_KERNEL))); + +out_unlock: mutex_unlock(&group->mutex); return ret; } From d083a790898bc585f2d03741c7da77774122e8f3 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 27 Feb 2025 14:47:47 +0000 Subject: [PATCH 013/147] iommu: Unexport iommu_fwspec_free() The drivers doing their own fwspec parsing have no need to call iommu_fwspec_free() since fwspecs were moved into dev_iommu, as returning an error from .probe_device will tear down the whole lot anyway. Move it into the private interface now that it only serves for of_iommu to clean up in an error case. I have no idea what mtk_v1 was doing in effectively guaranteeing a NULL fwspec would be dereferenced if no "iommus" DT property was found, so add a check for that to at least make the code look sane. Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/36e245489361de2d13db22a510fa5c79e7126278.1740667667.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit 29c6e1c2b923b43e8082bba5c6675185a8fe305a) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 1 - drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 1 - drivers/iommu/mtk_iommu_v1.c | 14 ++++---------- drivers/iommu/tegra-smmu.c | 1 - include/linux/iommu.h | 5 ----- 6 files changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index de205a34ffc6d..ea9f8e484e354 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1486,7 +1486,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) out_cfg_free: kfree(cfg); out_free: - iommu_fwspec_free(dev); return ERR_PTR(ret); } diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index b4508423e13ba..fed55fbfe99cb 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -24,6 +24,8 @@ static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwsp return iommu_ops_from_fwnode(fwspec ? fwspec->iommu_fwnode : NULL); } +void iommu_fwspec_free(struct device *dev); + int iommu_device_register_bus(struct iommu_device *iommu, const struct iommu_ops *ops, const struct bus_type *bus, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1fc751263898e..71401e5a1c263 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3014,7 +3014,6 @@ void iommu_fwspec_free(struct device *dev) dev_iommu_fwspec_set(dev, NULL); } } -EXPORT_SYMBOL_GPL(iommu_fwspec_free); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index a565b9e40f4a6..3e724e7f10f02 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -446,22 +446,13 @@ static int mtk_iommu_v1_create_mapping(struct device *dev, static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) { - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct iommu_fwspec *fwspec = NULL; struct of_phandle_args iommu_spec; struct mtk_iommu_v1_data *data; int err, idx = 0, larbid, larbidx; struct device_link *link; struct device *larbdev; - /* - * In the deferred case, free the existed fwspec. - * Always initialize the fwspec internally. 
- */ - if (fwspec) { - iommu_fwspec_free(dev); - fwspec = dev_iommu_fwspec_get(dev); - } - while (!of_parse_phandle_with_args(dev->of_node, "iommus", "#iommu-cells", idx, &iommu_spec)) { @@ -476,6 +467,9 @@ static struct iommu_device *mtk_iommu_v1_probe_device(struct device *dev) idx++; } + if (!fwspec) + return ERR_PTR(-ENODEV); + data = dev_iommu_priv_get(dev); /* Link the consumer device with the smi-larb device(supplier) */ diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 7f633bb5efef1..69d353e1df843 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -846,7 +846,6 @@ static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, err = ops->of_xlate(dev, args); if (err < 0) { dev_err(dev, "failed to parse SW group ID: %d\n", err); - iommu_fwspec_free(dev); return err; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 20a133d2dba0a..f17555e715f57 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1097,7 +1097,6 @@ struct iommu_mm_data { }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); -void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) @@ -1408,10 +1407,6 @@ static inline int iommu_fwspec_init(struct device *dev, return -ENODEV; } -static inline void iommu_fwspec_free(struct device *dev) -{ -} - static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) { From 432264ef30b31447abb340111736d566ebeb829a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 28 Feb 2025 15:46:32 +0000 Subject: [PATCH 014/147] iommu: Keep dev->iommu state consistent At the moment, if of_iommu_configure() allocates dev->iommu itself via iommu_fwspec_init(), then suffers a DT parsing failure, it cleans up the fwspec but leaves the empty dev_iommu hanging around. So far this is benign (if a tiny bit wasteful), but we'd like to be able to reason about dev->iommu having a consistent and unambiguous lifecycle. Thus make sure that the of_iommu cleanup undoes precisely whatever it did. 
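A condensed sketch of the invariant the hunk below enforces, simplified from the actual of_iommu_configure() flow:

	/* Snapshot whether dev->iommu already existed before this call. */
	bool dev_iommu_present = dev->iommu;

	err = of_iommu_configure_device(master_np, dev, id);

	if (err && dev_iommu_present)
		iommu_fwspec_free(dev);	/* only the fwspec was added here */
	else if (err && dev->iommu)
		dev_iommu_free(dev);	/* dev->iommu was created here: undo it all */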
Signed-off-by: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/d219663a3f23001f23d520a883ac622d70b4e642.1740753261.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit 3832862eb9c4dfa0e80b2522bfaedbc8a43de97d) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu-priv.h | 2 ++ drivers/iommu/iommu.c | 2 +- drivers/iommu/of_iommu.c | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index fed55fbfe99cb..05fa6e682e88d 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -17,6 +17,8 @@ static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) return dev->iommu->iommu_dev->ops; } +void dev_iommu_free(struct device *dev); + const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline const struct iommu_ops *iommu_fwspec_ops(struct iommu_fwspec *fwspec) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 71401e5a1c263..a95a92d82de06 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -363,7 +363,7 @@ static struct dev_iommu *dev_iommu_get(struct device *dev) return param; } -static void dev_iommu_free(struct device *dev) +void dev_iommu_free(struct device *dev) { struct dev_iommu *param = dev->iommu; diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 97987cd78da93..e10a68b5ffde1 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -116,6 +116,7 @@ static void of_pci_check_device_ats(struct device *dev, struct device_node *np) int of_iommu_configure(struct device *dev, struct device_node *master_np, const u32 *id) { + bool dev_iommu_present; int err; if (!master_np) @@ -127,6 +128,7 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, mutex_unlock(&iommu_probe_device_lock); return 0; } + dev_iommu_present = dev->iommu; /* * We don't currently walk up the tree looking for a parent IOMMU. @@ -147,8 +149,10 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, err = of_iommu_configure_device(master_np, dev, id); } - if (err) + if (err && dev_iommu_present) iommu_fwspec_free(dev); + else if (err && dev->iommu) + dev_iommu_free(dev); mutex_unlock(&iommu_probe_device_lock); if (!err && dev->bus) From b6f92743be6a78bd9dbd696e19ac4bc60d4266fb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:39 -0800 Subject: [PATCH 015/147] irqchip: Have CONFIG_IRQ_MSI_IOMMU be selected by irqchips that need it Currently, IRQ_MSI_IOMMU is selected if DMA_IOMMU is available to provide an implementation for iommu_dma_prepare/compose_msi_msg(). However, it'll make more sense for irqchips that call prepare/compose to select it, and that will trigger all the additional code and data to be compiled into the kernel. If IRQ_MSI_IOMMU is selected with no IOMMU side implementation, then the prepare/compose() will be NOP stubs. If IRQ_MSI_IOMMU is not selected by an irqchip, then the related code on the iommu side is compiled out. 
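For illustration, the shape of the dependency this creates: a hypothetical irqchip whose doorbell sits behind the IOMMU calls iommu_dma_prepare_msi() at IRQ allocation time, so its Kconfig entry would additionally "select IRQ_MSI_IOMMU"; with the option unselected, the call compiles to a stub returning 0. The callback and MY_DOORBELL_PHYS below are made-up names:

#include <linux/iommu.h>
#include <linux/irqdomain.h>
#include <linux/msi.h>

#define MY_DOORBELL_PHYS	0x20200040UL	/* hypothetical doorbell address */

static int my_msi_domain_alloc(struct irq_domain *d, unsigned int virq,
			       unsigned int nr_irqs, void *args)
{
	msi_alloc_info_t *info = args;
	int err;

	/* Ask the IOMMU layer to map the doorbell before the IRQ goes live. */
	err = iommu_dma_prepare_msi(info->desc, MY_DOORBELL_PHYS);
	if (err)
		return err;

	/* ... allocate hwirqs and install chip data here ... */
	return 0;
}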
Link: https://patch.msgid.link/r/a2620f67002c5cdf974e89ca3bf905f5c0817be6.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe (cherry picked from commit 96093fe54f4864b07013cf61b847708233832841) Signed-off-by: Nirmoy Das --- drivers/iommu/Kconfig | 1 - drivers/iommu/dma-iommu.c | 2 ++ drivers/irqchip/Kconfig | 4 ++++ kernel/irq/Kconfig | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index ec1b5e32b9725..5124e7431fe31 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -154,7 +154,6 @@ config IOMMU_DMA select DMA_OPS_HELPERS select IOMMU_API select IOMMU_IOVA - select IRQ_MSI_IOMMU select NEED_SG_DMA_LENGTH select NEED_SG_DMA_FLAGS if SWIOTLB diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3b58244e6344a..94263ed2c5644 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -449,8 +449,10 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; +#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) if (domain->sw_msi != iommu_dma_sw_msi) return; +#endif if (!cookie) return; diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index c11b9965c4ad9..64658a1c3aa18 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -28,6 +28,7 @@ config ARM_GIC_V2M select ARM_GIC select IRQ_MSI_LIB select PCI_MSI + select IRQ_MSI_IOMMU config GIC_NON_BANKED bool @@ -38,12 +39,14 @@ config ARM_GIC_V3 select PARTITION_PERCPU select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP select HAVE_ARM_SMCCC_DISCOVERY + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS bool select GENERIC_MSI_IRQ select IRQ_MSI_LIB default ARM_GIC_V3 + select IRQ_MSI_IOMMU config ARM_GIC_V3_ITS_FSL_MC bool @@ -408,6 +411,7 @@ config LS_EXTIRQ config LS_SCFG_MSI def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE + select IRQ_MSI_IOMMU depends on PCI_MSI config PARTITION_PERCPU diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 875f25ed6f710..e126e6ce510e4 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -96,6 +96,7 @@ config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY +# irqchip drivers should select this if they call iommu_dma_prepare_msi() config IRQ_MSI_IOMMU bool From 0b34d70cb07ec71c5c3ff57f13e79fd0df3d6bec Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 19 Feb 2025 17:31:40 -0800 Subject: [PATCH 016/147] iommu: Turn fault_data to iommufd private pointer A "fault_data" was added exclusively for the iommufd_fault_iopf_handler() used by IOPF/PRI use cases, along with the attach_handle. Now, the iommufd version of the sw_msi function will reuse the attach_handle and fault_data for a non-fault case. Rename "fault_data" to "iommufd_hwpt" so as not to confine it to a "fault" case. Move it into a union to be the iommufd private pointer. A following patch will move the iova_cookie to the union for dma-iommu too after the iommufd_sw_msi implementation is added. Since we have two unions now, add some simple comments for readability. 
Link: https://patch.msgid.link/r/ee5039503f28a16590916e9eef28b917e2d1607a.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe (cherry picked from commit 748706d7ca06012621b32851b68136bf33613b9e) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/fault.c | 2 +- drivers/iommu/iommufd/hw_pagetable.c | 2 +- include/linux/iommu.h | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 931a3fbe6e32c..c48d72c9668ca 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -329,7 +329,7 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) struct iommufd_hw_pagetable *hwpt; struct iommufd_fault *fault; - hwpt = group->attach_handle->domain->fault_data; + hwpt = group->attach_handle->domain->iommufd_hwpt; fault = hwpt->fault; spin_lock(&fault->lock); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 598be26a14e28..2641d50f46cf1 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -406,10 +406,10 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; - hwpt->domain->fault_data = hwpt; refcount_inc(&fault->obj.users); iommufd_put_object(ucmd->ictx, &fault->obj); } + hwpt->domain->iommufd_hwpt = hwpt; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f17555e715f57..a8d6a095d6540 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -224,8 +224,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - void *fault_data; - union { + union { /* Pointer usable by owner of the domain */ + struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ + }; + union { /* Fault handler */ struct { iommu_fault_handler_t handler; void *handler_token; From fce7a9115b3cba7357e854902350478168785496 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:41 -0800 Subject: [PATCH 017/147] iommufd: Implement sw_msi support natively iommufd has a model where the iommu_domain can be changed while the VFIO device is attached. In this case, the MSI should continue to work. This corner case has not worked because the dma-iommu implementation of sw_msi is tied to a single domain. Implement the sw_msi mapping directly and use a global per-fd table to associate assigned IOVA to the MSI pages. This allows the MSI pages to be loaded into a domain before it is attached ensuring that MSI is not disrupted. 
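A worked example of the scheme, with assumed numbers (0x8000000 is the customary IOMMU_RESV_SW_MSI base, but the real window comes from the IOMMU driver):

	/*
	 * sw_msi_start = 0x8000000, PAGE_SIZE = 4K:
	 *
	 *   doorbell phys 0xfee00000 -> pgoff 0 -> IOVA 0x8000000
	 *   doorbell phys 0xfe440000 -> pgoff 1 -> IOVA 0x8001000
	 *
	 * Every domain attached through the same fd installs the same
	 * phys->IOVA pairs, so a domain can be replaced under an active
	 * IRQ without the address in the msi_msg changing:
	 *
	 *   iova = map->sw_msi_start + map->pgoff * PAGE_SIZE;
	 */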
Link: https://patch.msgid.link/r/e13d23eeacd67c0a692fc468c85b483f4dd51c57.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 40f5175d0eb77f902ba8e2a5df2a8f3a218c8843) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 161 ++++++++++++++++++++---- drivers/iommu/iommufd/hw_pagetable.c | 3 + drivers/iommu/iommufd/iommufd_private.h | 23 +++- drivers/iommu/iommufd/main.c | 9 ++ 4 files changed, 173 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 3c7800d4ab622..bd50146e2ad06 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "../iommu-priv.h" #include "io_pagetable.h" @@ -293,36 +294,152 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); +/* + * Get a iommufd_sw_msi_map for the msi physical address requested by the irq + * layer. The mapping to IOVA is global to the iommufd file descriptor, every + * domain that is attached to a device using the same MSI parameters will use + * the same IOVA. + */ +static __maybe_unused struct iommufd_sw_msi_map * +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, + phys_addr_t sw_msi_start) +{ + struct iommufd_sw_msi_map *cur; + unsigned int max_pgoff = 0; + + lockdep_assert_held(&ictx->sw_msi_lock); + + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + if (cur->sw_msi_start != sw_msi_start) + continue; + max_pgoff = max(max_pgoff, cur->pgoff + 1); + if (cur->msi_addr == msi_addr) + return cur; + } + + if (ictx->sw_msi_id >= + BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) + return ERR_PTR(-EOVERFLOW); + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) + return ERR_PTR(-ENOMEM); + + cur->sw_msi_start = sw_msi_start; + cur->msi_addr = msi_addr; + cur->pgoff = max_pgoff; + cur->id = ictx->sw_msi_id++; + list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); + return cur; +} + +static int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map) +{ + unsigned long iova; + + lockdep_assert_held(&ictx->sw_msi_lock); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { + int rc; + + rc = iommu_map(hwpt_paging->common.domain, iova, + msi_map->msi_addr, PAGE_SIZE, + IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, + GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); + } + return 0; +} + +/* + * Called by the irq code if the platform translates the MSI address through the + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will + * allocate a fd global iova for the physical page that is the same on all + * domains and devices. 
+ */ +#ifdef CONFIG_IRQ_MSI_IOMMU +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommu_attach_handle *raw_handle; + struct iommufd_attach_handle *handle; + struct iommufd_sw_msi_map *msi_map; + struct iommufd_ctx *ictx; + unsigned long iova; + int rc; + + /* + * It is safe to call iommu_attach_handle_get() here because the iommu + * core code invokes this under the group mutex which also prevents any + * change of the attach handle for the duration of this function. + */ + iommu_group_mutex_assert(dev); + + raw_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(raw_handle)) + return 0; + hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); + + handle = to_iommufd_handle(raw_handle); + /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ + if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; + + ictx = handle->idev->ictx; + guard(mutex)(&ictx->sw_msi_lock); + /* + * The input msi_addr is the exact byte offset of the MSI doorbell, we + * assume the caller has checked that it is contained with a MMIO region + * that is secure to map at PAGE_SIZE. + */ + msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, + msi_addr & PAGE_MASK, + handle->idev->igroup->sw_msi_start); + if (IS_ERR(msi_map)) + return PTR_ERR(msi_map); + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); + if (rc) + return rc; + __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); + return 0; +} +#endif + static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { - phys_addr_t sw_msi_start = igroup->sw_msi_start; - int rc; + struct iommufd_ctx *ictx = igroup->ictx; + struct iommufd_sw_msi_map *cur; + + if (igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; /* - * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to - * call iommu_get_msi_cookie() on its behalf. This is necessary to setup - * the MSI window so iommu_dma_prepare_msi() can install pages into our - * domain after request_irq(). If it is not done interrupts will not - * work on this domain. - * - * FIXME: This is conceptually broken for iommufd since we want to allow - * userspace to change the domains, eg switch from an identity IOAS to a - * DMA IOAS. There is currently no way to create a MSI window that - * matches what the IRQ layer actually expects in a newly created - * domain. + * Install all the MSI pages the device has been using into the domain */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt_paging->common.domain, - sw_msi_start); + guard(mutex)(&ictx->sw_msi_lock); + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + int rc; + + if (cur->sw_msi_start != igroup->sw_msi_start || + !test_bit(cur->id, igroup->required_sw_msi.bitmap)) + continue; + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur); if (rc) return rc; - - /* - * iommu_get_msi_cookie() can only be called once per domain, - * it returns -EBUSY on later calls. 
- */ - hwpt_paging->msi_cookie = true; } return 0; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 2641d50f46cf1..7de6e914232e7 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -156,6 +156,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } } + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* * Set the coherency mode before we do iopt_table_add_domain() as some @@ -251,6 +252,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, goto out_abort; } hwpt->domain->owner = ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; @@ -307,6 +309,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, goto out_abort; } hwpt->domain->owner = viommu->iommu_dev->ops; + iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8e0e3ab647476..246297452a44e 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 +19,22 @@ struct iommu_group; struct iommu_option; struct iommufd_device; +struct iommufd_sw_msi_map { + struct list_head sw_msi_item; + phys_addr_t sw_msi_start; + phys_addr_t msi_addr; + unsigned int pgoff; + unsigned int id; +}; + +/* Bitmap of struct iommufd_sw_msi_map::id */ +struct iommufd_sw_msi_maps { + DECLARE_BITMAP(bitmap, 64); +}; + +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + struct iommufd_ctx { struct file *file; struct xarray objects; @@ -26,6 +42,10 @@ struct iommufd_ctx { wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; + struct mutex sw_msi_lock; + struct list_head sw_msi_list; + unsigned int sw_msi_id; + u8 account_mode; /* Compatibility with VFIO no iommu */ u8 no_iommu_mode; @@ -283,10 +303,10 @@ struct iommufd_hwpt_paging { struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; - bool msi_cookie : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; + struct iommufd_sw_msi_maps present_sw_msi; }; struct iommufd_hwpt_nested { @@ -383,6 +403,7 @@ struct iommufd_group { struct iommu_group *group; struct iommufd_hw_pagetable *hwpt; struct list_head device_list; + struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index ccf616462a1cb..b6fa9fd11bc18 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -227,6 +227,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init(&ictx->groups); ictx->file = filp; init_waitqueue_head(&ictx->destroy_wait); + mutex_init(&ictx->sw_msi_lock); + INIT_LIST_HEAD(&ictx->sw_msi_list); filp->private_data = ictx; return 0; } @@ -234,6 +236,8 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) static int iommufd_fops_release(struct inode *inode, struct file *filp) { struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_sw_msi_map *next; + struct iommufd_sw_msi_map *cur; struct iommufd_object *obj; /* @@ -262,6 +266,11 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) break; } WARN_ON(!xa_empty(&ictx->groups)); + + 
mutex_destroy(&ictx->sw_msi_lock); + list_for_each_entry_safe(cur, next, &ictx->sw_msi_list, sw_msi_item) + kfree(cur); + kfree(ictx); return 0; } From 74fde917979ed39e81d5e2ea9401edde0d4b169f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:19 -0700 Subject: [PATCH 018/147] iommufd/fault: Move two fault functions out of the header There is no need to keep them in the header. The vEVENTQ version of these two functions will turn out to be a different implementation and will not share with this fault version. Thus, move them out of the header. Link: https://patch.msgid.link/r/7eebe32f3d354799f5e28128c693c3c284740b21.1741719725.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit dbf00d7d89125802113a9d8ea4c77ab9f4de8866) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/fault.c | 25 +++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 25 ------------------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index c48d72c9668ca..29e3a97c73c68 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -138,6 +138,31 @@ static void iommufd_compose_fault_message(struct iommu_fault *fault, hwpt_fault->cookie = cookie; } +/* Fetch the first node out of the fault->deliver list */ +static struct iopf_group * +iommufd_fault_deliver_fetch(struct iommufd_fault *fault) +{ + struct list_head *list = &fault->deliver; + struct iopf_group *group = NULL; + + spin_lock(&fault->lock); + if (!list_empty(list)) { + group = list_first_entry(list, struct iopf_group, node); + list_del(&group->node); + } + spin_unlock(&fault->lock); + return group; +} + +/* Restore a node back to the head of the fault->deliver list */ +static void iommufd_fault_deliver_restore(struct iommufd_fault *fault, + struct iopf_group *group) +{ + spin_lock(&fault->lock); + list_add(&group->node, &fault->deliver); + spin_unlock(&fault->lock); +} + static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 246297452a44e..1c58f5fe17b42 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -472,31 +472,6 @@ struct iommufd_fault { struct wait_queue_head wait_queue; }; -/* Fetch the first node out of the fault->deliver list */ -static inline struct iopf_group * -iommufd_fault_deliver_fetch(struct iommufd_fault *fault) -{ - struct list_head *list = &fault->deliver; - struct iopf_group *group = NULL; - - spin_lock(&fault->lock); - if (!list_empty(list)) { - group = list_first_entry(list, struct iopf_group, node); - list_del(&group->node); - } - spin_unlock(&fault->lock); - return group; -} - -/* Restore a node back to the head of the fault->deliver list */ -static inline void iommufd_fault_deliver_restore(struct iommufd_fault *fault, - struct iopf_group *group) -{ - spin_lock(&fault->lock); - list_add(&group->node, &fault->deliver); - spin_unlock(&fault->lock); -} - struct iommufd_attach_handle { struct iommu_attach_handle handle; struct iommufd_device *idev; From 8aa8b03d56032f76378b0a158290857ed7b46d7f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:20 -0700 Subject: [PATCH 019/147] iommufd/fault: Add an iommufd_fault_init() helper The infrastructure of a 
fault object will be shared with a new vEVENTQ object in a following change. Add an iommufd_fault_init helper and an INIT_EVENTQ_FOPS macro for a vEVENTQ allocator to use too. Reorder the iommufd_ctx_get and refcount_inc to keep them symmetrical with iommufd_fault_fops_release(). Since the new vEVENTQ doesn't need "response" and its "mutex", keep the xa_init_flags and mutex_init in their original locations. Link: https://patch.msgid.link/r/a9522c521909baeb1bd843950b2490478f3d06e0.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 927dabc9aa4dbebf92b34da9b7acd7d8d5c6331b) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/fault.c | 70 +++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 29e3a97c73c68..5d8de98732b6f 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -280,20 +280,49 @@ static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) return 0; } -static const struct file_operations iommufd_fault_fops = { - .owner = THIS_MODULE, - .open = nonseekable_open, - .read = iommufd_fault_fops_read, - .write = iommufd_fault_fops_write, - .poll = iommufd_fault_fops_poll, - .release = iommufd_fault_fops_release, -}; +#define INIT_FAULT_FOPS(read_op, write_op) \ + ((const struct file_operations){ \ + .owner = THIS_MODULE, \ + .open = nonseekable_open, \ + .read = read_op, \ + .write = write_op, \ + .poll = iommufd_fault_fops_poll, \ + .release = iommufd_fault_fops_release, \ + }) + +static int iommufd_fault_init(struct iommufd_fault *fault, char *name, + struct iommufd_ctx *ictx, + const struct file_operations *fops) +{ + struct file *filep; + int fdno; + + spin_lock_init(&fault->lock); + INIT_LIST_HEAD(&fault->deliver); + init_waitqueue_head(&fault->wait_queue); + + filep = anon_inode_getfile(name, fops, fault, O_RDWR); + if (IS_ERR(filep)) + return PTR_ERR(filep); + + fault->ictx = ictx; + iommufd_ctx_get(fault->ictx); + fault->filep = filep; + refcount_inc(&fault->obj.users); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) + fput(filep); + return fdno; +} + +static const struct file_operations iommufd_fault_fops = + INIT_FAULT_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) { struct iommu_fault_alloc *cmd = ucmd->cmd; struct iommufd_fault *fault; - struct file *filep; int fdno; int rc; @@ -304,28 +333,14 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) if (IS_ERR(fault)) return PTR_ERR(fault); - fault->ictx = ucmd->ictx; - INIT_LIST_HEAD(&fault->deliver); xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); mutex_init(&fault->mutex); - spin_lock_init(&fault->lock); - init_waitqueue_head(&fault->wait_queue); - - filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops, - fault, O_RDWR); - if (IS_ERR(filep)) { - rc = PTR_ERR(filep); - goto out_abort; - } - refcount_inc(&fault->obj.users); - iommufd_ctx_get(fault->ictx); - fault->filep = filep; - - fdno = get_unused_fd_flags(O_CLOEXEC); + fdno = iommufd_fault_init(fault, "[iommufd-pgfault]", ucmd->ictx, + &iommufd_fault_fops); if (fdno < 0) { rc = fdno; - goto out_fput; + goto out_abort; } cmd->out_fault_id = fault->obj.id; @@ -341,8 +356,7 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) return 0; out_put_fdno: put_unused_fd(fdno); -out_fput: -
fput(filep); + fput(fault->filep); out_abort: iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); From b3068adf89605badf4653a642419ea3501bc2a6b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:21 -0700 Subject: [PATCH 020/147] iommufd: Abstract an iommufd_eventq from iommufd_fault The fault object was designed exclusively for hwpt's IO page faults (PRI). But its queue implementation can be reused for other purposes too, such as hardware IRQ and event injections to user space. Meanwhile, a fault object holds a list of faults. So it's more accurate to call it a "fault queue". Combining the reusing idea above, abstract a new iommufd_eventq as a common structure embedded into struct iommufd_fault, similar to hwpt_paging holding a common hwpt. Add a common iommufd_eventq_ops and iommufd_eventq_init to prepare for an IOMMUFD_OBJ_VEVENTQ (vIOMMU Event Queue). Link: https://patch.msgid.link/r/e7336a857954209aabb466e0694aab323da95d90.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 5426a78bebefbb32643ee85320e977f3971c5521) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/fault.c | 111 +++++++++++++----------- drivers/iommu/iommufd/hw_pagetable.c | 6 +- drivers/iommu/iommufd/iommufd_private.h | 28 ++++-- 3 files changed, 82 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 5d8de98732b6f..f8e60e5879d18 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -17,6 +17,8 @@ #include "../iommu-priv.h" #include "iommufd_private.h" +/* IOMMUFD_OBJ_FAULT Functions */ + int iommufd_fault_iopf_enable(struct iommufd_device *idev) { struct device *dev = idev->dev; @@ -73,13 +75,13 @@ void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, INIT_LIST_HEAD(&free_list); mutex_lock(&fault->mutex); - spin_lock(&fault->lock); - list_for_each_entry_safe(group, next, &fault->deliver, node) { + spin_lock(&fault->common.lock); + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { if (group->attach_handle != &handle->handle) continue; list_move(&group->node, &free_list); } - spin_unlock(&fault->lock); + spin_unlock(&fault->common.lock); list_for_each_entry_safe(group, next, &free_list, node) { list_del(&group->node); @@ -99,7 +101,9 @@ void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, void iommufd_fault_destroy(struct iommufd_object *obj) { - struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); + struct iommufd_eventq *eventq = + container_of(obj, struct iommufd_eventq, obj); + struct iommufd_fault *fault = eventq_to_fault(eventq); struct iopf_group *group, *next; unsigned long index; @@ -109,7 +113,7 @@ void iommufd_fault_destroy(struct iommufd_object *obj) * accessing this pointer. Therefore, acquiring the mutex here * is unnecessary. 
*/ - list_for_each_entry_safe(group, next, &fault->deliver, node) { + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { list_del(&group->node); iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); iopf_free_group(group); @@ -142,15 +146,15 @@ static void iommufd_compose_fault_message(struct iommu_fault *fault, static struct iopf_group * iommufd_fault_deliver_fetch(struct iommufd_fault *fault) { - struct list_head *list = &fault->deliver; + struct list_head *list = &fault->common.deliver; struct iopf_group *group = NULL; - spin_lock(&fault->lock); + spin_lock(&fault->common.lock); if (!list_empty(list)) { group = list_first_entry(list, struct iopf_group, node); list_del(&group->node); } - spin_unlock(&fault->lock); + spin_unlock(&fault->common.lock); return group; } @@ -158,16 +162,17 @@ iommufd_fault_deliver_fetch(struct iommufd_fault *fault) static void iommufd_fault_deliver_restore(struct iommufd_fault *fault, struct iopf_group *group) { - spin_lock(&fault->lock); - list_add(&group->node, &fault->deliver); - spin_unlock(&fault->lock); + spin_lock(&fault->common.lock); + list_add(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); } static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { size_t fault_size = sizeof(struct iommu_hwpt_pgfault); - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); struct iommu_hwpt_pgfault data = {}; struct iommufd_device *idev; struct iopf_group *group; @@ -216,7 +221,8 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b size_t count, loff_t *ppos) { size_t response_size = sizeof(struct iommu_hwpt_page_response); - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; + struct iommufd_fault *fault = eventq_to_fault(eventq); struct iommu_hwpt_page_response response; struct iopf_group *group; size_t done = 0; @@ -256,59 +262,61 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b return done == 0 ? 
rc : done; } -static __poll_t iommufd_fault_fops_poll(struct file *filep, - struct poll_table_struct *wait) +/* Common Event Queue Functions */ + +static __poll_t iommufd_eventq_fops_poll(struct file *filep, + struct poll_table_struct *wait) { - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; __poll_t pollflags = EPOLLOUT; - poll_wait(filep, &fault->wait_queue, wait); - spin_lock(&fault->lock); - if (!list_empty(&fault->deliver)) + poll_wait(filep, &eventq->wait_queue, wait); + spin_lock(&eventq->lock); + if (!list_empty(&eventq->deliver)) pollflags |= EPOLLIN | EPOLLRDNORM; - spin_unlock(&fault->lock); + spin_unlock(&eventq->lock); return pollflags; } -static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) +static int iommufd_eventq_fops_release(struct inode *inode, struct file *filep) { - struct iommufd_fault *fault = filep->private_data; + struct iommufd_eventq *eventq = filep->private_data; - refcount_dec(&fault->obj.users); - iommufd_ctx_put(fault->ictx); + refcount_dec(&eventq->obj.users); + iommufd_ctx_put(eventq->ictx); return 0; } -#define INIT_FAULT_FOPS(read_op, write_op) \ +#define INIT_EVENTQ_FOPS(read_op, write_op) \ ((const struct file_operations){ \ .owner = THIS_MODULE, \ .open = nonseekable_open, \ .read = read_op, \ .write = write_op, \ - .poll = iommufd_fault_fops_poll, \ - .release = iommufd_fault_fops_release, \ + .poll = iommufd_eventq_fops_poll, \ + .release = iommufd_eventq_fops_release, \ }) -static int iommufd_fault_init(struct iommufd_fault *fault, char *name, - struct iommufd_ctx *ictx, - const struct file_operations *fops) +static int iommufd_eventq_init(struct iommufd_eventq *eventq, char *name, + struct iommufd_ctx *ictx, + const struct file_operations *fops) { struct file *filep; int fdno; - spin_lock_init(&fault->lock); - INIT_LIST_HEAD(&fault->deliver); - init_waitqueue_head(&fault->wait_queue); + spin_lock_init(&eventq->lock); + INIT_LIST_HEAD(&eventq->deliver); + init_waitqueue_head(&eventq->wait_queue); - filep = anon_inode_getfile(name, fops, fault, O_RDWR); + filep = anon_inode_getfile(name, fops, eventq, O_RDWR); if (IS_ERR(filep)) return PTR_ERR(filep); - fault->ictx = ictx; - iommufd_ctx_get(fault->ictx); - fault->filep = filep; - refcount_inc(&fault->obj.users); + eventq->ictx = ictx; + iommufd_ctx_get(eventq->ictx); + eventq->filep = filep; + refcount_inc(&eventq->obj.users); fdno = get_unused_fd_flags(O_CLOEXEC); if (fdno < 0) @@ -317,7 +325,7 @@ static int iommufd_fault_init(struct iommufd_fault *fault, char *name, } static const struct file_operations iommufd_fault_fops = - INIT_FAULT_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); + INIT_EVENTQ_FOPS(iommufd_fault_fops_read, iommufd_fault_fops_write); int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) { @@ -329,36 +337,37 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) if (cmd->flags) return -EOPNOTSUPP; - fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT); + fault = __iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT, + common.obj); if (IS_ERR(fault)) return PTR_ERR(fault); xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); mutex_init(&fault->mutex); - fdno = iommufd_fault_init(fault, "[iommufd-pgfault]", ucmd->ictx, - &iommufd_fault_fops); + fdno = iommufd_eventq_init(&fault->common, "[iommufd-pgfault]", + ucmd->ictx, &iommufd_fault_fops); if (fdno < 0) { rc = fdno; goto out_abort; } - cmd->out_fault_id = fault->obj.id; + cmd->out_fault_id = fault->common.obj.id; 
cmd->out_fault_fd = fdno; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_put_fdno; - iommufd_object_finalize(ucmd->ictx, &fault->obj); + iommufd_object_finalize(ucmd->ictx, &fault->common.obj); - fd_install(fdno, fault->filep); + fd_install(fdno, fault->common.filep); return 0; out_put_fdno: put_unused_fd(fdno); - fput(fault->filep); + fput(fault->common.filep); out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); + iommufd_object_abort_and_destroy(ucmd->ictx, &fault->common.obj); return rc; } @@ -371,11 +380,11 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) hwpt = group->attach_handle->domain->iommufd_hwpt; fault = hwpt->fault; - spin_lock(&fault->lock); - list_add_tail(&group->node, &fault->deliver); - spin_unlock(&fault->lock); + spin_lock(&fault->common.lock); + list_add_tail(&group->node, &fault->common.deliver); + spin_unlock(&fault->common.lock); - wake_up_interruptible(&fault->wait_queue); + wake_up_interruptible(&fault->common.wait_queue); return 0; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 7de6e914232e7..006425e7f6096 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -14,7 +14,7 @@ static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) iommu_domain_free(hwpt->domain); if (hwpt->fault) - refcount_dec(&hwpt->fault->obj.users); + refcount_dec(&hwpt->fault->common.obj.users); } void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) @@ -409,8 +409,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } hwpt->fault = fault; hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; - refcount_inc(&fault->obj.users); - iommufd_put_object(ucmd->ictx, &fault->obj); + refcount_inc(&fault->common.obj.users); + iommufd_put_object(ucmd->ictx, &fault->common.obj); } hwpt->domain->iommufd_hwpt = hwpt; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 1c58f5fe17b42..44fb30af10b03 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -454,20 +454,13 @@ void iopt_remove_access(struct io_pagetable *iopt, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); -/* - * An iommufd_fault object represents an interface to deliver I/O page faults - * to the user space. These objects are created/destroyed by the user space and - * associated with hardware page table objects during page-table allocation. - */ -struct iommufd_fault { +struct iommufd_eventq { struct iommufd_object obj; struct iommufd_ctx *ictx; struct file *filep; spinlock_t lock; /* protects the deliver list */ struct list_head deliver; - struct mutex mutex; /* serializes response flows */ - struct xarray response; struct wait_queue_head wait_queue; }; @@ -480,12 +473,29 @@ struct iommufd_attach_handle { /* Convert an iommu attach handle to iommufd handle. */ #define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) +/* + * An iommufd_fault object represents an interface to deliver I/O page faults + * to the user space. These objects are created/destroyed by the user space and + * associated with hardware page table objects during page-table allocation. 
+ */ +struct iommufd_fault { + struct iommufd_eventq common; + struct mutex mutex; /* serializes response flows */ + struct xarray response; +}; + +static inline struct iommufd_fault * +eventq_to_fault(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_fault, common); +} + static inline struct iommufd_fault * iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_FAULT), - struct iommufd_fault, obj); + struct iommufd_fault, common.obj); } int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); From 840eabee09ab3fdfb4f144bdb755493bf7ecb64b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:22 -0700 Subject: [PATCH 021/147] iommufd: Rename fault.c to eventq.c Rename the file, aligning with the new eventq object. Link: https://patch.msgid.link/r/d726397e2d08028e25a1cb6eb9febefac35a32ba.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 0507f337fc0c3a10f802b42834e6532edcf605be) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/Makefile | 2 +- drivers/iommu/iommufd/{fault.c => eventq.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename drivers/iommu/iommufd/{fault.c => eventq.c} (100%) diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index cb784da6cddca..71d692c9a8f49 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ device.o \ - fault.o \ + eventq.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/eventq.c similarity index 100% rename from drivers/iommu/iommufd/fault.c rename to drivers/iommu/iommufd/eventq.c From 3bee6ff85c894fb2ec883196660c71d32b21e13c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:23 -0700 Subject: [PATCH 022/147] iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC Introduce a new IOMMUFD_OBJ_VEVENTQ object for vIOMMU Event Queue that provides user space (VMM) another FD to read the vIOMMU Events. Allow a vIOMMU object to allocate vEVENTQs, with a condition that each vIOMMU can only have one single vEVENTQ per type. Add iommufd_veventq_alloc() with iommufd_veventq_ops for the new ioctl. Link: https://patch.msgid.link/r/21acf0751dd5c93846935ee06f93b9c65eff5e04.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit e36ba5ab808ef6237c3148d469c8238674230e2b) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/eventq.c | 209 +++++++++++++++++++++++- drivers/iommu/iommufd/iommufd_private.h | 82 ++++++++++ drivers/iommu/iommufd/main.c | 7 + drivers/iommu/iommufd/viommu.c | 2 + include/linux/iommufd.h | 3 + include/uapi/linux/iommufd.h | 82 ++++++++++ 6 files changed, 384 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c index f8e60e5879d18..4c43ace8c725d 100644 --- a/drivers/iommu/iommufd/eventq.c +++ b/drivers/iommu/iommufd/eventq.c @@ -262,13 +262,148 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b return done == 0 ? 
rc : done;
 }
 
+/* IOMMUFD_OBJ_VEVENTQ Functions */
+
+void iommufd_veventq_abort(struct iommufd_object *obj)
+{
+	struct iommufd_eventq *eventq =
+		container_of(obj, struct iommufd_eventq, obj);
+	struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
+	struct iommufd_viommu *viommu = veventq->viommu;
+	struct iommufd_vevent *cur, *next;
+
+	lockdep_assert_held_write(&viommu->veventqs_rwsem);
+
+	list_for_each_entry_safe(cur, next, &eventq->deliver, node) {
+		list_del(&cur->node);
+		if (cur != &veventq->lost_events_header)
+			kfree(cur);
+	}
+
+	refcount_dec(&viommu->obj.users);
+	list_del(&veventq->node);
+}
+
+void iommufd_veventq_destroy(struct iommufd_object *obj)
+{
+	struct iommufd_veventq *veventq = eventq_to_veventq(
+		container_of(obj, struct iommufd_eventq, obj));
+
+	down_write(&veventq->viommu->veventqs_rwsem);
+	iommufd_veventq_abort(obj);
+	up_write(&veventq->viommu->veventqs_rwsem);
+}
+
+static struct iommufd_vevent *
+iommufd_veventq_deliver_fetch(struct iommufd_veventq *veventq)
+{
+	struct iommufd_eventq *eventq = &veventq->common;
+	struct list_head *list = &eventq->deliver;
+	struct iommufd_vevent *vevent = NULL;
+
+	spin_lock(&eventq->lock);
+	if (!list_empty(list)) {
+		struct iommufd_vevent *next;
+
+		next = list_first_entry(list, struct iommufd_vevent, node);
+		/* Make a copy of the lost_events_header for copy_to_user */
+		if (next == &veventq->lost_events_header) {
+			vevent = kzalloc(sizeof(*vevent), GFP_ATOMIC);
+			if (!vevent)
+				goto out_unlock;
+		}
+		list_del(&next->node);
+		if (vevent)
+			memcpy(vevent, next, sizeof(*vevent));
+		else
+			vevent = next;
+	}
+out_unlock:
+	spin_unlock(&eventq->lock);
+	return vevent;
+}
+
+static void iommufd_veventq_deliver_restore(struct iommufd_veventq *veventq,
+					    struct iommufd_vevent *vevent)
+{
+	struct iommufd_eventq *eventq = &veventq->common;
+	struct list_head *list = &eventq->deliver;
+
+	spin_lock(&eventq->lock);
+	if (vevent_for_lost_events_header(vevent)) {
+		/* Remove the copy of the lost_events_header */
+		kfree(vevent);
+		vevent = NULL;
+		/* An empty list needs the lost_events_header back */
+		if (list_empty(list))
+			vevent = &veventq->lost_events_header;
+	}
+	if (vevent)
+		list_add(&vevent->node, list);
+	spin_unlock(&eventq->lock);
+}
+
+static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf,
+					 size_t count, loff_t *ppos)
+{
+	struct iommufd_eventq *eventq = filep->private_data;
+	struct iommufd_veventq *veventq = eventq_to_veventq(eventq);
+	struct iommufd_vevent_header *hdr;
+	struct iommufd_vevent *cur;
+	size_t done = 0;
+	int rc = 0;
+
+	if (*ppos)
+		return -ESPIPE;
+
+	while ((cur = iommufd_veventq_deliver_fetch(veventq))) {
+		/* Validate the remaining bytes against the header size */
+		if (done >= count || sizeof(*hdr) > count - done) {
+			iommufd_veventq_deliver_restore(veventq, cur);
+			break;
+		}
+		hdr = &cur->header;
+
+		/* For a normal vEVENT, validate against the full size */
+		if (!vevent_for_lost_events_header(cur) &&
+		    sizeof(*hdr) + cur->data_len > count - done) {
+			iommufd_veventq_deliver_restore(veventq, cur);
+			break;
+		}
+
+		if (copy_to_user(buf + done, hdr, sizeof(*hdr))) {
+			iommufd_veventq_deliver_restore(veventq, cur);
+			rc = -EFAULT;
+			break;
+		}
+		done += sizeof(*hdr);
+
+		if (cur->data_len &&
+		    copy_to_user(buf + done, cur->event_data, cur->data_len)) {
+			iommufd_veventq_deliver_restore(veventq, cur);
+			rc = -EFAULT;
+			break;
+		}
+		spin_lock(&eventq->lock);
+		veventq->num_events--;
+		spin_unlock(&eventq->lock);
+		done += cur->data_len;
+		kfree(cur);
+	}
+
+
return done == 0 ? rc : done; +} + /* Common Event Queue Functions */ static __poll_t iommufd_eventq_fops_poll(struct file *filep, struct poll_table_struct *wait) { struct iommufd_eventq *eventq = filep->private_data; - __poll_t pollflags = EPOLLOUT; + __poll_t pollflags = 0; + + if (eventq->obj.type == IOMMUFD_OBJ_FAULT) + pollflags |= EPOLLOUT; poll_wait(filep, &eventq->wait_queue, wait); spin_lock(&eventq->lock); @@ -388,3 +523,75 @@ int iommufd_fault_iopf_handler(struct iopf_group *group) return 0; } + +static const struct file_operations iommufd_veventq_fops = + INIT_EVENTQ_FOPS(iommufd_veventq_fops_read, NULL); + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_veventq_alloc *cmd = ucmd->cmd; + struct iommufd_veventq *veventq; + struct iommufd_viommu *viommu; + int fdno; + int rc; + + if (cmd->flags || cmd->__reserved || + cmd->type == IOMMU_VEVENTQ_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->veventq_depth) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + down_write(&viommu->veventqs_rwsem); + + if (iommufd_viommu_find_veventq(viommu, cmd->type)) { + rc = -EEXIST; + goto out_unlock_veventqs; + } + + veventq = __iommufd_object_alloc(ucmd->ictx, veventq, + IOMMUFD_OBJ_VEVENTQ, common.obj); + if (IS_ERR(veventq)) { + rc = PTR_ERR(veventq); + goto out_unlock_veventqs; + } + + veventq->type = cmd->type; + veventq->viommu = viommu; + refcount_inc(&viommu->obj.users); + veventq->depth = cmd->veventq_depth; + list_add_tail(&veventq->node, &viommu->veventqs); + veventq->lost_events_header.header.flags = + IOMMU_VEVENTQ_FLAG_LOST_EVENTS; + + fdno = iommufd_eventq_init(&veventq->common, "[iommufd-viommu-event]", + ucmd->ictx, &iommufd_veventq_fops); + if (fdno < 0) { + rc = fdno; + goto out_abort; + } + + cmd->out_veventq_id = veventq->common.obj.id; + cmd->out_veventq_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + + iommufd_object_finalize(ucmd->ictx, &veventq->common.obj); + fd_install(fdno, veventq->common.filep); + goto out_unlock_veventqs; + +out_put_fdno: + put_unused_fd(fdno); + fput(veventq->common.filep); +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &veventq->common.obj); +out_unlock_veventqs: + up_write(&viommu->veventqs_rwsem); + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 44fb30af10b03..8cda9c4672ebf 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -507,6 +507,74 @@ void iommufd_fault_iopf_disable(struct iommufd_device *idev); void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle); +/* An iommufd_vevent represents a vIOMMU event in an iommufd_veventq */ +struct iommufd_vevent { + struct iommufd_vevent_header header; + struct list_head node; /* for iommufd_eventq::deliver */ + ssize_t data_len; + u64 event_data[] __counted_by(data_len); +}; + +#define vevent_for_lost_events_header(vevent) \ + (vevent->header.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS) + +/* + * An iommufd_veventq object represents an interface to deliver vIOMMU events to + * the user space. It is created/destroyed by the user space and associated with + * a vIOMMU object during the allocations. 
+ */ +struct iommufd_veventq { + struct iommufd_eventq common; + struct iommufd_viommu *viommu; + struct list_head node; /* for iommufd_viommu::veventqs */ + struct iommufd_vevent lost_events_header; + + unsigned int type; + unsigned int depth; + + /* Use common.lock for protection */ + u32 num_events; + u32 sequence; +}; + +static inline struct iommufd_veventq * +eventq_to_veventq(struct iommufd_eventq *eventq) +{ + return container_of(eventq, struct iommufd_veventq, common); +} + +static inline struct iommufd_veventq * +iommufd_get_veventq(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VEVENTQ), + struct iommufd_veventq, common.obj); +} + +int iommufd_veventq_alloc(struct iommufd_ucmd *ucmd); +void iommufd_veventq_destroy(struct iommufd_object *obj); +void iommufd_veventq_abort(struct iommufd_object *obj); + +static inline void iommufd_vevent_handler(struct iommufd_veventq *veventq, + struct iommufd_vevent *vevent) +{ + struct iommufd_eventq *eventq = &veventq->common; + + lockdep_assert_held(&eventq->lock); + + /* + * Remove the lost_events_header and add the new node at the same time. + * Note the new node can be lost_events_header, for a sequence update. + */ + if (list_is_last(&veventq->lost_events_header.node, &eventq->deliver)) + list_del(&veventq->lost_events_header.node); + list_add_tail(&vevent->node, &eventq->deliver); + vevent->header.sequence = veventq->sequence; + veventq->sequence = (veventq->sequence + 1) & INT_MAX; + + wake_up_interruptible(&eventq->wait_queue); +} + static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) { @@ -515,6 +583,20 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_viommu, obj); } +static inline struct iommufd_veventq * +iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, u32 type) +{ + struct iommufd_veventq *veventq, *next; + + lockdep_assert_held(&viommu->veventqs_rwsem); + + list_for_each_entry_safe(veventq, next, &viommu->veventqs, node) { + if (veventq->type == type) + return veventq; + } + return NULL; +} + int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index b6fa9fd11bc18..3df468f64e7d9 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -317,6 +317,7 @@ union ucmd_buffer { struct iommu_ioas_unmap unmap; struct iommu_option option; struct iommu_vdevice_alloc vdev; + struct iommu_veventq_alloc veventq; struct iommu_vfio_ioas vfio_ioas; struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST @@ -372,6 +373,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, val64), IOCTL_OP(IOMMU_VDEVICE_ALLOC, iommufd_vdevice_alloc_ioctl, struct iommu_vdevice_alloc, virt_id), + IOCTL_OP(IOMMU_VEVENTQ_ALLOC, iommufd_veventq_alloc, + struct iommu_veventq_alloc, out_veventq_fd), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, @@ -514,6 +517,10 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_VDEVICE] = { .destroy = iommufd_vdevice_destroy, }, + [IOMMUFD_OBJ_VEVENTQ] = { + .destroy = iommufd_veventq_destroy, + .abort = iommufd_veventq_abort, + }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, }, diff 
--git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 69b88e8c7c265..01df2b985f02a 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -59,6 +59,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd)
 	viommu->ictx = ucmd->ictx;
 	viommu->hwpt = hwpt_paging;
 	refcount_inc(&viommu->hwpt->common.obj.users);
+	INIT_LIST_HEAD(&viommu->veventqs);
+	init_rwsem(&viommu->veventqs_rwsem);
 	/*
 	 * It is the most likely case that a physical IOMMU is unpluggable. A
 	 * pluggable IOMMU instance (if exists) is responsible for refcounting
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 11110c7492009..8948b1836940c 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -34,6 +34,7 @@ enum iommufd_object_type {
 	IOMMUFD_OBJ_FAULT,
 	IOMMUFD_OBJ_VIOMMU,
 	IOMMUFD_OBJ_VDEVICE,
+	IOMMUFD_OBJ_VEVENTQ,
 #ifdef CONFIG_IOMMUFD_TEST
 	IOMMUFD_OBJ_SELFTEST,
 #endif
@@ -93,6 +94,8 @@ struct iommufd_viommu {
 	const struct iommufd_viommu_ops *ops;
 
 	struct xarray vdevs;
+	struct list_head veventqs;
+	struct rw_semaphore veventqs_rwsem;
 
 	unsigned int type;
 };
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 78747b24bd0fb..dbb8787d9c635 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -55,6 +55,7 @@ enum {
 	IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
 	IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
 	IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
+	IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93,
 };
 
 /**
@@ -1014,4 +1015,85 @@ struct iommu_ioas_change_process {
 #define IOMMU_IOAS_CHANGE_PROCESS \
 	_IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
 
+/**
+ * enum iommu_veventq_flag - flag for struct iommufd_vevent_header
+ * @IOMMU_VEVENTQ_FLAG_LOST_EVENTS: vEVENTQ has lost vEVENTs
+ */
+enum iommu_veventq_flag {
+	IOMMU_VEVENTQ_FLAG_LOST_EVENTS = (1U << 0),
+};
+
+/**
+ * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
+ * @flags: Combination of enum iommu_veventq_flag
+ * @sequence: The sequence index of a vEVENT in the vEVENTQ, with a range of
+ *            [0, INT_MAX] where the following index of INT_MAX is 0
+ *
+ * Each iommufd_vevent_header reports a sequence index of the following vEVENT:
+ * -------------------------------------------------------------------------
+ * | header0 {sequence=0} | data0 | header1 {sequence=1} | data1 |...| dataN |
+ * -------------------------------------------------------------------------
+ * This sequence index is expected to increase monotonically from the sequence
+ * index of the previous vEVENT. If two adjacent sequence indexes have a delta
+ * larger than 1, it means that delta - 1 vEVENTs have been lost, e.g. two lost
+ * vEVENTs:
+ * -------------------------------------------------------------------------
+ * | ... | header3 {sequence=3} | data3 | header6 {sequence=6} | data6 | ... |
+ * -------------------------------------------------------------------------
+ * If a vEVENT is lost at the tail of the vEVENTQ and there is no following
+ * vEVENT providing the next sequence index, an IOMMU_VEVENTQ_FLAG_LOST_EVENTS
+ * header would be added to the tail, and no data would follow this header:
+ * ---------------------------------------------------------------------------
+ * |..| header3 {sequence=3} | data3 | header4 {flags=LOST_EVENTS, sequence=4} |
+ * ---------------------------------------------------------------------------
+ */
+struct iommufd_vevent_header {
+	__u32 flags;
+	__u32 sequence;
+};
+
+/**
+ * enum iommu_veventq_type - Virtual Event Queue Type
+ * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use
+ */
+enum iommu_veventq_type {
+	IOMMU_VEVENTQ_TYPE_DEFAULT = 0,
+};
+
+/**
+ * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC)
+ * @size: sizeof(struct iommu_veventq_alloc)
+ * @flags: Must be 0
+ * @viommu_id: virtual IOMMU ID to associate the vEVENTQ with
+ * @type: Type of the vEVENTQ. Must be defined in enum iommu_veventq_type
+ * @veventq_depth: Maximum number of events in the vEVENTQ
+ * @out_veventq_id: The ID of the new vEVENTQ
+ * @out_veventq_fd: The fd of the new vEVENTQ. User space must close the
+ *                  successfully returned fd after using it
+ * @__reserved: Must be 0
+ *
+ * Explicitly allocate a virtual event queue interface for a vIOMMU. A vIOMMU
+ * can have multiple FDs for different types, but is confined to one per @type.
+ * User space should open the @out_veventq_fd to read vEVENTs out of a vEVENTQ,
+ * if there are vEVENTs available. A vEVENTQ will lose events due to overflow
+ * if the number of vEVENTs hits @veventq_depth.
+ *
+ * Each vEVENT in a vEVENTQ encloses a struct iommufd_vevent_header followed by
+ * a type-specific data structure, in a normal case:
+ * -------------------------------------------------------------
+ * || header0 | data0 | header1 | data1 | ... | headerN | dataN ||
+ * -------------------------------------------------------------
+ * unless a trailing IOMMU_VEVENTQ_FLAG_LOST_EVENTS header is logged (refer to
+ * struct iommufd_vevent_header).
+ */
+struct iommu_veventq_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 veventq_depth;
+	__u32 out_veventq_id;
+	__u32 out_veventq_fd;
+	__u32 __reserved;
+};
+#define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC)
 #endif

From 29c9ed75afbeee2afd848ae1c86bc472961248ec Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 11 Mar 2025 12:44:24 -0700
Subject: [PATCH 023/147] iommufd/viommu: Add iommufd_viommu_get_vdev_id helper

This is a reverse search vs. iommufd_viommu_find_dev, as drivers may want
to convert a struct device pointer (physical) to its virtual device ID for
an event injection to the user space VM.

Again, this avoids exposing more core structures to the drivers than the
iommufd_viommu alone.
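As an illustration only (not part of this patch), here is a minimal sketch of the intended call pattern; the my_driver_resolve_virt_id() wrapper is hypothetical, while iommufd_viommu_get_vdev_id() and its -ENOENT contract come from the diff below:

	#include <linux/iommufd.h>

	/* Hypothetical wrapper: resolve a physical device to the virtual
	 * device ID that user space knows it by on this vIOMMU.
	 */
	static int my_driver_resolve_virt_id(struct iommufd_viommu *viommu,
					     struct device *dev, u32 *virt_id)
	{
		unsigned long vdev_id;
		int rc;

		/* -ENOENT means no vDEVICE associates dev with this vIOMMU */
		rc = iommufd_viommu_get_vdev_id(viommu, dev, &vdev_id);
		if (rc)
			return rc;

		*virt_id = vdev_id;
		return 0;
	}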
Link: https://patch.msgid.link/r/18b8e8bc1b8104d43b205d21602c036fd0804e56.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit ea94b211c5483080b749c142090f4c4de4926e51) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/driver.c | 24 ++++++++++++++++++++++++ include/linux/iommufd.h | 9 +++++++++ 2 files changed, 33 insertions(+) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 2d98b04ff1cb7..f132b98fb8998 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -49,5 +49,29 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_dev, "IOMMUFD"); +/* Return -ENOENT if device is not associated to the vIOMMU */ +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id) +{ + struct iommufd_vdevice *vdev; + unsigned long index; + int rc = -ENOENT; + + if (WARN_ON_ONCE(!vdev_id)) + return -EINVAL; + + xa_lock(&viommu->vdevs); + xa_for_each(&viommu->vdevs, index, vdev) { + if (vdev->dev == dev) { + *vdev_id = vdev->id; + rc = 0; + break; + } + } + xa_unlock(&viommu->vdevs); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, "IOMMUFD"); + MODULE_DESCRIPTION("iommufd code shared with builtin modules"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 8948b1836940c..05cb393aff0af 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -190,6 +190,8 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, enum iommufd_object_type type); struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); +int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, unsigned long *vdev_id); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -203,6 +205,13 @@ iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { return NULL; } + +static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, + unsigned long *vdev_id) +{ + return -ENOENT; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* From f9781aa3d97f60c798569dce3b6be7bcc2ae17e3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:25 -0700 Subject: [PATCH 024/147] iommufd/viommu: Add iommufd_viommu_report_event helper Similar to iommu_report_device_fault, this allows IOMMU drivers to report vIOMMU events from threaded IRQ handlers to user space hypervisors. 
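As an illustration only (not part of this patch), a sketch of the reporting flow from a driver's threaded IRQ handler; struct my_hw_event and MY_VEVENTQ_TYPE are hypothetical stand-ins for a driver's uAPI event record and its enum iommu_veventq_type value:

	#include <linux/iommufd.h>

	#define MY_VEVENTQ_TYPE 0x1	/* hypothetical uAPI type value */

	/* Hypothetical: forward one hardware event to user space. Real
	 * drivers fold the virtual device ID into the record itself, as
	 * the ARM SMMUv3 patch later in this series does with StreamID.
	 */
	static void my_driver_forward_event(struct iommufd_viommu *viommu,
					    struct device *dev, u64 raw)
	{
		struct my_hw_event { __aligned_u64 raw; } ev = { .raw = raw };
		unsigned long vdev_id;

		if (iommufd_viommu_get_vdev_id(viommu, dev, &vdev_id))
			return;	/* device is not exposed to this VM */

		/* Enqueues a copy, or a LOST_EVENTS marker when full */
		iommufd_viommu_report_event(viommu, MY_VEVENTQ_TYPE, &ev,
					    sizeof(ev));
	}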
Link: https://patch.msgid.link/r/44be825042c8255e75d0151b338ffd8ba0e4920b.1741719725.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit e8e1ef9b77a7a09b7809890a52229f24d3c8b532) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/driver.c | 48 ++++++++++++++++++++++++++++++++++ include/linux/iommufd.h | 11 ++++++++ 2 files changed, 59 insertions(+) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index f132b98fb8998..75b365561c161 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -73,5 +73,53 @@ int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_get_vdev_id, "IOMMUFD"); +/* + * Typically called in driver's threaded IRQ handler. + * The @type and @event_data must be defined in include/uapi/linux/iommufd.h + */ +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len) +{ + struct iommufd_veventq *veventq; + struct iommufd_vevent *vevent; + int rc = 0; + + if (WARN_ON_ONCE(!data_len || !event_data)) + return -EINVAL; + + down_read(&viommu->veventqs_rwsem); + + veventq = iommufd_viommu_find_veventq(viommu, type); + if (!veventq) { + rc = -EOPNOTSUPP; + goto out_unlock_veventqs; + } + + spin_lock(&veventq->common.lock); + if (veventq->num_events == veventq->depth) { + vevent = &veventq->lost_events_header; + goto out_set_header; + } + + vevent = kmalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC); + if (!vevent) { + rc = -ENOMEM; + vevent = &veventq->lost_events_header; + goto out_set_header; + } + memcpy(vevent->event_data, event_data, data_len); + vevent->data_len = data_len; + veventq->num_events++; + +out_set_header: + iommufd_vevent_handler(veventq, vevent); + spin_unlock(&veventq->common.lock); +out_unlock_veventqs: + up_read(&viommu->veventqs_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_event, "IOMMUFD"); + MODULE_DESCRIPTION("iommufd code shared with builtin modules"); MODULE_LICENSE("GPL"); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 05cb393aff0af..60eff9272551d 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -11,6 +11,7 @@ #include #include #include +#include struct device; struct file; @@ -192,6 +193,9 @@ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, struct device *dev, unsigned long *vdev_id); +int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, void *event_data, + size_t data_len); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ static inline struct iommufd_object * _iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, @@ -212,6 +216,13 @@ static inline int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, { return -ENOENT; } + +static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, + enum iommu_veventq_type type, + void *event_data, size_t data_len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ /* From 9117e94749296c914864e5bc4520bc56a94765ea Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:26 -0700 Subject: [PATCH 025/147] iommufd/selftest: Require vdev_id when attaching to a nested domain When attaching a device to a vIOMMU-based nested domain, vdev_id must be 
present. Add code that hard-requires it, preparing for vEVENTQ support in
the following patch. Then, update the TEST_F. A HWPT-based nested domain
will return a NULL new_viommu, so there is no such vDEVICE requirement.

Link: https://patch.msgid.link/r/4051ca8a819e51cb30de6b4fe9e4d94d956afe3d.1741719725.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
(cherry picked from commit 941d0719aa66adb40b96f049635d86b447577970)
Signed-off-by: Nirmoy Das
---
 drivers/iommu/iommufd/selftest.c | 24 ++++++++++++++++++++++++
 tools/testing/selftests/iommu/iommufd.c | 5 +++++
 2 files changed, 29 insertions(+)

diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index d40deb0a4f062..ba84bacbce2e4 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -161,7 +161,10 @@ enum selftest_obj_type {
 struct mock_dev {
 	struct device dev;
+	struct mock_viommu *viommu;
+	struct rw_semaphore viommu_rwsem;
 	unsigned long flags;
+	unsigned long vdev_id;
 	int id;
 	u32 cache[MOCK_DEV_CACHE_NUM];
 };
@@ -193,10 +196,30 @@ static int mock_domain_nop_attach(struct iommu_domain *domain,
 				  struct device *dev)
 {
 	struct mock_dev *mdev = to_mock_dev(dev);
+	struct mock_viommu *new_viommu = NULL;
+	unsigned long vdev_id = 0;
+	int rc;
 
 	if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
 		return -EINVAL;
 
+	iommu_group_mutex_assert(dev);
+	if (domain->type == IOMMU_DOMAIN_NESTED) {
+		new_viommu = to_mock_nested(domain)->mock_viommu;
+		if (new_viommu) {
+			rc = iommufd_viommu_get_vdev_id(&new_viommu->core, dev,
+							&vdev_id);
+			if (rc)
+				return rc;
+		}
+	}
+	if (new_viommu != mdev->viommu) {
+		down_write(&mdev->viommu_rwsem);
+		mdev->viommu = new_viommu;
+		mdev->vdev_id = vdev_id;
+		up_write(&mdev->viommu_rwsem);
+	}
+
 	return 0;
 }
@@ -850,6 +873,7 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags)
 	if (!mdev)
 		return ERR_PTR(-ENOMEM);
 
+	init_rwsem(&mdev->viommu_rwsem);
 	device_initialize(&mdev->dev);
 	mdev->flags = dev_flags;
 	mdev->dev.release = mock_dev_release;
diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c
index a1b2b657999dc..212e5d62e13de 100644
--- a/tools/testing/selftests/iommu/iommufd.c
+++ b/tools/testing/selftests/iommu/iommufd.c
@@ -2736,6 +2736,7 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf)
 	uint32_t iopf_hwpt_id;
 	uint32_t fault_id;
 	uint32_t fault_fd;
+	uint32_t vdev_id;
 
 	if (self->device_id) {
 		test_ioctl_fault_alloc(&fault_id, &fault_fd);
@@ -2752,6 +2753,10 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf)
 			     &iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, &data,
 			     sizeof(data));
 
+		/* Must allocate vdevice before attaching to a nested hwpt */
+		test_err_mock_domain_replace(ENOENT, self->stdev_id,
+					     iopf_hwpt_id);
+		test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id);
 		test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id);
 		EXPECT_ERRNO(EBUSY,
 			     _test_ioctl_destroy(self->fd, iopf_hwpt_id));

From 5c3b05344067d4f9ec58906b593a85d127d9afbc Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 11 Mar 2025 12:44:27 -0700
Subject: [PATCH 026/147] iommufd/selftest: Add IOMMU_TEST_OP_TRIGGER_VEVENT for vEVENTQ coverage

The handler will get the vDEVICE object from the given mdev and convert it
to its per-vIOMMU virtual ID to mimic a real IOMMU driver.

Link: https://patch.msgid.link/r/1ea874d20e56d65e7cfd6e0e8e01bd3dbd038761.1741719725.git.nicolinc@nvidia.com
Reviewed-by: Kevin Tian
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
(cherry picked from commit b3cc0b7599ccc128831fdc0fb71606a246d2a58a)
Signed-off-by: Nirmoy Das
---
 drivers/iommu/iommufd/iommufd_test.h | 10 ++++++++++
 drivers/iommu/iommufd/selftest.c | 30 ++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index a6b7a163f6364..87e9165cea270 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -24,6 +24,7 @@ enum {
 	IOMMU_TEST_OP_MD_CHECK_IOTLB,
 	IOMMU_TEST_OP_TRIGGER_IOPF,
 	IOMMU_TEST_OP_DEV_CHECK_CACHE,
+	IOMMU_TEST_OP_TRIGGER_VEVENT,
 };
 
 enum {
@@ -145,6 +146,9 @@ struct iommu_test_cmd {
 			__u32 id;
 			__u32 cache;
 		} check_dev_cache;
+		struct {
+			__u32 dev_id;
+		} trigger_vevent;
 	};
 	__u32 last;
 };
@@ -212,4 +216,10 @@ struct iommu_viommu_invalidate_selftest {
 	__u32 cache_id;
 };
 
+#define IOMMU_VEVENTQ_TYPE_SELFTEST 0xbeefbeef
+
+struct iommu_viommu_event_selftest {
+	__u32 virt_id;
+};
+
 #endif
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index ba84bacbce2e4..d55dde28e9bc4 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -1621,6 +1621,34 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd,
 	return 0;
 }
 
+static int iommufd_test_trigger_vevent(struct iommufd_ucmd *ucmd,
+				       struct iommu_test_cmd *cmd)
+{
+	struct iommu_viommu_event_selftest test = {};
+	struct iommufd_device *idev;
+	struct mock_dev *mdev;
+	int rc = -ENOENT;
+
+	idev = iommufd_get_device(ucmd, cmd->trigger_vevent.dev_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+	mdev = to_mock_dev(idev->dev);
+
+	down_read(&mdev->viommu_rwsem);
+	if (!mdev->viommu || !mdev->vdev_id)
+		goto out_unlock;
+
+	test.virt_id = mdev->vdev_id;
+	rc = iommufd_viommu_report_event(&mdev->viommu->core,
+					 IOMMU_VEVENTQ_TYPE_SELFTEST, &test,
+					 sizeof(test));
+out_unlock:
+	up_read(&mdev->viommu_rwsem);
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+
+	return rc;
+}
+
 void iommufd_selftest_destroy(struct iommufd_object *obj)
 {
 	struct selftest_obj *sobj = to_selftest_obj(obj);
@@ -1702,6 +1730,8 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
 					  cmd->dirty.flags);
 	case IOMMU_TEST_OP_TRIGGER_IOPF:
 		return iommufd_test_trigger_iopf(ucmd, cmd);
+	case IOMMU_TEST_OP_TRIGGER_VEVENT:
+		return iommufd_test_trigger_vevent(ucmd, cmd);
 	default:
 		return -EOPNOTSUPP;
 	}

From 0e0203a0fe4112c5ce55aeddcd239553e3e9dc3e Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 11 Mar 2025 12:44:28 -0700
Subject: [PATCH 027/147] iommufd/selftest: Add IOMMU_VEVENTQ_ALLOC test coverage

Trigger vEVENTs by feeding an idev ID, and validate that the returned
virt_ids equal the value that was set in the vDEVICE.
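For reference, a hedged user-space sketch of the read-side protocol these tests exercise, assuming a fixed per-type payload size (the selftest uses struct iommu_viommu_event_selftest); poll() and error handling are trimmed, and the kernel never returns a partial record:

	#include <stdint.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/iommufd.h>

	/* Drain one read() worth of vEVENTs from a vEVENTQ fd. A header
	 * flagged LOST_EVENTS carries no payload; a gap between adjacent
	 * sequence numbers likewise means the kernel dropped events.
	 */
	static void drain_veventq(int veventq_fd, size_t payload_len)
	{
		struct iommufd_vevent_header hdr;
		char buf[4096];
		ssize_t len = read(veventq_fd, buf, sizeof(buf));
		size_t pos = 0;

		while (len > 0 && pos + sizeof(hdr) <= (size_t)len) {
			memcpy(&hdr, buf + pos, sizeof(hdr));
			pos += sizeof(hdr);
			if (hdr.flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS)
				continue; /* no data follows this header */
			/* hdr.sequence wraps to 0 after INT_MAX */
			/* ... consume payload_len bytes at buf + pos ... */
			pos += payload_len;
		}
	}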
Link: https://patch.msgid.link/r/e829532ec0a3927d61161b7674b20e731ecd495b.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 97717a1f283fee4e886bbe96c6a0ca460f71a4ab) Signed-off-by: Nirmoy Das --- tools/testing/selftests/iommu/iommufd.c | 31 +++++ .../selftests/iommu/iommufd_fail_nth.c | 7 ++ tools/testing/selftests/iommu/iommufd_utils.h | 115 ++++++++++++++++++ 3 files changed, 153 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 212e5d62e13de..dd453aae8feda 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2774,15 +2774,46 @@ TEST_F(iommufd_viommu, vdevice_alloc) uint32_t viommu_id = self->viommu_id; uint32_t dev_id = self->device_id; uint32_t vdev_id = 0; + uint32_t veventq_id; + uint32_t veventq_fd; + int prev_seq = -1; if (dev_id) { + /* Must allocate vdevice before attaching to a nested hwpt */ + test_err_mock_domain_replace(ENOENT, self->stdev_id, + self->nested_hwpt_id); + + /* Allocate a vEVENTQ with veventq_depth=2 */ + test_cmd_veventq_alloc(viommu_id, IOMMU_VEVENTQ_TYPE_SELFTEST, + &veventq_id, &veventq_fd); + test_err_veventq_alloc(EEXIST, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, NULL, NULL); /* Set vdev_id to 0x99, unset it, and set to 0x88 */ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x99, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, + self->nested_hwpt_id); + test_cmd_trigger_vevents(dev_id, 1); + test_cmd_read_vevents(veventq_fd, 1, 0x99, &prev_seq); test_err_vdevice_alloc(EEXIST, viommu_id, dev_id, 0x99, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(vdev_id); + + /* Try again with 0x88 */ test_cmd_vdevice_alloc(viommu_id, dev_id, 0x88, &vdev_id); + test_cmd_mock_domain_replace(self->stdev_id, + self->nested_hwpt_id); + /* Trigger an overflow with three events */ + test_cmd_trigger_vevents(dev_id, 3); + test_err_read_vevents(EOVERFLOW, veventq_fd, 3, 0x88, + &prev_seq); + /* Overflow must be gone after the previous reads */ + test_cmd_trigger_vevents(dev_id, 1); + test_cmd_read_vevents(veventq_fd, 1, 0x88, &prev_seq); + close(veventq_fd); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(vdev_id); + test_ioctl_destroy(veventq_id); } else { test_err_vdevice_alloc(ENOENT, viommu_id, dev_id, 0x99, NULL); } diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 64b1f8e1b0cf1..99a7f7897bb28 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -620,6 +620,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) }; struct iommu_test_hw_info info; uint32_t fault_id, fault_fd; + uint32_t veventq_id, veventq_fd; uint32_t fault_hwpt_id; uint32_t ioas_id; uint32_t ioas_id2; @@ -692,6 +693,12 @@ TEST_FAIL_NTH(basic_fail_nth, device) IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data))) return -1; + if (_test_cmd_veventq_alloc(self->fd, viommu_id, + IOMMU_VEVENTQ_TYPE_SELFTEST, &veventq_id, + &veventq_fd)) + return -1; + close(veventq_fd); + return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index d979f5b0efe83..6f2ba2fa8f76c 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include 
"../kselftest_harness.h" #include "../../../../drivers/iommu/iommufd/iommufd_test.h" @@ -936,3 +937,117 @@ static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id, EXPECT_ERRNO(_errno, \ _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \ virt_id, vdev_id)) + +static int _test_cmd_veventq_alloc(int fd, __u32 viommu_id, __u32 type, + __u32 *veventq_id, __u32 *veventq_fd) +{ + struct iommu_veventq_alloc cmd = { + .size = sizeof(cmd), + .type = type, + .veventq_depth = 2, + .viommu_id = viommu_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_VEVENTQ_ALLOC, &cmd); + if (ret) + return ret; + if (veventq_id) + *veventq_id = cmd.out_veventq_id; + if (veventq_fd) + *veventq_fd = cmd.out_veventq_fd; + return 0; +} + +#define test_cmd_veventq_alloc(viommu_id, type, veventq_id, veventq_fd) \ + ASSERT_EQ(0, _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ + veventq_id, veventq_fd)) +#define test_err_veventq_alloc(_errno, viommu_id, type, veventq_id, \ + veventq_fd) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_veventq_alloc(self->fd, viommu_id, type, \ + veventq_id, veventq_fd)) + +static int _test_cmd_trigger_vevents(int fd, __u32 dev_id, __u32 nvevents) +{ + struct iommu_test_cmd trigger_vevent_cmd = { + .size = sizeof(trigger_vevent_cmd), + .op = IOMMU_TEST_OP_TRIGGER_VEVENT, + .trigger_vevent = { + .dev_id = dev_id, + }, + }; + int ret; + + while (nvevents--) { + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VEVENT), + &trigger_vevent_cmd); + if (ret < 0) + return -1; + } + return ret; +} + +#define test_cmd_trigger_vevents(dev_id, nvevents) \ + ASSERT_EQ(0, _test_cmd_trigger_vevents(self->fd, dev_id, nvevents)) + +static int _test_cmd_read_vevents(int fd, __u32 event_fd, __u32 nvevents, + __u32 virt_id, int *prev_seq) +{ + struct pollfd pollfd = { .fd = event_fd, .events = POLLIN }; + struct iommu_viommu_event_selftest *event; + struct iommufd_vevent_header *hdr; + ssize_t bytes; + void *data; + int ret, i; + + ret = poll(&pollfd, 1, 1000); + if (ret < 0) + return -1; + + data = calloc(nvevents, sizeof(*hdr) + sizeof(*event)); + if (!data) { + errno = ENOMEM; + return -1; + } + + bytes = read(event_fd, data, + nvevents * (sizeof(*hdr) + sizeof(*event))); + if (bytes <= 0) { + errno = EFAULT; + ret = -1; + goto out_free; + } + + for (i = 0; i < nvevents; i++) { + hdr = data + i * (sizeof(*hdr) + sizeof(*event)); + + if (hdr->flags & IOMMU_VEVENTQ_FLAG_LOST_EVENTS || + hdr->sequence - *prev_seq > 1) { + *prev_seq = hdr->sequence; + errno = EOVERFLOW; + ret = -1; + goto out_free; + } + *prev_seq = hdr->sequence; + event = data + sizeof(*hdr); + if (event->virt_id != virt_id) { + errno = EINVAL; + ret = -1; + goto out_free; + } + } + + ret = 0; +out_free: + free(data); + return ret; +} + +#define test_cmd_read_vevents(event_fd, nvevents, virt_id, prev_seq) \ + ASSERT_EQ(0, _test_cmd_read_vevents(self->fd, event_fd, nvevents, \ + virt_id, prev_seq)) +#define test_err_read_vevents(_errno, event_fd, nvevents, virt_id, prev_seq) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_read_vevents(self->fd, event_fd, nvevents, \ + virt_id, prev_seq)) From ee3e6a9017df17e88aa98ec9d5cd987cbd925720 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:29 -0700 Subject: [PATCH 028/147] Documentation: userspace-api: iommufd: Update FAULT and VEVENTQ With the introduction of the new objects, update the doc to reflect that. 
Link: https://patch.msgid.link/r/09829fbc218872d242323d8834da4bec187ce6f4.1741719725.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu
Reviewed-by: Kevin Tian
Reviewed-by: Bagas Sanjaya
Reviewed-by: Jason Gunthorpe
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
(cherry picked from commit 2ec0458eb0e5a84d80f82667e358c4f2c187d2e4)
Signed-off-by: Nirmoy Das
---
 Documentation/userspace-api/iommufd.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst
index 70289d6815d2a..b0df15865dec6 100644
--- a/Documentation/userspace-api/iommufd.rst
+++ b/Documentation/userspace-api/iommufd.rst
@@ -63,6 +63,13 @@ Following IOMMUFD objects are exposed to userspace:
   space usually has mappings from guest-level I/O virtual addresses to guest-
   level physical addresses.
 
+- IOMMUFD_OBJ_FAULT, representing a software queue for an HWPT reporting IO
+  page faults using the IOMMU HW's PRI (Page Request Interface). This queue
+  object provides user space an FD to poll the page fault events and also to
+  respond to those events. A FAULT object must be created first to get a
+  fault_id that can then be used to allocate a fault-enabled HWPT via the
+  IOMMU_HWPT_ALLOC command by setting the IOMMU_HWPT_FAULT_ID_VALID bit in its
+  flags field.
+
 - IOMMUFD_OBJ_VIOMMU, representing a slice of the physical IOMMU instance,
   passed to or shared with a VM. It may be some HW-accelerated virtualization
   features and some SW resources used by the VM. For examples:
@@ -109,6 +116,14 @@ Following IOMMUFD objects are exposed to userspace:
   vIOMMU, which is a separate ioctl call from attaching the same device to an
   HWPT_PAGING that the vIOMMU holds.
 
+- IOMMUFD_OBJ_VEVENTQ, representing a software queue for a vIOMMU to report its
+  events such as translation faults that occur on a nested stage-1 (excluding
+  I/O page faults that should go through IOMMUFD_OBJ_FAULT) and HW-specific
+  events. This queue object provides user space an FD to poll/read the vIOMMU
+  events. A vIOMMU object must be created first to get its viommu_id, which can
+  then be used to allocate a vEVENTQ. Each vIOMMU can support multiple types of
+  vEVENTs, but is confined to one vEVENTQ per vEVENTQ type.
+
 All user-visible objects are destroyed via the IOMMU_DESTROY uAPI.
 
 The diagrams below show relationships between user-visible objects and kernel
@@ -251,8 +266,10 @@ User visible objects are backed by following datastructures:
 - iommufd_device for IOMMUFD_OBJ_DEVICE.
 - iommufd_hwpt_paging for IOMMUFD_OBJ_HWPT_PAGING.
 - iommufd_hwpt_nested for IOMMUFD_OBJ_HWPT_NESTED.
+- iommufd_fault for IOMMUFD_OBJ_FAULT.
 - iommufd_viommu for IOMMUFD_OBJ_VIOMMU.
 - iommufd_vdevice for IOMMUFD_OBJ_VDEVICE.
+- iommufd_veventq for IOMMUFD_OBJ_VEVENTQ.
 
 Several terminologies when looking at these datastructures:

From 1cb13ea7037258af983872cc2520e5aa04d736ac Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 11 Mar 2025 12:44:30 -0700
Subject: [PATCH 029/147] iommu/arm-smmu-v3: Introduce struct arm_smmu_vmaster

Use it to store all vSMMU-related data. The vsid (Virtual Stream ID) will
be the first use case. Since the vsid reader will be the eventq handler
that already holds a streams_mutex, reuse that to fence the vmaster too.

Also add a pair of arm_smmu_attach_prepare/commit_vmaster helpers to set
or unset the master->vmaster pointer. Put the helpers inside the existing
arm_smmu_attach_prepare/commit().
For identity/blocked ops that don't call arm_smmu_attach_prepare/commit(), add a simpler arm_smmu_master_clear_vmaster helper to unset the vmaster. Link: https://patch.msgid.link/r/a7f282e1a531279e25f06c651e95d56f6b120886.1741719725.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit f0ea207ed7813bdf17e65aa042d023d1c59e49e3) Signed-off-by: Nirmoy Das --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 41 +++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 18 +++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 28 +++++++++++++ 3 files changed, 86 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 34a0be59cd919..15b399f5298be 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -87,6 +87,47 @@ static void arm_smmu_make_nested_domain_ste( } } +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) +{ + struct arm_smmu_vmaster *vmaster; + unsigned long vsid; + int ret; + + iommu_group_mutex_assert(state->master->dev); + + ret = iommufd_viommu_get_vdev_id(&nested_domain->vsmmu->core, + state->master->dev, &vsid); + if (ret) + return ret; + + vmaster = kzalloc(sizeof(*vmaster), GFP_KERNEL); + if (!vmaster) + return -ENOMEM; + vmaster->vsmmu = nested_domain->vsmmu; + vmaster->vsid = vsid; + state->vmaster = vmaster; + + return 0; +} + +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) +{ + struct arm_smmu_master *master = state->master; + + mutex_lock(&master->smmu->streams_mutex); + kfree(master->vmaster); + master->vmaster = state->vmaster; + mutex_unlock(&master->smmu->streams_mutex); +} + +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) +{ + struct arm_smmu_attach_state state = { .master = master }; + + arm_smmu_attach_commit_vmaster(&state); +} + static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, struct device *dev) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e495334d1c43a..cb5d85eb3b7fa 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2803,6 +2803,7 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(new_domain); unsigned long flags; + int ret; /* * arm_smmu_share_asid() must not see two domains pointing to the same @@ -2832,9 +2833,18 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, } if (smmu_domain) { + if (new_domain->type == IOMMU_DOMAIN_NESTED) { + ret = arm_smmu_attach_prepare_vmaster( + state, to_smmu_nested_domain(new_domain)); + if (ret) + return ret; + } + master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); - if (!master_domain) + if (!master_domain) { + kfree(state->vmaster); return -ENOMEM; + } master_domain->master = master; master_domain->ssid = state->ssid; if (new_domain->type == IOMMU_DOMAIN_NESTED) @@ -2861,6 +2871,7 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); kfree(master_domain); + kfree(state->vmaster); return -EINVAL; } @@ -2893,6 +2904,8 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) 
lockdep_assert_held(&arm_smmu_asid_lock); + arm_smmu_attach_commit_vmaster(state); + if (state->ats_enabled && !master->ats_enabled) { arm_smmu_enable_ats(master); } else if (state->ats_enabled && master->ats_enabled) { @@ -3162,6 +3175,7 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, struct arm_smmu_ste ste; struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_bypass_ste(master->smmu, &ste); arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); return 0; @@ -3180,7 +3194,9 @@ static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, struct device *dev) { struct arm_smmu_ste ste; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + arm_smmu_master_clear_vmaster(master); arm_smmu_make_abort_ste(&ste); arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_TERMINATE); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 7290bd4c2bb0a..639dcef2ca2c8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -800,6 +800,11 @@ struct arm_smmu_stream { struct rb_node node; }; +struct arm_smmu_vmaster { + struct arm_vsmmu *vsmmu; + unsigned long vsid; +}; + struct arm_smmu_event { u8 stall : 1, ssv : 1, @@ -825,6 +830,7 @@ struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; struct arm_smmu_stream *streams; + struct arm_smmu_vmaster *vmaster; /* use smmu->streams_mutex */ /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; @@ -973,6 +979,7 @@ struct arm_smmu_attach_state { bool disable_ats; ioasid_t ssid; /* Resulting state */ + struct arm_smmu_vmaster *vmaster; bool ats_enabled; }; @@ -1056,9 +1063,30 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, struct iommu_domain *parent, struct iommufd_ctx *ictx, unsigned int viommu_type); +int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain); +void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); +void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); #else #define arm_smmu_hw_info NULL #define arm_vsmmu_alloc NULL + +static inline int +arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, + struct arm_smmu_nested_domain *nested_domain) +{ + return 0; +} + +static inline void +arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state) +{ +} + +static inline void +arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) +{ +} #endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ #endif /* _ARM_SMMU_V3_H */ From 2295f18bac1627234225f3714c73f0a1bf38dd7a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 11 Mar 2025 12:44:31 -0700 Subject: [PATCH 030/147] iommu/arm-smmu-v3: Report events that belong to devices attached to vIOMMU Aside from the IOPF framework, iommufd provides an additional pathway to report hardware events, via the vEVENTQ of vIOMMU infrastructure. Define an iommu_vevent_arm_smmuv3 uAPI structure, and report stage-1 events in the threaded IRQ handler. Also, add another four event record types that can be forwarded to a VM. 
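As a consumer-side illustration (not part of this patch), a hedged sketch of decoding the record defined below; the bit positions (event code in bits 7:0, StreamID in bits 63:32 of the first doubleword) follow "7.3 Event records" in the SMMUv3 HW Spec and should be verified against it:

	#include <endian.h>
	#include <stdint.h>
	#include <linux/iommufd.h>

	/* The kernel rewrites the physical StreamID to the vDEVICE virt_id
	 * before forwarding, so the extracted SID is already guest-scoped.
	 */
	static void decode_smmuv3_vevent(const struct iommu_vevent_arm_smmuv3 *vevt)
	{
		uint64_t dw0 = le64toh(vevt->evt[0]);
		uint8_t code = dw0 & 0xff;	/* e.g. 0x10 F_TRANSLATION */
		uint32_t virt_sid = dw0 >> 32;

		/* ... inject into the guest's emulated SMMU event queue ... */
		(void)code;
		(void)virt_sid;
	}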
Link: https://patch.msgid.link/r/5cf6719682fdfdabffdb08374cdf31ad2466d75a.1741719725.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit e7d3fa3d29d5b2ed12d247cf57a0a34fffe89eb8) Signed-off-by: Nirmoy Das --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 17 ++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 58 +++++++++++-------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +++ include/uapi/linux/iommufd.h | 23 ++++++++ 4 files changed, 80 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 15b399f5298be..e4fd8d522af88 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -435,4 +435,21 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, return &vsmmu->core; } +int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt) +{ + struct iommu_vevent_arm_smmuv3 vevt; + int i; + + lockdep_assert_held(&vmaster->vsmmu->smmu->streams_mutex); + + vevt.evt[0] = cpu_to_le64((evt[0] & ~EVTQ_0_SID) | + FIELD_PREP(EVTQ_0_SID, vmaster->vsid)); + for (i = 1; i < EVTQ_ENT_DWORDS; i++) + vevt.evt[i] = cpu_to_le64(evt[i]); + + return iommufd_viommu_report_event(&vmaster->vsmmu->core, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3, &vevt, + sizeof(vevt)); +} + MODULE_IMPORT_NS("IOMMUFD"); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index cb5d85eb3b7fa..0826b6bdf327f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1813,8 +1813,8 @@ static void arm_smmu_decode_event(struct arm_smmu_device *smmu, u64 *raw, mutex_unlock(&smmu->streams_mutex); } -static int arm_smmu_handle_event(struct arm_smmu_device *smmu, - struct arm_smmu_event *event) +static int arm_smmu_handle_event(struct arm_smmu_device *smmu, u64 *evt, + struct arm_smmu_event *event) { int ret = 0; u32 perm = 0; @@ -1823,6 +1823,10 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, struct iommu_fault *flt = &fault_evt.fault; switch (event->id) { + case EVT_ID_BAD_STE_CONFIG: + case EVT_ID_STREAM_DISABLED_FAULT: + case EVT_ID_BAD_SUBSTREAMID_CONFIG: + case EVT_ID_BAD_CD_CONFIG: case EVT_ID_TRANSLATION_FAULT: case EVT_ID_ADDR_SIZE_FAULT: case EVT_ID_ACCESS_FAULT: @@ -1832,31 +1836,30 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, return -EOPNOTSUPP; } - if (!event->stall) - return -EOPNOTSUPP; - - if (event->read) - perm |= IOMMU_FAULT_PERM_READ; - else - perm |= IOMMU_FAULT_PERM_WRITE; + if (event->stall) { + if (event->read) + perm |= IOMMU_FAULT_PERM_READ; + else + perm |= IOMMU_FAULT_PERM_WRITE; - if (event->instruction) - perm |= IOMMU_FAULT_PERM_EXEC; + if (event->instruction) + perm |= IOMMU_FAULT_PERM_EXEC; - if (event->privileged) - perm |= IOMMU_FAULT_PERM_PRIV; + if (event->privileged) + perm |= IOMMU_FAULT_PERM_PRIV; - flt->type = IOMMU_FAULT_PAGE_REQ; - flt->prm = (struct iommu_fault_page_request) { - .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = event->stag, - .perm = perm, - .addr = event->iova, - }; + flt->type = IOMMU_FAULT_PAGE_REQ; + flt->prm = (struct iommu_fault_page_request){ + .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, + .grpid = event->stag, + .perm = perm, + .addr = event->iova, + }; - if (event->ssv) { - flt->prm.flags |= 
IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = event->ssid; + if (event->ssv) { + flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + flt->prm.pasid = event->ssid; + } } mutex_lock(&smmu->streams_mutex); @@ -1866,7 +1869,12 @@ static int arm_smmu_handle_event(struct arm_smmu_device *smmu, goto out_unlock; } - ret = iommu_report_device_fault(master->dev, &fault_evt); + if (event->stall) + ret = iommu_report_device_fault(master->dev, &fault_evt); + else if (master->vmaster && !event->s2) + ret = arm_vmaster_report_event(master->vmaster, evt); + else + ret = -EOPNOTSUPP; /* Unhandled events should be pinned */ out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; @@ -1944,7 +1952,7 @@ static irqreturn_t arm_smmu_evtq_thread(int irq, void *dev) do { while (!queue_remove_raw(q, evt)) { arm_smmu_decode_event(smmu, evt, &event); - if (arm_smmu_handle_event(smmu, &event)) + if (arm_smmu_handle_event(smmu, evt, &event)) arm_smmu_dump_event(smmu, evt, &event, &rs); put_device(event.dev); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 639dcef2ca2c8..dd1ad56ce8639 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -1067,6 +1067,7 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, struct arm_smmu_nested_domain *nested_domain); void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); +int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt); #else #define arm_smmu_hw_info NULL #define arm_vsmmu_alloc NULL @@ -1087,6 +1088,12 @@ static inline void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master) { } + +static inline int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, + u64 *evt) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_ARM_SMMU_V3_IOMMUFD */ #endif /* _ARM_SMMU_V3_H */ diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index dbb8787d9c635..8719d4f5d6183 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1054,9 +1054,32 @@ struct iommufd_vevent_header { /** * enum iommu_veventq_type - Virtual Event Queue Type * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use + * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue */ enum iommu_veventq_type { IOMMU_VEVENTQ_TYPE_DEFAULT = 0, + IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_vevent_arm_smmuv3 - ARM SMMUv3 Virtual Event + * (IOMMU_VEVENTQ_TYPE_ARM_SMMUV3) + * @evt: 256-bit ARM SMMUv3 Event record, little-endian. + * Reported event records: (Refer to "7.3 Event records" in SMMUv3 HW Spec) + * - 0x04 C_BAD_STE + * - 0x06 F_STREAM_DISABLED + * - 0x08 C_BAD_SUBSTREAMID + * - 0x0a C_BAD_CD + * - 0x10 F_TRANSLATION + * - 0x11 F_ADDR_SIZE + * - 0x12 F_ACCESS + * - 0x13 F_PERMISSION + * + * StreamID field reports a virtual device ID. To receive a virtual event for a + * device, a vDEVICE must be allocated via IOMMU_VDEVICE_ALLOC. + */ +struct iommu_vevent_arm_smmuv3 { + __aligned_le64 evt[4]; }; /** From 9187b94141b7317cd376ef63d7a5d54ba78c4722 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 26 Feb 2025 02:40:12 -0800 Subject: [PATCH 031/147] iommufd: Disallow allocating nested parent domain with fault ID Allocating a domain with a fault ID indicates that the domain is faultable. However, there is a gap for the nested parent domain to support PRI. 
Some hardware lacks the capability to distinguish whether PRI occurs at stage 1 or stage 2. This limitation may require software-based page table walking to resolve. Since no in-tree IOMMU driver currently supports this functionality, it is disallowed. For more details, refer to the related discussion at [1]. [1] https://lore.kernel.org/linux-iommu/bd1655c6-8b2f-4cfa-adb1-badc00d01811@intel.com/ Link: https://patch.msgid.link/r/20250226104012.82079-1-yi.l.liu@intel.com Suggested-by: Lu Baolu Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit 1062d81086156e42878d701b816d2f368b53a77c) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/hw_pagetable.c | 3 +++ tools/testing/selftests/iommu/iommufd.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 006425e7f6096..ee40235f92850 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -126,6 +126,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_FAULT_ID_VALID) && + (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)) + return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index dd453aae8feda..156c74da53cd7 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -439,6 +439,10 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) &test_hwpt_id); test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0, &test_hwpt_id); + test_err_hwpt_alloc(EOPNOTSUPP, self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_FAULT_ID_VALID, + &test_hwpt_id); test_cmd_hwpt_alloc(self->device_id, self->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT, From d2dc49b339d2de31120d003e39e7990191ef6e77 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 5 Mar 2025 13:18:00 -0800 Subject: [PATCH 032/147] iommufd: Set domain->iommufd_hwpt in all hwpt->domain allocators Setting domain->iommufd_hwpt in iommufd_hwpt_alloc() only covers the HWPT allocations from user space, but not auto domains. This resulted in a NULL pointer access in the auto domain pathway: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 pc : iommufd_sw_msi+0x54/0x2b0 lr : iommufd_sw_msi+0x40/0x2b0 Call trace: iommufd_sw_msi+0x54/0x2b0 (P) iommu_dma_prepare_msi+0x64/0xa8 its_irq_domain_alloc+0xf0/0x2c0 irq_domain_alloc_irqs_parent+0x2c/0xa8 msi_domain_alloc+0xa0/0x1a8 Since iommufd_sw_msi() needs to access domain->iommufd_hwpt, it is better to set that explicitly prior to calling iommu_domain_set_sw_msi().
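To make the ordering concrete, here is a minimal userspace model of the invariant this patch restores (hypothetical types, not the kernel structures): every allocator of hwpt->domain must set the back-pointer before the domain can be reached by the MSI path.

#include <stdio.h>

struct hwpt;

struct domain {
	struct hwpt *iommufd_hwpt;	/* consumed by the sw_msi path */
};

struct hwpt {
	struct domain *domain;
	int id;
};

/* Hypothetical model, not kernel code: stands in for iommufd_sw_msi(),
 * which dereferences domain->iommufd_hwpt with no NULL check. */
static int sw_msi(struct domain *d)
{
	return d->iommufd_hwpt->id;	/* NULL deref if an allocator forgot this */
}

int main(void)
{
	struct domain dom = { 0 };
	struct hwpt hwpt = { .domain = &dom, .id = 42 };

	dom.iommufd_hwpt = &hwpt;	/* the fix: set before sw_msi() can run */
	printf("%d\n", sw_msi(&dom));
	return 0;
}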
Fixes: 748706d7ca06 ("iommu: Turn fault_data to iommufd private pointer") Link: https://patch.msgid.link/r/20250305211800.229465-1-nicolinc@nvidia.com Reported-by: Ankit Agrawal Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Ankit Agrawal Signed-off-by: Jason Gunthorpe (cherry picked from commit 897008d0f7672c3510281e826232a32d62710323) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/hw_pagetable.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index ee40235f92850..9a89f3a28dc52 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -159,6 +159,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } } + hwpt->domain->iommufd_hwpt = hwpt; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* @@ -255,6 +256,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, goto out_abort; } hwpt->domain->owner = ops; + hwpt->domain->iommufd_hwpt = hwpt; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { @@ -311,6 +313,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, hwpt->domain = NULL; goto out_abort; } + hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->owner = viommu->iommu_dev->ops; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); @@ -415,7 +418,6 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) refcount_inc(&fault->common.obj.users); iommufd_put_object(ucmd->ictx, &fault->common.obj); } - hwpt->domain->iommufd_hwpt = hwpt; cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); From 7a0d84a0a3fa5d3e5487e17069b6ac2a750fe53a Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 24 Mar 2025 21:05:15 -0700 Subject: [PATCH 033/147] iommu: Sort out domain user data When DMA/MSI cookies were made first-class citizens back in commit 46983fcd67ac ("iommu: Pull IOVA cookie management into the core"), there was no real need to further expose the two different cookie types. However, now that IOMMUFD wants to add a third type of MSI-mapping cookie, we do have a nicely compelling reason to properly disambiguate things at the domain level beyond just vaguely guessing from the domain type. Meanwhile, we also effectively have another "cookie" in the form of the anonymous union for other user data, which isn't much better in terms of being vague and unenforced. The fact is that all these cookie types are mutually exclusive, in the sense that combining them makes zero sense and/or would be catastrophic (iommu_set_fault_handler() on an SVA domain, anyone?) - the only combination which *might* be reasonable is perhaps a fault handler and an MSI cookie, but nobody's doing that at the moment, so let's rule it out as well for the sake of being clear and robust. To that end, we pull DMA and MSI cookies apart a little more, mostly to clear up the ambiguity at domain teardown, then for clarity (and to save a little space), move them into the union, whose ownership we can then properly describe and enforce entirely unambiguously.
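For reference, a sketch of the resulting layout (the enum values match the patch; the struct is trimmed to the essentials and is not the real iommu_domain): the cookie_type discriminant names the live union member, so combining cookie users on one domain is structurally ruled out.

struct iommu_dma_cookie;
struct iommu_dma_msi_cookie;
struct iommufd_hw_pagetable;

enum iommu_domain_cookie_type {
	IOMMU_COOKIE_NONE,
	IOMMU_COOKIE_DMA_IOVA,
	IOMMU_COOKIE_DMA_MSI,
	IOMMU_COOKIE_FAULT_HANDLER,
	IOMMU_COOKIE_SVA,
	IOMMU_COOKIE_IOMMUFD,
};

/* Trimmed sketch, not the real struct iommu_domain */
struct domain_sketch {
	enum iommu_domain_cookie_type cookie_type;	/* names the live member */
	union {						/* mutually exclusive owners */
		struct iommu_dma_cookie *iova_cookie;		/* DMA_IOVA */
		struct iommu_dma_msi_cookie *msi_cookie;	/* DMA_MSI */
		struct iommufd_hw_pagetable *iommufd_hwpt;	/* IOMMUFD */
	};
};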
[nicolinc: rebase on latest tree; use prefix IOMMU_COOKIE_; merge unions in iommu_domain; add IOMMU_COOKIE_IOMMUFD for iommufd_hwpt] Link: https://patch.msgid.link/r/1ace9076c95204bbe193ee77499d395f15f44b23.1742871535.git.nicolinc@nvidia.com Signed-off-by: Robin Murphy Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe (cherry picked from commit 6aa63a4ec947f350d1a2f9f6aba8591a2455d192) Signed-off-by: Nirmoy Das --- drivers/iommu/dma-iommu.c | 194 ++++++++++++++------------- drivers/iommu/dma-iommu.h | 5 + drivers/iommu/iommu-sva.c | 1 + drivers/iommu/iommu.c | 18 ++- drivers/iommu/iommufd/hw_pagetable.c | 3 + include/linux/iommu.h | 20 ++- 6 files changed, 143 insertions(+), 98 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 94263ed2c5644..31a7b4b816563 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -42,11 +42,6 @@ struct iommu_dma_msi_page { phys_addr_t phys; }; -enum iommu_dma_cookie_type { - IOMMU_DMA_IOVA_COOKIE, - IOMMU_DMA_MSI_COOKIE, -}; - enum iommu_dma_queue_type { IOMMU_DMA_OPTS_PER_CPU_QUEUE, IOMMU_DMA_OPTS_SINGLE_QUEUE, @@ -59,35 +54,31 @@ struct iommu_dma_options { }; struct iommu_dma_cookie { - enum iommu_dma_cookie_type type; + struct iova_domain iovad; + struct list_head msi_page_list; + /* Flush queue */ union { - /* Full allocator for IOMMU_DMA_IOVA_COOKIE */ - struct { - struct iova_domain iovad; - /* Flush queue */ - union { - struct iova_fq *single_fq; - struct iova_fq __percpu *percpu_fq; - }; - /* Number of TLB flushes that have been started */ - atomic64_t fq_flush_start_cnt; - /* Number of TLB flushes that have been finished */ - atomic64_t fq_flush_finish_cnt; - /* Timer to regularily empty the flush queues */ - struct timer_list fq_timer; - /* 1 when timer is active, 0 when not */ - atomic_t fq_timer_on; - }; - /* Trivial linear page allocator for IOMMU_DMA_MSI_COOKIE */ - dma_addr_t msi_iova; + struct iova_fq *single_fq; + struct iova_fq __percpu *percpu_fq; }; - struct list_head msi_page_list; - + /* Number of TLB flushes that have been started */ + atomic64_t fq_flush_start_cnt; + /* Number of TLB flushes that have been finished */ + atomic64_t fq_flush_finish_cnt; + /* Timer to regularily empty the flush queues */ + struct timer_list fq_timer; + /* 1 when timer is active, 0 when not */ + atomic_t fq_timer_on; /* Domain for flush queue callback; NULL if flush queue not in use */ - struct iommu_domain *fq_domain; + struct iommu_domain *fq_domain; /* Options for dma-iommu use */ - struct iommu_dma_options options; - struct mutex mutex; + struct iommu_dma_options options; + struct mutex mutex; +}; + +struct iommu_dma_msi_cookie { + dma_addr_t msi_iova; + struct list_head msi_page_list; }; static DEFINE_STATIC_KEY_FALSE(iommu_deferred_attach_enabled); @@ -369,40 +360,26 @@ int iommu_dma_init_fq(struct iommu_domain *domain) return 0; } -static inline size_t cookie_msi_granule(struct iommu_dma_cookie *cookie) -{ - if (cookie->type == IOMMU_DMA_IOVA_COOKIE) - return cookie->iovad.granule; - return PAGE_SIZE; -} - -static struct iommu_dma_cookie *cookie_alloc(enum iommu_dma_cookie_type type) -{ - struct iommu_dma_cookie *cookie; - - cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); - if (cookie) { - INIT_LIST_HEAD(&cookie->msi_page_list); - cookie->type = type; - } - return cookie; -} - /** * iommu_get_dma_cookie - Acquire DMA-API resources for a domain * @domain: IOMMU domain to prepare for DMA-API usage */ int iommu_get_dma_cookie(struct 
iommu_domain *domain) { - if (domain->iova_cookie) + struct iommu_dma_cookie *cookie; + + if (domain->cookie_type != IOMMU_COOKIE_NONE) return -EEXIST; - domain->iova_cookie = cookie_alloc(IOMMU_DMA_IOVA_COOKIE); - if (!domain->iova_cookie) + cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); + if (!cookie) return -ENOMEM; - mutex_init(&domain->iova_cookie->mutex); + mutex_init(&cookie->mutex); + INIT_LIST_HEAD(&cookie->msi_page_list); iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); + domain->cookie_type = IOMMU_COOKIE_DMA_IOVA; + domain->iova_cookie = cookie; return 0; } @@ -420,29 +397,30 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) */ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) { - struct iommu_dma_cookie *cookie; + struct iommu_dma_msi_cookie *cookie; if (domain->type != IOMMU_DOMAIN_UNMANAGED) return -EINVAL; - if (domain->iova_cookie) + if (domain->cookie_type != IOMMU_COOKIE_NONE) return -EEXIST; - cookie = cookie_alloc(IOMMU_DMA_MSI_COOKIE); + cookie = kzalloc(sizeof(*cookie), GFP_KERNEL); if (!cookie) return -ENOMEM; cookie->msi_iova = base; - domain->iova_cookie = cookie; + INIT_LIST_HEAD(&cookie->msi_page_list); iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); + domain->cookie_type = IOMMU_COOKIE_DMA_MSI; + domain->msi_cookie = cookie; return 0; } EXPORT_SYMBOL(iommu_get_msi_cookie); /** * iommu_put_dma_cookie - Release a domain's DMA mapping resources - * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() or - * iommu_get_msi_cookie() + * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie() */ void iommu_put_dma_cookie(struct iommu_domain *domain) { @@ -454,20 +432,27 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) return; #endif - if (!cookie) - return; - - if (cookie->type == IOMMU_DMA_IOVA_COOKIE && cookie->iovad.granule) { + if (cookie->iovad.granule) { iommu_dma_free_fq(cookie); put_iova_domain(&cookie->iovad); } + list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) + kfree(msi); + kfree(cookie); +} - list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) { - list_del(&msi->list); +/** + * iommu_put_msi_cookie - Release a domain's MSI mapping resources + * @domain: IOMMU domain previously prepared by iommu_get_msi_cookie() + */ +void iommu_put_msi_cookie(struct iommu_domain *domain) +{ + struct iommu_dma_msi_cookie *cookie = domain->msi_cookie; + struct iommu_dma_msi_page *msi, *tmp; + + list_for_each_entry_safe(msi, tmp, &cookie->msi_page_list, list) kfree(msi); - } kfree(cookie); - domain->iova_cookie = NULL; } /** @@ -687,7 +672,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, struct device *dev struct iova_domain *iovad; int ret; - if (!cookie || cookie->type != IOMMU_DMA_IOVA_COOKIE) + if (!cookie || domain->cookie_type != IOMMU_COOKIE_DMA_IOVA) return -EINVAL; iovad = &cookie->iovad; @@ -777,9 +762,9 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, struct iova_domain *iovad = &cookie->iovad; unsigned long shift, iova_len, iova; - if (cookie->type == IOMMU_DMA_MSI_COOKIE) { - cookie->msi_iova += size; - return cookie->msi_iova - size; + if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI) { + domain->msi_cookie->msi_iova += size; + return domain->msi_cookie->msi_iova - size; } shift = iova_shift(iovad); @@ -816,16 +801,16 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, return (dma_addr_t)iova << shift; } -static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, - dma_addr_t iova, size_t size, struct 
iommu_iotlb_gather *gather) +static void iommu_dma_free_iova(struct iommu_domain *domain, dma_addr_t iova, + size_t size, struct iommu_iotlb_gather *gather) { - struct iova_domain *iovad = &cookie->iovad; + struct iova_domain *iovad = &domain->iova_cookie->iovad; /* The MSI case is only ever cleaning up its most recent allocation */ - if (cookie->type == IOMMU_DMA_MSI_COOKIE) - cookie->msi_iova -= size; + if (domain->cookie_type == IOMMU_COOKIE_DMA_MSI) + domain->msi_cookie->msi_iova -= size; else if (gather && gather->queued) - queue_iova(cookie, iova_pfn(iovad, iova), + queue_iova(domain->iova_cookie, iova_pfn(iovad, iova), size >> iova_shift(iovad), &gather->freelist); else @@ -853,7 +838,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); + iommu_dma_free_iova(domain, dma_addr, size, &iotlb_gather); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -881,7 +866,7 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, return DMA_MAPPING_ERROR; if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) { - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -1018,7 +1003,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, out_free_sg: sg_free_table(sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; @@ -1495,7 +1480,7 @@ int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len, NULL); + iommu_dma_free_iova(domain, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); out: @@ -1773,17 +1758,47 @@ void iommu_setup_dma_ops(struct device *dev) dev->dma_iommu = false; } +static bool has_msi_cookie(const struct iommu_domain *domain) +{ + return domain && (domain->cookie_type == IOMMU_COOKIE_DMA_IOVA || + domain->cookie_type == IOMMU_COOKIE_DMA_MSI); +} + +static size_t cookie_msi_granule(const struct iommu_domain *domain) +{ + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + return domain->iova_cookie->iovad.granule; + case IOMMU_COOKIE_DMA_MSI: + return PAGE_SIZE; + default: + unreachable(); + }; +} + +static struct list_head *cookie_msi_pages(const struct iommu_domain *domain) +{ + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + return &domain->iova_cookie->msi_page_list; + case IOMMU_COOKIE_DMA_MSI: + return &domain->msi_cookie->msi_page_list; + default: + unreachable(); + }; +} + static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, phys_addr_t msi_addr, struct iommu_domain *domain) { - struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct list_head *msi_page_list = cookie_msi_pages(domain); struct iommu_dma_msi_page *msi_page; dma_addr_t iova; int prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO; - size_t size = cookie_msi_granule(cookie); + size_t size = cookie_msi_granule(domain); msi_addr &= ~(phys_addr_t)(size - 1); - list_for_each_entry(msi_page, &cookie->msi_page_list, list) + list_for_each_entry(msi_page, msi_page_list, list) if (msi_page->phys == msi_addr) return msi_page; @@ -1801,11 +1816,11 @@ static struct iommu_dma_msi_page 
*iommu_dma_get_msi_page(struct device *dev, INIT_LIST_HEAD(&msi_page->list); msi_page->phys = msi_addr; msi_page->iova = iova; - list_add(&msi_page->list, &cookie->msi_page_list); + list_add(&msi_page->list, msi_page_list); return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + iommu_dma_free_iova(domain, iova, size, NULL); out_free_page: kfree(msi_page); return NULL; @@ -1817,7 +1832,7 @@ static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, struct device *dev = msi_desc_to_dev(desc); const struct iommu_dma_msi_page *msi_page; - if (!domain->iova_cookie) { + if (!has_msi_cookie(domain)) { msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } @@ -1827,9 +1842,8 @@ static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, if (!msi_page) return -ENOMEM; - msi_desc_set_iommu_msi_iova( - desc, msi_page->iova, - ilog2(cookie_msi_granule(domain->iova_cookie))); + msi_desc_set_iommu_msi_iova(desc, msi_page->iova, + ilog2(cookie_msi_granule(domain))); return 0; } diff --git a/drivers/iommu/dma-iommu.h b/drivers/iommu/dma-iommu.h index c12d63457c764..9cca11806e5d0 100644 --- a/drivers/iommu/dma-iommu.h +++ b/drivers/iommu/dma-iommu.h @@ -13,6 +13,7 @@ void iommu_setup_dma_ops(struct device *dev); int iommu_get_dma_cookie(struct iommu_domain *domain); void iommu_put_dma_cookie(struct iommu_domain *domain); +void iommu_put_msi_cookie(struct iommu_domain *domain); int iommu_dma_init_fq(struct iommu_domain *domain); @@ -40,6 +41,10 @@ static inline void iommu_put_dma_cookie(struct iommu_domain *domain) { } +static inline void iommu_put_msi_cookie(struct iommu_domain *domain) +{ +} + static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) { } diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 503c5d23c1ea2..ab18bc494eefd 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -310,6 +310,7 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, } domain->type = IOMMU_DOMAIN_SVA; + domain->cookie_type = IOMMU_COOKIE_SVA; mmgrab(mm); domain->mm = mm; domain->owner = ops; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a95a92d82de06..1be6450231753 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2130,8 +2130,10 @@ void iommu_set_fault_handler(struct iommu_domain *domain, iommu_fault_handler_t handler, void *token) { - BUG_ON(!domain); + if (WARN_ON(!domain || domain->cookie_type != IOMMU_COOKIE_NONE)) + return; + domain->cookie_type = IOMMU_COOKIE_FAULT_HANDLER; domain->handler = handler; domain->handler_token = token; } @@ -2201,9 +2203,19 @@ EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc_flags); void iommu_domain_free(struct iommu_domain *domain) { - if (domain->type == IOMMU_DOMAIN_SVA) + switch (domain->cookie_type) { + case IOMMU_COOKIE_DMA_IOVA: + iommu_put_dma_cookie(domain); + break; + case IOMMU_COOKIE_DMA_MSI: + iommu_put_msi_cookie(domain); + break; + case IOMMU_COOKIE_SVA: mmdrop(domain->mm); - iommu_put_dma_cookie(domain); + break; + default: + break; + } if (domain->ops->free) domain->ops->free(domain); } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 9a89f3a28dc52..fded3f07bfa7b 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -160,6 +160,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, } } hwpt->domain->iommufd_hwpt = hwpt; + hwpt->domain->cookie_type = 
IOMMU_COOKIE_IOMMUFD; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* @@ -257,6 +258,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, } hwpt->domain->owner = ops; hwpt->domain->iommufd_hwpt = hwpt; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { @@ -315,6 +317,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, } hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->owner = viommu->iommu_dev->ops; + hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a8d6a095d6540..561f9898f1d76 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,6 +41,7 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; +struct iommu_dma_msi_cookie; struct iommu_fault_param; struct iommufd_ctx; struct iommufd_viommu; @@ -165,6 +166,15 @@ struct iommu_domain_geometry { bool force_aperture; /* DMA only allowed in mappable range? */ }; +enum iommu_domain_cookie_type { + IOMMU_COOKIE_NONE, + IOMMU_COOKIE_DMA_IOVA, + IOMMU_COOKIE_DMA_MSI, + IOMMU_COOKIE_FAULT_HANDLER, + IOMMU_COOKIE_SVA, + IOMMU_COOKIE_IOMMUFD, +}; + /* Domain feature flags */ #define __IOMMU_DOMAIN_PAGING (1U << 0) /* Support for iommu_map/unmap */ #define __IOMMU_DOMAIN_DMA_API (1U << 1) /* Domain for use in DMA-API @@ -211,12 +221,12 @@ struct iommu_domain_geometry { struct iommu_domain { unsigned type; + enum iommu_domain_cookie_type cookie_type; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; - struct iommu_dma_cookie *iova_cookie; int (*iopf_handler)(struct iopf_group *group); #if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) @@ -224,10 +234,10 @@ struct iommu_domain { phys_addr_t msi_addr); #endif - union { /* Pointer usable by owner of the domain */ - struct iommufd_hw_pagetable *iommufd_hwpt; /* iommufd */ - }; - union { /* Fault handler */ + union { /* cookie */ + struct iommu_dma_cookie *iova_cookie; + struct iommu_dma_msi_cookie *msi_cookie; + struct iommufd_hw_pagetable *iommufd_hwpt; struct { iommu_fault_handler_t handler; void *handler_token; From 677086460c92d03f096f5325e7974f2ed5cacf4d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 24 Mar 2025 21:05:16 -0700 Subject: [PATCH 034/147] iommufd: Move iommufd_sw_msi and related functions to driver.c To provide the iommufd_sw_msi() to the iommu core that is under a different Kconfig, move it and its related functions to driver.c. Then, stub it into the iommu-priv header. The iommufd_sw_msi_install() continues to be used by iommufd internal, so put it in the private header. Note that iommufd_sw_msi() will be called in the iommu core, replacing the sw_msi function pointer. Given that IOMMU_API is "bool" in Kconfig, change IOMMUFD_DRIVER_CORE to "bool" as well. 
Since this affects the module size, here is before-n-after size comparison: [Before] text data bss dec hex filename 18797 848 56 19701 4cf5 drivers/iommu/iommufd/device.o 722 44 0 766 2fe drivers/iommu/iommufd/driver.o [After] text data bss dec hex filename 17735 808 56 18599 48a7 drivers/iommu/iommufd/device.o 3020 180 0 3200 c80 drivers/iommu/iommufd/driver.o Link: https://patch.msgid.link/r/374c159592dba7852bee20968f3f66fa0ee8ca93.1742871535.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe (cherry picked from commit ec031e1b35ded5acfcef100d9ee7144bbfa4bc12) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu-priv.h | 13 +++ drivers/iommu/iommufd/Kconfig | 2 +- drivers/iommu/iommufd/device.c | 131 ++---------------------- drivers/iommu/iommufd/driver.c | 126 +++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 7 +- 5 files changed, 153 insertions(+), 126 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index 05fa6e682e88d..154e969285179 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -5,6 +5,7 @@ #define __LINUX_IOMMU_PRIV_H #include +#include static inline const struct iommu_ops *dev_iommu_ops(struct device *dev) { @@ -47,4 +48,16 @@ void iommu_detach_group_handle(struct iommu_domain *domain, int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, struct iommu_attach_handle *handle); + +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) && IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); +#else /* !CONFIG_IOMMUFD_DRIVER_CORE || !CONFIG_IRQ_MSI_IOMMU */ +static inline int iommufd_sw_msi(struct iommu_domain *domain, + struct msi_desc *desc, phys_addr_t msi_addr) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_IOMMUFD_DRIVER_CORE && CONFIG_IRQ_MSI_IOMMU */ + #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index 0a07f9449fd9c..2beeb4f60ee53 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config IOMMUFD_DRIVER_CORE - tristate + bool default (IOMMUFD_DRIVER || IOMMUFD) if IOMMUFD!=n config IOMMUFD diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index bd50146e2ad06..d18ea9a61522e 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -5,7 +5,6 @@ #include #include #include -#include #include "../iommu-priv.h" #include "io_pagetable.h" @@ -294,129 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); -/* - * Get a iommufd_sw_msi_map for the msi physical address requested by the irq - * layer. The mapping to IOVA is global to the iommufd file descriptor, every - * domain that is attached to a device using the same MSI parameters will use - * the same IOVA. 
- */ -static __maybe_unused struct iommufd_sw_msi_map * -iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, - phys_addr_t sw_msi_start) -{ - struct iommufd_sw_msi_map *cur; - unsigned int max_pgoff = 0; - - lockdep_assert_held(&ictx->sw_msi_lock); - - list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { - if (cur->sw_msi_start != sw_msi_start) - continue; - max_pgoff = max(max_pgoff, cur->pgoff + 1); - if (cur->msi_addr == msi_addr) - return cur; - } - - if (ictx->sw_msi_id >= - BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) - return ERR_PTR(-EOVERFLOW); - - cur = kzalloc(sizeof(*cur), GFP_KERNEL); - if (!cur) - return ERR_PTR(-ENOMEM); - - cur->sw_msi_start = sw_msi_start; - cur->msi_addr = msi_addr; - cur->pgoff = max_pgoff; - cur->id = ictx->sw_msi_id++; - list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); - return cur; -} - -static int iommufd_sw_msi_install(struct iommufd_ctx *ictx, - struct iommufd_hwpt_paging *hwpt_paging, - struct iommufd_sw_msi_map *msi_map) -{ - unsigned long iova; - - lockdep_assert_held(&ictx->sw_msi_lock); - - iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; - if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { - int rc; - - rc = iommu_map(hwpt_paging->common.domain, iova, - msi_map->msi_addr, PAGE_SIZE, - IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, - GFP_KERNEL_ACCOUNT); - if (rc) - return rc; - __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); - } - return 0; -} - -/* - * Called by the irq code if the platform translates the MSI address through the - * IOMMU. msi_addr is the physical address of the MSI page. iommufd will - * allocate a fd global iova for the physical page that is the same on all - * domains and devices. - */ #ifdef CONFIG_IRQ_MSI_IOMMU -int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr) -{ - struct device *dev = msi_desc_to_dev(desc); - struct iommufd_hwpt_paging *hwpt_paging; - struct iommu_attach_handle *raw_handle; - struct iommufd_attach_handle *handle; - struct iommufd_sw_msi_map *msi_map; - struct iommufd_ctx *ictx; - unsigned long iova; - int rc; - - /* - * It is safe to call iommu_attach_handle_get() here because the iommu - * core code invokes this under the group mutex which also prevents any - * change of the attach handle for the duration of this function. - */ - iommu_group_mutex_assert(dev); - - raw_handle = - iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); - if (IS_ERR(raw_handle)) - return 0; - hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); - - handle = to_iommufd_handle(raw_handle); - /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ - if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) - return 0; - - ictx = handle->idev->ictx; - guard(mutex)(&ictx->sw_msi_lock); - /* - * The input msi_addr is the exact byte offset of the MSI doorbell, we - * assume the caller has checked that it is contained with a MMIO region - * that is secure to map at PAGE_SIZE. 
- */ - msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, - msi_addr & PAGE_MASK, - handle->idev->igroup->sw_msi_start); - if (IS_ERR(msi_map)) - return PTR_ERR(msi_map); - - rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); - if (rc) - return rc; - __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); - - iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; - msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); - return 0; -} -#endif - static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { @@ -443,6 +320,14 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, } return 0; } +#else +static inline int +iommufd_group_setup_msi(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + return 0; +} +#endif static int iommufd_device_attach_reserved_iova(struct iommufd_device *idev, diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 75b365561c161..a08ff0f37fc6d 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -121,5 +121,131 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu, } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_event, "IOMMUFD"); +#ifdef CONFIG_IRQ_MSI_IOMMU +/* + * Get a iommufd_sw_msi_map for the msi physical address requested by the irq + * layer. The mapping to IOVA is global to the iommufd file descriptor, every + * domain that is attached to a device using the same MSI parameters will use + * the same IOVA. + */ +static struct iommufd_sw_msi_map * +iommufd_sw_msi_get_map(struct iommufd_ctx *ictx, phys_addr_t msi_addr, + phys_addr_t sw_msi_start) +{ + struct iommufd_sw_msi_map *cur; + unsigned int max_pgoff = 0; + + lockdep_assert_held(&ictx->sw_msi_lock); + + list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) { + if (cur->sw_msi_start != sw_msi_start) + continue; + max_pgoff = max(max_pgoff, cur->pgoff + 1); + if (cur->msi_addr == msi_addr) + return cur; + } + + if (ictx->sw_msi_id >= + BITS_PER_BYTE * sizeof_field(struct iommufd_sw_msi_maps, bitmap)) + return ERR_PTR(-EOVERFLOW); + + cur = kzalloc(sizeof(*cur), GFP_KERNEL); + if (!cur) + return ERR_PTR(-ENOMEM); + + cur->sw_msi_start = sw_msi_start; + cur->msi_addr = msi_addr; + cur->pgoff = max_pgoff; + cur->id = ictx->sw_msi_id++; + list_add_tail(&cur->sw_msi_item, &ictx->sw_msi_list); + return cur; +} + +int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map) +{ + unsigned long iova; + + lockdep_assert_held(&ictx->sw_msi_lock); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + if (!test_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap)) { + int rc; + + rc = iommu_map(hwpt_paging->common.domain, iova, + msi_map->msi_addr, PAGE_SIZE, + IOMMU_WRITE | IOMMU_READ | IOMMU_MMIO, + GFP_KERNEL_ACCOUNT); + if (rc) + return rc; + __set_bit(msi_map->id, hwpt_paging->present_sw_msi.bitmap); + } + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi_install, "IOMMUFD_INTERNAL"); + +/* + * Called by the irq code if the platform translates the MSI address through the + * IOMMU. msi_addr is the physical address of the MSI page. iommufd will + * allocate a fd global iova for the physical page that is the same on all + * domains and devices. 
+ */ +int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) +{ + struct device *dev = msi_desc_to_dev(desc); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommu_attach_handle *raw_handle; + struct iommufd_attach_handle *handle; + struct iommufd_sw_msi_map *msi_map; + struct iommufd_ctx *ictx; + unsigned long iova; + int rc; + + /* + * It is safe to call iommu_attach_handle_get() here because the iommu + * core code invokes this under the group mutex which also prevents any + * change of the attach handle for the duration of this function. + */ + iommu_group_mutex_assert(dev); + + raw_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(raw_handle)) + return 0; + hwpt_paging = find_hwpt_paging(domain->iommufd_hwpt); + + handle = to_iommufd_handle(raw_handle); + /* No IOMMU_RESV_SW_MSI means no change to the msi_msg */ + if (handle->idev->igroup->sw_msi_start == PHYS_ADDR_MAX) + return 0; + + ictx = handle->idev->ictx; + guard(mutex)(&ictx->sw_msi_lock); + /* + * The input msi_addr is the exact byte offset of the MSI doorbell, we + * assume the caller has checked that it is contained with a MMIO region + * that is secure to map at PAGE_SIZE. + */ + msi_map = iommufd_sw_msi_get_map(handle->idev->ictx, + msi_addr & PAGE_MASK, + handle->idev->igroup->sw_msi_start); + if (IS_ERR(msi_map)) + return PTR_ERR(msi_map); + + rc = iommufd_sw_msi_install(ictx, hwpt_paging, msi_map); + if (rc) + return rc; + __set_bit(msi_map->id, handle->idev->igroup->required_sw_msi.bitmap); + + iova = msi_map->sw_msi_start + msi_map->pgoff * PAGE_SIZE; + msi_desc_set_iommu_msi_iova(desc, iova, PAGE_SHIFT); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_sw_msi, "IOMMUFD"); +#endif + MODULE_DESCRIPTION("iommufd code shared with builtin modules"); +MODULE_IMPORT_NS("IOMMUFD_INTERNAL"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8cda9c4672ebf..8c49ca16919a2 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -32,8 +32,11 @@ struct iommufd_sw_msi_maps { DECLARE_BITMAP(bitmap, 64); }; -int iommufd_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr); +#ifdef CONFIG_IRQ_MSI_IOMMU +int iommufd_sw_msi_install(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_sw_msi_map *msi_map); +#endif struct iommufd_ctx { struct file *file; From 7f1cb5957f7d980e358f84221a6caf5b2baa38a1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 24 Mar 2025 21:05:17 -0700 Subject: [PATCH 035/147] iommu: Drop sw_msi from iommu_domain There are only two sw_msi implementations in the entire system, thus it's not very necessary to have an sw_msi pointer. Instead, check domain->cookie_type to call the two sw_msi implementations directly from the core code. 
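As a standalone sketch (hypothetical stand-in helpers; this compresses the new iommu_dma_prepare_msi() logic visible in the iommu.c hunk below), the dispatch reduces to a switch on the cookie type:

#include <errno.h>

enum cookie_type { COOKIE_NONE, COOKIE_DMA_IOVA, COOKIE_DMA_MSI, COOKIE_IOMMUFD };

struct domain { enum cookie_type cookie_type; };

static int dma_sw_msi(struct domain *d) { (void)d; return 0; }	/* for iommu_dma_sw_msi() */
static int fd_sw_msi(struct domain *d) { (void)d; return 0; }	/* for iommufd_sw_msi() */

static int prepare_msi(struct domain *d)
{
	switch (d->cookie_type) {
	case COOKIE_DMA_MSI:
	case COOKIE_DMA_IOVA:
		return dma_sw_msi(d);	/* both DMA cookies share one implementation */
	case COOKIE_IOMMUFD:
		return fd_sw_msi(d);	/* iommufd domains use the other */
	default:
		return -EOPNOTSUPP;	/* no cookie, no MSI mapping */
	}
}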
Link: https://patch.msgid.link/r/7ded87c871afcbaac665b71354de0a335087bf0f.1742871535.git.nicolinc@nvidia.com Suggested-by: Robin Murphy Reviewed-by: Robin Murphy Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe (cherry picked from commit 06d54f00f3f5a29cbf43410ac93ee2dd89e3b711) Signed-off-by: Nirmoy Das --- drivers/iommu/dma-iommu.c | 14 ++------------ drivers/iommu/dma-iommu.h | 9 +++++++++ drivers/iommu/iommu.c | 18 ++++++++++++++++-- drivers/iommu/iommufd/hw_pagetable.c | 3 --- include/linux/iommu.h | 15 --------------- 5 files changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 31a7b4b816563..2bd9f80a83fe4 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -94,9 +94,6 @@ static int __init iommu_dma_forcedac_setup(char *str) } early_param("iommu.forcedac", iommu_dma_forcedac_setup); -static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr); - /* Number of entries per flush queue */ #define IOVA_DEFAULT_FQ_SIZE 256 #define IOVA_SINGLE_FQ_SIZE 32768 @@ -377,7 +374,6 @@ int iommu_get_dma_cookie(struct iommu_domain *domain) mutex_init(&cookie->mutex); INIT_LIST_HEAD(&cookie->msi_page_list); - iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); domain->cookie_type = IOMMU_COOKIE_DMA_IOVA; domain->iova_cookie = cookie; return 0; @@ -411,7 +407,6 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base) cookie->msi_iova = base; INIT_LIST_HEAD(&cookie->msi_page_list); - iommu_domain_set_sw_msi(domain, iommu_dma_sw_msi); domain->cookie_type = IOMMU_COOKIE_DMA_MSI; domain->msi_cookie = cookie; return 0; @@ -427,11 +422,6 @@ void iommu_put_dma_cookie(struct iommu_domain *domain) struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iommu_dma_msi_page *msi, *tmp; -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - if (domain->sw_msi != iommu_dma_sw_msi) - return; -#endif - if (cookie->iovad.granule) { iommu_dma_free_fq(cookie); put_iova_domain(&cookie->iovad); @@ -1826,8 +1816,8 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return NULL; } -static int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr) +int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); const struct iommu_dma_msi_page *msi_page; diff --git a/drivers/iommu/dma-iommu.h b/drivers/iommu/dma-iommu.h index 9cca11806e5d0..eca201c1f9639 100644 --- a/drivers/iommu/dma-iommu.h +++ b/drivers/iommu/dma-iommu.h @@ -19,6 +19,9 @@ int iommu_dma_init_fq(struct iommu_domain *domain); void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); +int iommu_dma_sw_msi(struct iommu_domain *domain, struct msi_desc *desc, + phys_addr_t msi_addr); + extern bool iommu_dma_forcedac; #else /* CONFIG_IOMMU_DMA */ @@ -49,5 +52,11 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he { } +static inline int iommu_dma_sw_msi(struct iommu_domain *domain, + struct msi_desc *desc, phys_addr_t msi_addr) +{ + return -ENODEV; +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __DMA_IOMMU_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 1be6450231753..2f74dcc67fa33 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3842,8 +3843,21 @@ int 
iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) return 0; mutex_lock(&group->mutex); - if (group->domain && group->domain->sw_msi) - ret = group->domain->sw_msi(group->domain, desc, msi_addr); + /* An IDENTITY domain must pass through */ + if (group->domain && group->domain->type != IOMMU_DOMAIN_IDENTITY) { + switch (group->domain->cookie_type) { + case IOMMU_COOKIE_DMA_MSI: + case IOMMU_COOKIE_DMA_IOVA: + ret = iommu_dma_sw_msi(group->domain, desc, msi_addr); + break; + case IOMMU_COOKIE_IOMMUFD: + ret = iommufd_sw_msi(group->domain, desc, msi_addr); + break; + default: + ret = -EOPNOTSUPP; + break; + } + } mutex_unlock(&group->mutex); return ret; } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index fded3f07bfa7b..8e87ae71e1284 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -161,7 +161,6 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, } hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; - iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); /* * Set the coherency mode before we do iopt_table_add_domain() as some @@ -259,7 +258,6 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, hwpt->domain->owner = ops; hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; - iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; @@ -318,7 +316,6 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, hwpt->domain->iommufd_hwpt = hwpt; hwpt->domain->owner = viommu->iommu_dev->ops; hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; - iommu_domain_set_sw_msi(hwpt->domain, iommufd_sw_msi); if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { rc = -EINVAL; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 561f9898f1d76..3a8d35d41fdad 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -229,11 +229,6 @@ struct iommu_domain { struct iommu_domain_geometry geometry; int (*iopf_handler)(struct iopf_group *group); -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr); -#endif - union { /* cookie */ struct iommu_dma_cookie *iova_cookie; struct iommu_dma_msi_cookie *msi_cookie; @@ -254,16 +249,6 @@ struct iommu_domain { }; }; -static inline void iommu_domain_set_sw_msi( - struct iommu_domain *domain, - int (*sw_msi)(struct iommu_domain *domain, struct msi_desc *desc, - phys_addr_t msi_addr)) -{ -#if IS_ENABLED(CONFIG_IRQ_MSI_IOMMU) - domain->sw_msi = sw_msi; -#endif -} - static inline bool iommu_is_dma_domain(struct iommu_domain *domain) { return domain->type & __IOMMU_DOMAIN_DMA_API; From fb837eb6f286100864b91bbf7df2def15f5dda6d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:23 -0700 Subject: [PATCH 036/147] iommu: Require passing new handles to APIs supporting handle Add kdoc to highlight that the caller of iommu_[attach|replace]_group_handle() and iommu_attach_device_pasid() should always provide a new handle. This avoids races with lockless references to the handle, e.g. the find_fault_handler() and iommu_report_device_fault() in the PRI path.
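A minimal model of the rule, assuming a simplified attach API (hypothetical types, not the kernel code): once a handle is published it may be read locklessly, so every attach or replace must pass a freshly allocated handle instead of recycling one that readers might still hold.

#include <errno.h>
#include <stdlib.h>

struct domain { int id; };
struct attach_handle { struct domain *domain; };
struct dev_group { struct attach_handle *handle; };	/* read locklessly by fault paths */

static int attach(struct dev_group *g, struct attach_handle *h)
{
	g->handle = h;	/* once published, lockless readers may hold it */
	return 0;
}

static int attach_fresh(struct dev_group *g, struct domain *d)
{
	struct attach_handle *h = calloc(1, sizeof(*h));

	if (!h)
		return -ENOMEM;
	h->domain = d;	/* fully initialized before it becomes visible */
	return attach(g, h);
}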
Link: https://patch.msgid.link/r/20250321171940.7213-2-yi.l.liu@intel.com Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit ada14b9f1aab5b60d50dec14c17eb84a55c0f682) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2f74dcc67fa33..3ebab591f8005 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3551,6 +3551,9 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, * @pasid: the pasid of the device. * @handle: the attach handle. * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle if it intends to pass a valid handle. + * * Return: 0 on success, or an error. */ int iommu_attach_device_pasid(struct iommu_domain *domain, @@ -3717,6 +3720,9 @@ EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, "IOMMUFD_INTERNAL"); * This is a variant of iommu_attach_group(). It allows the caller to provide * an attach handle and use it when the domain is attached. This is currently * used by IOMMUFD to deliver the I/O page faults. + * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle. */ int iommu_attach_group_handle(struct iommu_domain *domain, struct iommu_group *group, @@ -3786,6 +3792,9 @@ EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, "IOMMUFD_INTERNAL"); * * If the currently attached domain is a core domain (e.g. a default_domain), * it will act just like the iommu_attach_group_handle(). + * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle. */ int iommu_replace_group_handle(struct iommu_group *group, struct iommu_domain *new_domain, From 436ebadc9d6f2ab7252a0bce35509c463bc9db16 Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Tue, 5 Aug 2025 04:26:34 -0700 Subject: [PATCH 037/147] NVIDIA: SAUCE: Revert "iommu: Skip PASID validation for devices without PASID capability" This reverts commit 5c14fd0850d6191efff4c7ac34acb5a463397080. 
Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3ebab591f8005..5a6fd62b3667a 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3511,12 +3511,10 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, int ret; for_each_group_device(group, device) { - if (device->dev->iommu->max_pasids > 0) { - ret = domain->ops->set_dev_pasid(domain, device->dev, - pasid, NULL); - if (ret) - goto err_revert; - } + ret = domain->ops->set_dev_pasid(domain, device->dev, + pasid, NULL); + if (ret) + goto err_revert; } return 0; @@ -3526,8 +3524,7 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, for_each_group_device(group, device) { if (device == last_gdev) break; - if (device->dev->iommu->max_pasids > 0) - iommu_remove_dev_pasid(device->dev, pasid, domain); + iommu_remove_dev_pasid(device->dev, pasid, domain); } return ret; } @@ -3538,10 +3535,8 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, { struct group_device *device; - for_each_group_device(group, device) { - if (device->dev->iommu->max_pasids > 0) - iommu_remove_dev_pasid(device->dev, pasid, domain); - } + for_each_group_device(group, device) + iommu_remove_dev_pasid(device->dev, pasid, domain); } /* @@ -3582,13 +3577,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); for_each_group_device(group, device) { - /* - * Skip PASID validation for devices without PASID support - * (max_pasids = 0). These devices cannot issue transactions - * with PASID, so they don't affect group's PASID usage. - */ - if ((device->dev->iommu->max_pasids > 0) && - (pasid >= device->dev->iommu->max_pasids)) { + if (pasid >= device->dev->iommu->max_pasids) { ret = -EINVAL; goto out_unlock; } From af3e847af0fee1137533a3e61260d3d915828e0d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:24 -0700 Subject: [PATCH 038/147] iommu: Introduce a replace API for device pasid Provide a high-level API to allow replacements of one domain with another for specific pasid of a device. This is similar to iommu_replace_group_handle() and it is expected to be used only by IOMMUFD. 
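In outline, the replace semantics reduce to the following sketch (a hypothetical single-slot model; the real code uses the group->pasid_array xarray): the pasid must already be attached, and a failed switch leaves the old configuration in place.

#include <errno.h>
#include <stddef.h>

struct domain { int id; };

/* One slot stands in for the pasid_array entry of a given pasid. */
static struct domain *pasid_slot;

static int replace_pasid(struct domain *new_dom,
			 int (*set_dev_pasid)(struct domain *nd, struct domain *od))
{
	struct domain *old = pasid_slot;
	int ret;

	if (!old)		/* never attached: not a replace */
		return -EINVAL;
	if (old == new_dom)	/* same domain: nothing to switch */
		return 0;

	ret = set_dev_pasid(new_dom, old);	/* driver swaps the translation */
	if (ret)
		return ret;	/* slot untouched: old configuration kept */

	pasid_slot = new_dom;	/* commit only after the switch succeeded */
	return 0;
}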
Link: https://patch.msgid.link/r/20250321171940.7213-3-yi.l.liu@intel.com Co-developed-by: Lu Baolu Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 8a9e1e773f60080f6d56bd997719d4e62048b2d3) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu-priv.h | 3 + drivers/iommu/iommu.c | 115 +++++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index 154e969285179..e236b932e7668 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -60,4 +60,7 @@ static inline int iommufd_sw_msi(struct iommu_domain *domain, } #endif /* CONFIG_IOMMUFD_DRIVER_CORE && CONFIG_IRQ_MSI_IOMMU */ +int iommu_replace_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle); #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5a6fd62b3667a..78c64dc924537 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -525,6 +525,13 @@ static void iommu_deinit_device(struct device *dev) #endif } +static struct iommu_domain *pasid_array_entry_to_domain(void *entry) +{ + if (xa_pointer_tag(entry) == IOMMU_PASID_ARRAY_DOMAIN) + return xa_untag_pointer(entry); + return ((struct iommu_attach_handle *)xa_untag_pointer(entry))->domain; +} + DEFINE_MUTEX(iommu_probe_device_lock); static int __iommu_probe_device(struct device *dev, struct list_head *group_list) @@ -3505,14 +3512,15 @@ static void iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, } static int __iommu_set_group_pasid(struct iommu_domain *domain, - struct iommu_group *group, ioasid_t pasid) + struct iommu_group *group, ioasid_t pasid, + struct iommu_domain *old) { struct group_device *device, *last_gdev; int ret; for_each_group_device(group, device) { ret = domain->ops->set_dev_pasid(domain, device->dev, - pasid, NULL); + pasid, old); if (ret) goto err_revert; } @@ -3524,7 +3532,15 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, for_each_group_device(group, device) { if (device == last_gdev) break; - iommu_remove_dev_pasid(device->dev, pasid, domain); + /* + * If no old domain, undo the succeeded devices/pasid. + * Otherwise, rollback the succeeded devices/pasid to the old + * domain. And it is a driver bug to fail attaching with a + * previously good domain. + */ + if (!old || WARN_ON(old->ops->set_dev_pasid(old, device->dev, + pasid, domain))) + iommu_remove_dev_pasid(device->dev, pasid, domain); } return ret; } @@ -3593,7 +3609,7 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, if (ret) goto out_unlock; - ret = __iommu_set_group_pasid(domain, group, pasid); + ret = __iommu_set_group_pasid(domain, group, pasid, NULL); if (ret) { xa_release(&group->pasid_array, pasid); goto out_unlock; @@ -3614,6 +3630,97 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); +/** + * iommu_replace_device_pasid - Replace the domain that a specific pasid + * of the device is attached to + * @domain: the new iommu domain + * @dev: the attached device. + * @pasid: the pasid of the device. + * @handle: the attach handle. + * + * This API allows the pasid to switch domains. The @pasid should have been + * attached. Otherwise, this fails. The pasid will keep the old configuration + * if replacement failed. 
+ * + * Caller should always provide a new handle to avoid race with the paths + * that have lockless reference to handle if it intends to pass a valid handle. + * + * Return 0 on success, or an error. + */ +int iommu_replace_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle) +{ + /* Caller must be a probed driver on dev */ + struct iommu_group *group = dev->iommu_group; + struct iommu_attach_handle *entry; + struct iommu_domain *curr_domain; + void *curr; + int ret; + + if (!group) + return -ENODEV; + + if (!domain->ops->set_dev_pasid) + return -EOPNOTSUPP; + + if (dev_iommu_ops(dev) != domain->owner || + pasid == IOMMU_NO_PASID || !handle) + return -EINVAL; + + mutex_lock(&group->mutex); + entry = iommu_make_pasid_array_entry(domain, handle); + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, + XA_ZERO_ENTRY, GFP_KERNEL); + if (xa_is_err(curr)) { + ret = xa_err(curr); + goto out_unlock; + } + + /* + * No domain (with or without handle) attached, hence not + * a replace case. + */ + if (!curr) { + xa_release(&group->pasid_array, pasid); + ret = -EINVAL; + goto out_unlock; + } + + /* + * Reusing handle is problematic as there are paths that refers + * the handle without lock. To avoid race, reject the callers that + * attempt it. + */ + if (curr == entry) { + WARN_ON(1); + ret = -EINVAL; + goto out_unlock; + } + + curr_domain = pasid_array_entry_to_domain(curr); + ret = 0; + + if (curr_domain != domain) { + ret = __iommu_set_group_pasid(domain, group, + pasid, curr_domain); + if (ret) + goto out_unlock; + } + + /* + * The above xa_cmpxchg() reserved the memory, and the + * group->mutex is held, this cannot fail. + */ + WARN_ON(xa_is_err(xa_store(&group->pasid_array, + pasid, entry, GFP_KERNEL))); + +out_unlock: + mutex_unlock(&group->mutex); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_replace_device_pasid, "IOMMUFD_INTERNAL"); + /* * iommu_detach_device_pasid() - Detach the domain from pasid of device * @domain: the iommu domain. From 67af2b88f7f933f4e5337ab64051bc71bf0ba346 Mon Sep 17 00:00:00 2001 From: Tushar Dave Date: Mon, 19 May 2025 18:19:37 -0700 Subject: [PATCH 039/147] iommu: Skip PASID validation for devices without PASID capability Generally PASID support requires ACS settings that usually create single device groups, but there are some niche cases where we can get multi-device groups and still have working PASID support. The primary issue is that PCI switches are not required to treat PASID tagged TLPs specially so appropriate ACS settings are required to route all TLPs to the host bridge if PASID is going to work properly. pci_enable_pasid() does check that each device that will use PASID has the proper ACS settings to achieve this routing. However, no-PASID devices can be combined with PASID capable devices within the same topology using non-uniform ACS settings. In this case the no-PASID devices may not have strict route to host ACS flags and end up being grouped with the PASID devices. This configuration fails to allow use of the PASID within the iommu core code which wrongly checks if the no-PASID device supports PASID. Fix this by ignoring no-PASID devices during the PASID validation. They will never issue a PASID TLP anyhow so they can be ignored. 
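The fix itself is a one-line filter in the group walk; a standalone sketch (hypothetical device array; max_pasids == 0 marks a device without PASID capability, as in the patch):

#include <errno.h>
#include <stddef.h>

struct pasid_dev { unsigned int max_pasids; };	/* 0 == no PASID capability */

static int group_pasid_ok(const struct pasid_dev *devs, size_t n, unsigned int pasid)
{
	for (size_t i = 0; i < n; i++) {
		if (devs[i].max_pasids == 0)
			continue;		/* can never issue a PASID TLP: skip */
		if (pasid >= devs[i].max_pasids)
			return -EINVAL;		/* out of range for this device */
	}
	return 0;
}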
Fixes: c404f55c26fc ("iommu: Validate the PASID in iommu_attach_device_pasid()") Cc: stable@vger.kernel.org Signed-off-by: Tushar Dave Reviewed-by: Lu Baolu Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20250520011937.3230557-1-tdave@nvidia.com Signed-off-by: Joerg Roedel (cherry picked from commit b3f6fcd8404f9f92262303369bb877ec5d188a81) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 78c64dc924537..5cbcc95f7ba88 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3519,10 +3519,12 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, int ret; for_each_group_device(group, device) { - ret = domain->ops->set_dev_pasid(domain, device->dev, - pasid, old); - if (ret) - goto err_revert; + if (device->dev->iommu->max_pasids > 0) { + ret = domain->ops->set_dev_pasid(domain, device->dev, + pasid, old); + if (ret) + goto err_revert; + } } return 0; @@ -3532,15 +3534,18 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, for_each_group_device(group, device) { if (device == last_gdev) break; - /* - * If no old domain, undo the succeeded devices/pasid. - * Otherwise, rollback the succeeded devices/pasid to the old - * domain. And it is a driver bug to fail attaching with a - * previously good domain. - */ - if (!old || WARN_ON(old->ops->set_dev_pasid(old, device->dev, + if (device->dev->iommu->max_pasids > 0) { + /* + * If no old domain, undo the succeeded devices/pasid. + * Otherwise, rollback the succeeded devices/pasid to + * the old domain. And it is a driver bug to fail + * attaching with a previously good domain. + */ + if (!old || + WARN_ON(old->ops->set_dev_pasid(old, device->dev, pasid, domain))) - iommu_remove_dev_pasid(device->dev, pasid, domain); + iommu_remove_dev_pasid(device->dev, pasid, domain); + } } return ret; } @@ -3551,8 +3556,10 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, { struct group_device *device; - for_each_group_device(group, device) - iommu_remove_dev_pasid(device->dev, pasid, domain); + for_each_group_device(group, device) { + if (device->dev->iommu->max_pasids > 0) + iommu_remove_dev_pasid(device->dev, pasid, domain); + } } /* @@ -3593,7 +3600,13 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, mutex_lock(&group->mutex); for_each_group_device(group, device) { - if (pasid >= device->dev->iommu->max_pasids) { + /* + * Skip PASID validation for devices without PASID support + * (max_pasids = 0). These devices cannot issue transactions + * with PASID, so they don't affect group's PASID usage. + */ + if ((device->dev->iommu->max_pasids > 0) && + (pasid >= device->dev->iommu->max_pasids)) { ret = -EINVAL; goto out_unlock; } From f9d08442ada5067560748fb615c16c3daac35742 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:25 -0700 Subject: [PATCH 040/147] iommufd: Pass @pasid through the device attach/replace path Most of the core logic before conducting the actual device attach/ replace operation can be shared with pasid attach/replace. So pass @pasid through the device attach/replace helpers to prepare adding pasid attach/replace. So far the @pasid should only be IOMMU_NO_PASID. No functional change. 
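As a sketch of the resulting call shape (taken from the diff below; only the RID path exists at this point, so every caller passes the sentinel):

	/* before: the helpers implied the RID */
	rc = iommufd_hw_pagetable_attach(hwpt, idev);

	/* after: the PASID is explicit; IOMMU_NO_PASID selects the RID path */
	rc = iommufd_hw_pagetable_attach(hwpt, idev, IOMMU_NO_PASID);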
Link: https://patch.msgid.link/r/20250321171940.7213-4-yi.l.liu@intel.com Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 03c9b102bea6f4f0b517c841fe1d2f9c616c95b9) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 70 +++++++++++++++---------- drivers/iommu/iommufd/hw_pagetable.c | 13 ++--- drivers/iommu/iommufd/iommufd_private.h | 8 +-- 3 files changed, 52 insertions(+), 39 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index d18ea9a61522e..7051feda2fab0 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -368,7 +368,8 @@ static bool iommufd_device_is_attached(struct iommufd_device *idev) } static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) + struct iommufd_device *idev, + ioasid_t pasid) { struct iommufd_attach_handle *handle; int rc; @@ -386,6 +387,7 @@ static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, } handle->idev = idev; + WARN_ON(pasid != IOMMU_NO_PASID); rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, &handle->handle); if (rc) @@ -402,25 +404,28 @@ static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, } static struct iommufd_attach_handle * -iommufd_device_get_attach_handle(struct iommufd_device *idev) +iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid) { struct iommu_attach_handle *handle; lockdep_assert_held(&idev->igroup->lock); handle = - iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); + iommu_attach_handle_get(idev->igroup->group, pasid, 0); if (IS_ERR(handle)) return NULL; return to_iommufd_handle(handle); } static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) + struct iommufd_device *idev, + ioasid_t pasid) { struct iommufd_attach_handle *handle; - handle = iommufd_device_get_attach_handle(idev); + WARN_ON(pasid != IOMMU_NO_PASID); + + handle = iommufd_device_get_attach_handle(idev, pasid); iommu_detach_group_handle(hwpt->domain, idev->igroup->group); if (hwpt->fault) { iommufd_auto_response_faults(hwpt, handle); @@ -430,13 +435,17 @@ static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, } static int iommufd_hwpt_replace_device(struct iommufd_device *idev, + ioasid_t pasid, struct iommufd_hw_pagetable *hwpt, struct iommufd_hw_pagetable *old) { - struct iommufd_attach_handle *handle, *old_handle = - iommufd_device_get_attach_handle(idev); + struct iommufd_attach_handle *handle, *old_handle; int rc; + WARN_ON(pasid != IOMMU_NO_PASID); + + old_handle = iommufd_device_get_attach_handle(idev, pasid); + handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) return -ENOMEM; @@ -471,7 +480,7 @@ static int iommufd_hwpt_replace_device(struct iommufd_device *idev, } int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) + struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); int rc; @@ -497,7 +506,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. 
*/ if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_hwpt_attach_device(hwpt, idev); + rc = iommufd_hwpt_attach_device(hwpt, idev, pasid); if (rc) goto err_unresv; idev->igroup->hwpt = hwpt; @@ -515,7 +524,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, } struct iommufd_hw_pagetable * -iommufd_hw_pagetable_detach(struct iommufd_device *idev) +iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); @@ -523,7 +532,7 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) mutex_lock(&idev->igroup->lock); list_del(&idev->group_item); if (list_empty(&idev->igroup->device_list)) { - iommufd_hwpt_detach_device(hwpt, idev); + iommufd_hwpt_detach_device(hwpt, idev, pasid); idev->igroup->hwpt = NULL; } if (hwpt_paging) @@ -535,12 +544,12 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) } static struct iommufd_hw_pagetable * -iommufd_device_do_attach(struct iommufd_device *idev, +iommufd_device_do_attach(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hw_pagetable *hwpt) { int rc; - rc = iommufd_hw_pagetable_attach(hwpt, idev); + rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid); if (rc) return ERR_PTR(rc); return NULL; @@ -589,7 +598,7 @@ iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, } static struct iommufd_hw_pagetable * -iommufd_device_do_replace(struct iommufd_device *idev, +iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hw_pagetable *hwpt) { struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); @@ -623,7 +632,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, goto err_unlock; } - rc = iommufd_hwpt_replace_device(idev, hwpt, old_hwpt); + rc = iommufd_hwpt_replace_device(idev, pasid, hwpt, old_hwpt); if (rc) goto err_unresv; @@ -656,7 +665,8 @@ iommufd_device_do_replace(struct iommufd_device *idev, } typedef struct iommufd_hw_pagetable *(*attach_fn)( - struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt); + struct iommufd_device *idev, ioasid_t pasid, + struct iommufd_hw_pagetable *hwpt); /* * When automatically managing the domains we search for a compatible domain in @@ -664,7 +674,7 @@ typedef struct iommufd_hw_pagetable *(*attach_fn)( * Automatic domain selection will never pick a manually created domain. 
*/ static struct iommufd_hw_pagetable * -iommufd_device_auto_get_domain(struct iommufd_device *idev, +iommufd_device_auto_get_domain(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_ioas *ioas, u32 *pt_id, attach_fn do_attach) { @@ -693,7 +703,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt = &hwpt_paging->common; if (!iommufd_lock_obj(&hwpt->obj)) continue; - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) { iommufd_put_object(idev->ictx, &hwpt->obj); /* @@ -711,8 +721,8 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } - hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, - immediate_attach, NULL); + hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, pasid, + 0, immediate_attach, NULL); if (IS_ERR(hwpt_paging)) { destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; @@ -720,7 +730,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, hwpt = &hwpt_paging->common; if (!immediate_attach) { - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) goto out_abort; } else { @@ -741,8 +751,9 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, return destroy_hwpt; } -static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, - attach_fn do_attach) +static int iommufd_device_change_pt(struct iommufd_device *idev, + ioasid_t pasid, + u32 *pt_id, attach_fn do_attach) { struct iommufd_hw_pagetable *destroy_hwpt; struct iommufd_object *pt_obj; @@ -757,7 +768,7 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); - destroy_hwpt = (*do_attach)(idev, hwpt); + destroy_hwpt = (*do_attach)(idev, pasid, hwpt); if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; @@ -766,8 +777,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, struct iommufd_ioas *ioas = container_of(pt_obj, struct iommufd_ioas, obj); - destroy_hwpt = iommufd_device_auto_get_domain(idev, ioas, pt_id, - do_attach); + destroy_hwpt = iommufd_device_auto_get_domain(idev, pasid, ioas, + pt_id, do_attach); if (IS_ERR(destroy_hwpt)) goto out_put_pt_obj; break; @@ -804,7 +815,8 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) { int rc; - rc = iommufd_device_change_pt(idev, pt_id, &iommufd_device_do_attach); + rc = iommufd_device_change_pt(idev, IOMMU_NO_PASID, pt_id, + &iommufd_device_do_attach); if (rc) return rc; @@ -834,7 +846,7 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD"); */ int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) { - return iommufd_device_change_pt(idev, pt_id, + return iommufd_device_change_pt(idev, IOMMU_NO_PASID, pt_id, &iommufd_device_do_replace); } EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD"); @@ -850,7 +862,7 @@ void iommufd_device_detach(struct iommufd_device *idev) { struct iommufd_hw_pagetable *hwpt; - hwpt = iommufd_hw_pagetable_detach(idev); + hwpt = iommufd_hw_pagetable_detach(idev, IOMMU_NO_PASID); iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 8e87ae71e1284..bd9dd26a52950 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -90,6 +90,7 @@ iommufd_hwpt_paging_enforce_cc(struct 
iommufd_hwpt_paging *hwpt_paging) * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for + * @pasid: PASID to get an iommu_domain for * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt * @user_data: The user provided driver specific data describing the domain to @@ -105,8 +106,8 @@ iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) */ struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach, + struct iommufd_device *idev, ioasid_t pasid, + u32 flags, bool immediate_attach, const struct iommu_user_data *user_data) { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | @@ -189,7 +190,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * sequence. Once those drivers are fixed this should be removed. */ if (immediate_attach) { - rc = iommufd_hw_pagetable_attach(hwpt, idev); + rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid); if (rc) goto out_abort; } @@ -202,7 +203,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, out_detach: if (immediate_attach) - iommufd_hw_pagetable_detach(idev); + iommufd_hw_pagetable_detach(idev, pasid); out_abort: iommufd_object_abort_and_destroy(ictx, &hwpt->obj); return ERR_PTR(rc); @@ -364,8 +365,8 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) ioas = container_of(pt_obj, struct iommufd_ioas, obj); mutex_lock(&ioas->mutex); hwpt_paging = iommufd_hwpt_paging_alloc( - ucmd->ictx, ioas, idev, cmd->flags, false, - user_data.len ? &user_data : NULL); + ucmd->ictx, ioas, idev, IOMMU_NO_PASID, cmd->flags, + false, user_data.len ? &user_data : NULL); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); goto out_unlock; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 8c49ca16919a2..891800948d1a6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -369,13 +369,13 @@ int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, u32 flags, - bool immediate_attach, + struct iommufd_device *idev, ioasid_t pasid, + u32 flags, bool immediate_attach, const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); + struct iommufd_device *idev, ioasid_t pasid); struct iommufd_hw_pagetable * -iommufd_hw_pagetable_detach(struct iommufd_device *idev); +iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); From f3a7a373f54333debc6a6071051877b9cf4390bd Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:26 -0700 Subject: [PATCH 041/147] iommufd/device: Only add reserved_iova in non-pasid path As the pasid is passed through the attach/replace/detach helpers, it is necessary to ensure only the non-pasid path adds reserved_iova. 
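The change centres on one predicate, shown here as it appears in the diff below: reserved regions describe the RID address space, so they are only enforced when attaching with IOMMU_NO_PASID:

	bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID;

	if (attach_resv) {
		rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging);
		if (rc)
			goto err_unlock;
	}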
Link: https://patch.msgid.link/r/20250321171940.7213-5-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit bc06f7f66de404ae6323963361fe4e2f5f71a1e5) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 7051feda2fab0..4625f084f7d0b 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -483,6 +483,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; int rc; mutex_lock(&idev->igroup->lock); @@ -492,7 +493,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - if (hwpt_paging) { + if (attach_resv) { rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging); if (rc) goto err_unlock; @@ -516,7 +517,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&idev->igroup->lock); return 0; err_unresv: - if (hwpt_paging) + if (attach_resv) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); @@ -535,7 +536,7 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) iommufd_hwpt_detach_device(hwpt, idev, pasid); idev->igroup->hwpt = NULL; } - if (hwpt_paging) + if (hwpt_paging && pasid == IOMMU_NO_PASID) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); mutex_unlock(&idev->igroup->lock); @@ -602,6 +603,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hw_pagetable *hwpt) { struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; @@ -626,7 +628,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, } old_hwpt = igroup->hwpt; - if (hwpt_paging) { + if (attach_resv) { rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging); if (rc) goto err_unlock; @@ -637,7 +639,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, goto err_unresv; old_hwpt_paging = find_hwpt_paging(old_hwpt); - if (old_hwpt_paging && + if (old_hwpt_paging && pasid == IOMMU_NO_PASID && (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas)) iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging); @@ -657,7 +659,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - if (hwpt_paging) + if (attach_resv) iommufd_group_remove_reserved_iova(igroup, hwpt_paging); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); } From 3fd529e27de58977138a7fa88528b3d08e4a0f3e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:27 -0700 Subject: [PATCH 042/147] iommufd/device: Replace idev->igroup with local variable With more use of the fields of igroup, use a local variable instead of using the idev->igroup heavily. No functional change expected.
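The transformation, in brief (a sketch of the pattern the diff below applies throughout the file):

	struct iommufd_group *igroup = idev->igroup;

	mutex_lock(&igroup->lock);
	/* ... the body now references igroup->... rather than idev->igroup->... */
	mutex_unlock(&igroup->lock);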
Link: https://patch.msgid.link/r/20250321171940.7213-6-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 2eaa7f845e149370a162bedb0437c9e26229760c) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 43 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 4625f084f7d0b..15733b316b707 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -333,18 +333,19 @@ static int iommufd_device_attach_reserved_iova(struct iommufd_device *idev, struct iommufd_hwpt_paging *hwpt_paging) { + struct iommufd_group *igroup = idev->igroup; int rc; - lockdep_assert_held(&idev->igroup->lock); + lockdep_assert_held(&igroup->lock); rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, idev->dev, - &idev->igroup->sw_msi_start); + &igroup->sw_msi_start); if (rc) return rc; - if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); + if (list_empty(&igroup->device_list)) { + rc = iommufd_group_setup_msi(igroup, hwpt_paging); if (rc) { iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); @@ -484,11 +485,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, { struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; + struct iommufd_group *igroup = idev->igroup; int rc; - mutex_lock(&idev->igroup->lock); + mutex_lock(&igroup->lock); - if (idev->igroup->hwpt != NULL && idev->igroup->hwpt != hwpt) { + if (igroup->hwpt && igroup->hwpt != hwpt) { rc = -EINVAL; goto err_unlock; } @@ -506,39 +508,40 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * reserved regions are only updated during individual device * attachment. 
*/ - if (list_empty(&idev->igroup->device_list)) { + if (list_empty(&igroup->device_list)) { rc = iommufd_hwpt_attach_device(hwpt, idev, pasid); if (rc) goto err_unresv; - idev->igroup->hwpt = hwpt; + igroup->hwpt = hwpt; } refcount_inc(&hwpt->obj.users); - list_add_tail(&idev->group_item, &idev->igroup->device_list); - mutex_unlock(&idev->igroup->lock); + list_add_tail(&idev->group_item, &igroup->device_list); + mutex_unlock(&igroup->lock); return 0; err_unresv: if (attach_resv) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); err_unlock: - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); return rc; } struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) { - struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hw_pagetable *hwpt = igroup->hwpt; struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); - mutex_lock(&idev->igroup->lock); + mutex_lock(&igroup->lock); list_del(&idev->group_item); - if (list_empty(&idev->igroup->device_list)) { + if (list_empty(&igroup->device_list)) { iommufd_hwpt_detach_device(hwpt, idev, pasid); - idev->igroup->hwpt = NULL; + igroup->hwpt = NULL; } if (hwpt_paging && pasid == IOMMU_NO_PASID) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); /* Caller must destroy hwpt */ return hwpt; @@ -610,7 +613,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, unsigned int num_devices; int rc; - mutex_lock(&idev->igroup->lock); + mutex_lock(&igroup->lock); if (igroup->hwpt == NULL) { rc = -EINVAL; @@ -623,7 +626,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, } if (hwpt == igroup->hwpt) { - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); return NULL; } @@ -654,7 +657,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, if (num_devices > 1) WARN_ON(refcount_sub_and_test(num_devices - 1, &old_hwpt->obj.users)); - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); /* Caller must destroy old_hwpt */ return old_hwpt; @@ -662,7 +665,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, if (attach_resv) iommufd_group_remove_reserved_iova(igroup, hwpt_paging); err_unlock: - mutex_unlock(&idev->igroup->lock); + mutex_unlock(&igroup->lock); return ERR_PTR(rc); } From dc02e6bdce7214181ce7493bae4b56a2199655af Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:28 -0700 Subject: [PATCH 043/147] iommufd/device: Add helper to detect the first attach of a group The existing code detects the first attach by checking the igroup->device_list. However, the igroup->hwpt can also be used to detect the first attach. In future modifications, it is better to check the igroup->hwpt instead of the device_list. To improve readability and also prepare for further modifications on this part, this adds a helper for it.
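A sketch of the intended call sites (the helper itself is in the diff below; its pasid argument is unused for now and becomes meaningful once per-PASID tracking lands in later patches):

	if (iommufd_group_first_attach(igroup, pasid)) {
		/* first device of the group: set up MSI and do the real attach */
	}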
Link: https://patch.msgid.link/r/20250321171940.7213-7-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit ba1de6cd41d0654245864640b62ae45a1bc01bcd) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 15733b316b707..2cc3c12d301d6 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -329,6 +329,13 @@ iommufd_group_setup_msi(struct iommufd_group *igroup, } #endif +static bool +iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid) +{ + lockdep_assert_held(&igroup->lock); + return !igroup->hwpt; +} + static int iommufd_device_attach_reserved_iova(struct iommufd_device *idev, struct iommufd_hwpt_paging *hwpt_paging) @@ -344,7 +351,7 @@ iommufd_device_attach_reserved_iova(struct iommufd_device *idev, if (rc) return rc; - if (list_empty(&igroup->device_list)) { + if (iommufd_group_first_attach(igroup, IOMMU_NO_PASID)) { rc = iommufd_group_setup_msi(igroup, hwpt_paging); if (rc) { iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, @@ -508,7 +515,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * reserved regions are only updated during individual device * attachment. */ - if (list_empty(&igroup->device_list)) { + if (iommufd_group_first_attach(igroup, pasid)) { rc = iommufd_hwpt_attach_device(hwpt, idev, pasid); if (rc) goto err_unresv; From 22976825bcb582e5bf417934bf7511c7bcaff25a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:29 -0700 Subject: [PATCH 044/147] iommufd/device: Wrap igroup->hwpt and igroup->device_list into attach struct The igroup->hwpt and igroup->device_list are used to track the hwpt attach of a group in the RID path, while the coming PASID path also needs such tracking. To be prepared, wrap igroup->hwpt and igroup->device_list into an attach struct which is allocated when the first device of the group attaches and freed when the last device of the group detaches. Link: https://patch.msgid.link/r/20250321171940.7213-8-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 75f990aef38e930f8b676562c4d4b02c1f5eccfd) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 76 ++++++++++++++++++------- drivers/iommu/iommufd/iommufd_private.h | 5 +- 2 files changed, 58 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 2cc3c12d301d6..6b4764c2d9af6 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -17,12 +17,17 @@ MODULE_PARM_DESC( "Allow IOMMUFD to bind to devices even if the platform cannot isolate " "the MSI interrupt window.
Enabling this is a security weakness."); +struct iommufd_attach { + struct iommufd_hw_pagetable *hwpt; + struct list_head device_list; +}; + static void iommufd_group_release(struct kref *kref) { struct iommufd_group *igroup = container_of(kref, struct iommufd_group, ref); - WARN_ON(igroup->hwpt || !list_empty(&igroup->device_list)); + WARN_ON(igroup->attach); xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, NULL, GFP_KERNEL); @@ -89,7 +94,6 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, kref_init(&new_igroup->ref); mutex_init(&new_igroup->lock); - INIT_LIST_HEAD(&new_igroup->device_list); new_igroup->sw_msi_start = PHYS_ADDR_MAX; /* group reference moves into new_igroup */ new_igroup->group = group; @@ -333,7 +337,7 @@ static bool iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid) { lockdep_assert_held(&igroup->lock); - return !igroup->hwpt; + return !igroup->attach; } static int @@ -369,7 +373,7 @@ static bool iommufd_device_is_attached(struct iommufd_device *idev) { struct iommufd_device *cur; - list_for_each_entry(cur, &idev->igroup->device_list, group_item) + list_for_each_entry(cur, &idev->igroup->attach->device_list, group_item) if (cur == idev) return true; return false; @@ -493,19 +497,33 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID; struct iommufd_group *igroup = idev->igroup; + struct iommufd_hw_pagetable *old_hwpt; + struct iommufd_attach *attach; int rc; mutex_lock(&igroup->lock); - if (igroup->hwpt && igroup->hwpt != hwpt) { + attach = igroup->attach; + if (!attach) { + attach = kzalloc(sizeof(*attach), GFP_KERNEL); + if (!attach) { + rc = -ENOMEM; + goto err_unlock; + } + INIT_LIST_HEAD(&attach->device_list); + } + + old_hwpt = attach->hwpt; + + if (old_hwpt && old_hwpt != hwpt) { rc = -EINVAL; - goto err_unlock; + goto err_free_attach; } if (attach_resv) { rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging); if (rc) - goto err_unlock; + goto err_free_attach; } /* @@ -519,15 +537,19 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, rc = iommufd_hwpt_attach_device(hwpt, idev, pasid); if (rc) goto err_unresv; - igroup->hwpt = hwpt; + attach->hwpt = hwpt; + igroup->attach = attach; } refcount_inc(&hwpt->obj.users); - list_add_tail(&idev->group_item, &igroup->device_list); + list_add_tail(&idev->group_item, &attach->device_list); mutex_unlock(&igroup->lock); return 0; err_unresv: if (attach_resv) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); +err_free_attach: + if (iommufd_group_first_attach(igroup, pasid)) + kfree(attach); err_unlock: mutex_unlock(&igroup->lock); return rc; @@ -537,14 +559,20 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_group *igroup = idev->igroup; - struct iommufd_hw_pagetable *hwpt = igroup->hwpt; - struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_attach *attach; mutex_lock(&igroup->lock); + attach = igroup->attach; + hwpt = attach->hwpt; + hwpt_paging = find_hwpt_paging(hwpt); + list_del(&idev->group_item); - if (list_empty(&igroup->device_list)) { + if (list_empty(&attach->device_list)) { iommufd_hwpt_detach_device(hwpt, idev, pasid); - igroup->hwpt = NULL; + igroup->attach = NULL; + kfree(attach); } if 
(hwpt_paging && pasid == IOMMU_NO_PASID) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); @@ -574,7 +602,7 @@ iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, lockdep_assert_held(&igroup->lock); - list_for_each_entry(cur, &igroup->device_list, group_item) + list_for_each_entry(cur, &igroup->attach->device_list, group_item) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); } @@ -588,9 +616,10 @@ iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, lockdep_assert_held(&igroup->lock); - old_hwpt_paging = find_hwpt_paging(igroup->hwpt); + old_hwpt_paging = find_hwpt_paging(igroup->attach->hwpt); if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) { + list_for_each_entry(cur, + &igroup->attach->device_list, group_item) { rc = iopt_table_enforce_dev_resv_regions( &hwpt_paging->ioas->iopt, cur->dev, NULL); if (rc) @@ -617,27 +646,32 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; + struct iommufd_attach *attach; unsigned int num_devices; int rc; mutex_lock(&igroup->lock); - if (igroup->hwpt == NULL) { + attach = igroup->attach; + if (!attach) { rc = -EINVAL; goto err_unlock; } + old_hwpt = attach->hwpt; + + WARN_ON(!old_hwpt || list_empty(&attach->device_list)); + if (!iommufd_device_is_attached(idev)) { rc = -EINVAL; goto err_unlock; } - if (hwpt == igroup->hwpt) { + if (hwpt == old_hwpt) { mutex_unlock(&igroup->lock); return NULL; } - old_hwpt = igroup->hwpt; if (attach_resv) { rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging); if (rc) @@ -653,9 +687,9 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas)) iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging); - igroup->hwpt = hwpt; + attach->hwpt = hwpt; - num_devices = list_count_nodes(&igroup->device_list); + num_devices = list_count_nodes(&attach->device_list); /* * Move the refcounts held by the device_list to the new hwpt. Retain a * refcount for this thread as the caller will free it. diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 891800948d1a6..5b4d8962166bd 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -399,13 +399,14 @@ static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, refcount_dec(&hwpt->obj.users); } +struct iommufd_attach; + struct iommufd_group { struct kref ref; struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; - struct iommufd_hw_pagetable *hwpt; - struct list_head device_list; + struct iommufd_attach *attach; struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; From 574bc366c8454ee411c12aa5b3b20c7162862aa3 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:30 -0700 Subject: [PATCH 045/147] iommufd/device: Replace device_list with device_array igroup->attach->device_list is used to track the attached devices of a group in the RID path. Such tracking is also needed in the PASID path in order to share the path with the RID path. However, there is only one list_head in the iommufd_device, which cannot work if the device has been attached in both the RID path and the PASID path. To solve it, replace the device_list with an xarray.
The attached iommufd_device is stored in the entry indexed by the idev->obj.id. Link: https://patch.msgid.link/r/20250321171940.7213-9-yi.l.liu@intel.com Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 831b40f8416cf393faf41b3ae2e877a73aa6baa3) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 58 +++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 6b4764c2d9af6..760917f5d7649 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -19,7 +19,7 @@ MODULE_PARM_DESC( struct iommufd_attach { struct iommufd_hw_pagetable *hwpt; - struct list_head device_list; + struct xarray device_array; }; static void iommufd_group_release(struct kref *kref) @@ -297,6 +297,20 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); +static unsigned int iommufd_group_device_num(struct iommufd_group *igroup) +{ + struct iommufd_device *idev; + unsigned int count = 0; + unsigned long index; + + lockdep_assert_held(&igroup->lock); + + if (igroup->attach) + xa_for_each(&igroup->attach->device_array, index, idev) + count++; + return count; +} + #ifdef CONFIG_IRQ_MSI_IOMMU static int iommufd_group_setup_msi(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) @@ -371,12 +385,7 @@ iommufd_device_attach_reserved_iova(struct iommufd_device *idev, /* Check if idev is attached to igroup->hwpt */ static bool iommufd_device_is_attached(struct iommufd_device *idev) { - struct iommufd_device *cur; - - list_for_each_entry(cur, &idev->igroup->attach->device_list, group_item) - if (cur == idev) - return true; - return false; + return xa_load(&idev->igroup->attach->device_array, idev->obj.id); } static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, @@ -510,20 +519,27 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, rc = -ENOMEM; goto err_unlock; } - INIT_LIST_HEAD(&attach->device_list); + xa_init(&attach->device_array); } old_hwpt = attach->hwpt; + rc = xa_insert(&attach->device_array, idev->obj.id, XA_ZERO_ENTRY, + GFP_KERNEL); + if (rc) { + WARN_ON(rc == -EBUSY && !old_hwpt); + goto err_free_attach; + } + if (old_hwpt && old_hwpt != hwpt) { rc = -EINVAL; - goto err_free_attach; + goto err_release_devid; } if (attach_resv) { rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging); if (rc) - goto err_free_attach; + goto err_release_devid; } /* @@ -541,12 +557,15 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, igroup->attach = attach; } refcount_inc(&hwpt->obj.users); - list_add_tail(&idev->group_item, &attach->device_list); + WARN_ON(xa_is_err(xa_store(&attach->device_array, idev->obj.id, + idev, GFP_KERNEL))); mutex_unlock(&igroup->lock); return 0; err_unresv: if (attach_resv) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); +err_release_devid: + xa_release(&attach->device_array, idev->obj.id); err_free_attach: if (iommufd_group_first_attach(igroup, pasid)) kfree(attach); @@ -568,8 +587,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) hwpt = attach->hwpt; hwpt_paging = find_hwpt_paging(hwpt); - list_del(&idev->group_item); - if (list_empty(&attach->device_list)) { + xa_erase(&attach->device_array, idev->obj.id); + if (xa_empty(&attach->device_array)) { iommufd_hwpt_detach_device(hwpt, 
idev, pasid); igroup->attach = NULL; kfree(attach); @@ -599,10 +618,11 @@ iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { struct iommufd_device *cur; + unsigned long index; lockdep_assert_held(&igroup->lock); - list_for_each_entry(cur, &igroup->attach->device_list, group_item) + xa_for_each(&igroup->attach->device_array, index, cur) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); } @@ -612,14 +632,14 @@ iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, { struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_device *cur; + unsigned long index; int rc; lockdep_assert_held(&igroup->lock); old_hwpt_paging = find_hwpt_paging(igroup->attach->hwpt); if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) { - list_for_each_entry(cur, - &igroup->attach->device_list, group_item) { + xa_for_each(&igroup->attach->device_array, index, cur) { rc = iopt_table_enforce_dev_resv_regions( &hwpt_paging->ioas->iopt, cur->dev, NULL); if (rc) @@ -660,7 +680,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, old_hwpt = attach->hwpt; - WARN_ON(!old_hwpt || list_empty(&attach->device_list)); + WARN_ON(!old_hwpt || xa_empty(&attach->device_array)); if (!iommufd_device_is_attached(idev)) { rc = -EINVAL; @@ -689,9 +709,9 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, attach->hwpt = hwpt; - num_devices = list_count_nodes(&attach->device_list); + num_devices = iommufd_group_device_num(igroup); /* - * Move the refcounts held by the device_list to the new hwpt. Retain a + * Move the refcounts held by the device_array to the new hwpt. Retain a * refcount for this thread as the caller will free it. */ refcount_add(num_devices, &hwpt->obj.users); From a7262bda328e1e55688366eb1f706834b7b4cb5d Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:31 -0700 Subject: [PATCH 046/147] iommufd/device: Add pasid_attach array to track per-PASID attach PASIDs of PASID-capable device can be attached to hwpt separately, hence a pasid array to track per-PASID attachment is necessary. The index IOMMU_NO_PASID is used by the RID path. Hence drop the igroup->attach. 
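A sketch of the lookup pattern the diff below converts the code to; IOMMU_NO_PASID indexes the RID attachment, so the RID path becomes just the pasid == IOMMU_NO_PASID case:

	struct iommufd_attach *attach;

	attach = xa_load(&igroup->pasid_attach, pasid);
	if (!attach) {
		/* first attach for this pasid */
	}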
Link: https://patch.msgid.link/r/20250321171940.7213-10-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit c0e301b2978d319d78ed332290989f3499ef9e63) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 59 +++++++++++++++++-------- drivers/iommu/iommufd/iommufd_private.h | 2 +- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 760917f5d7649..175f3d39baaad 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -27,7 +27,7 @@ static void iommufd_group_release(struct kref *kref) struct iommufd_group *igroup = container_of(kref, struct iommufd_group, ref); - WARN_ON(igroup->attach); + WARN_ON(!xa_empty(&igroup->pasid_attach)); xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup, NULL, GFP_KERNEL); @@ -94,6 +94,7 @@ static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx, kref_init(&new_igroup->ref); mutex_init(&new_igroup->lock); + xa_init(&new_igroup->pasid_attach); new_igroup->sw_msi_start = PHYS_ADDR_MAX; /* group reference moves into new_igroup */ new_igroup->group = group; @@ -297,16 +298,19 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD"); -static unsigned int iommufd_group_device_num(struct iommufd_group *igroup) +static unsigned int iommufd_group_device_num(struct iommufd_group *igroup, + ioasid_t pasid) { + struct iommufd_attach *attach; struct iommufd_device *idev; unsigned int count = 0; unsigned long index; lockdep_assert_held(&igroup->lock); - if (igroup->attach) - xa_for_each(&igroup->attach->device_array, index, idev) + attach = xa_load(&igroup->pasid_attach, pasid); + if (attach) + xa_for_each(&attach->device_array, index, idev) count++; return count; } @@ -351,7 +355,7 @@ static bool iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid) { lockdep_assert_held(&igroup->lock); - return !igroup->attach; + return !xa_load(&igroup->pasid_attach, pasid); } static int @@ -382,10 +386,13 @@ iommufd_device_attach_reserved_iova(struct iommufd_device *idev, /* The device attach/detach/replace helpers for attach_handle */ -/* Check if idev is attached to igroup->hwpt */ -static bool iommufd_device_is_attached(struct iommufd_device *idev) +static bool iommufd_device_is_attached(struct iommufd_device *idev, + ioasid_t pasid) { - return xa_load(&idev->igroup->attach->device_array, idev->obj.id); + struct iommufd_attach *attach; + + attach = xa_load(&idev->igroup->pasid_attach, pasid); + return xa_load(&attach->device_array, idev->obj.id); } static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, @@ -512,12 +519,18 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_lock(&igroup->lock); - attach = igroup->attach; + attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL, + XA_ZERO_ENTRY, GFP_KERNEL); + if (xa_is_err(attach)) { + rc = xa_err(attach); + goto err_unlock; + } + if (!attach) { attach = kzalloc(sizeof(*attach), GFP_KERNEL); if (!attach) { rc = -ENOMEM; - goto err_unlock; + goto err_release_pasid; } xa_init(&attach->device_array); } @@ -554,7 +567,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, if (rc) goto err_unresv; attach->hwpt = hwpt; - igroup->attach = attach; + WARN_ON(xa_is_err(xa_store(&igroup->pasid_attach, pasid, attach, + GFP_KERNEL))); } refcount_inc(&hwpt->obj.users); 
WARN_ON(xa_is_err(xa_store(&attach->device_array, idev->obj.id, @@ -569,6 +583,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, err_free_attach: if (iommufd_group_first_attach(igroup, pasid)) kfree(attach); +err_release_pasid: + if (iommufd_group_first_attach(igroup, pasid)) + xa_release(&igroup->pasid_attach, pasid); err_unlock: mutex_unlock(&igroup->lock); return rc; @@ -583,14 +600,14 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid) struct iommufd_attach *attach; mutex_lock(&igroup->lock); - attach = igroup->attach; + attach = xa_load(&igroup->pasid_attach, pasid); hwpt = attach->hwpt; hwpt_paging = find_hwpt_paging(hwpt); xa_erase(&attach->device_array, idev->obj.id); if (xa_empty(&attach->device_array)) { iommufd_hwpt_detach_device(hwpt, idev, pasid); - igroup->attach = NULL; + xa_erase(&igroup->pasid_attach, pasid); kfree(attach); } if (hwpt_paging && pasid == IOMMU_NO_PASID) @@ -617,12 +634,14 @@ static void iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { + struct iommufd_attach *attach; struct iommufd_device *cur; unsigned long index; lockdep_assert_held(&igroup->lock); - xa_for_each(&igroup->attach->device_array, index, cur) + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + xa_for_each(&attach->device_array, index, cur) iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); } @@ -631,15 +650,17 @@ iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, struct iommufd_hwpt_paging *hwpt_paging) { struct iommufd_hwpt_paging *old_hwpt_paging; + struct iommufd_attach *attach; struct iommufd_device *cur; unsigned long index; int rc; lockdep_assert_held(&igroup->lock); - old_hwpt_paging = find_hwpt_paging(igroup->attach->hwpt); + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + old_hwpt_paging = find_hwpt_paging(attach->hwpt); if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) { - xa_for_each(&igroup->attach->device_array, index, cur) { + xa_for_each(&attach->device_array, index, cur) { rc = iopt_table_enforce_dev_resv_regions( &hwpt_paging->ioas->iopt, cur->dev, NULL); if (rc) @@ -672,7 +693,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, mutex_lock(&igroup->lock); - attach = igroup->attach; + attach = xa_load(&igroup->pasid_attach, pasid); if (!attach) { rc = -EINVAL; goto err_unlock; @@ -682,7 +703,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, WARN_ON(!old_hwpt || xa_empty(&attach->device_array)); - if (!iommufd_device_is_attached(idev)) { + if (!iommufd_device_is_attached(idev, pasid)) { rc = -EINVAL; goto err_unlock; } @@ -709,7 +730,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid, attach->hwpt = hwpt; - num_devices = iommufd_group_device_num(igroup); + num_devices = iommufd_group_device_num(igroup, pasid); /* * Move the refcounts held by the device_array to the new hwpt. Retain a * refcount for this thread as the caller will free it. 
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 5b4d8962166bd..85467f53bdb29 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -406,7 +406,7 @@ struct iommufd_group { struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; - struct iommufd_attach *attach; + struct xarray pasid_attach; struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; }; From 0fe1a262f07c06b019bb0840f7161883b0d89b12 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:32 -0700 Subject: [PATCH 047/147] iommufd: Enforce PASID-compatible domain in PASID path AMD IOMMU requires attaching PASID-compatible domains to PASID-capable devices. This includes the domains attached to RID and PASIDs. Related discussions are in links [1] and [2]. ARM also has such a requirement; Intel does not need it but can live with it. Hence, iommufd is going to enforce this requirement as it is not harmful to vendors that do not need it. Mark the PASID-compatible domains and enforce it in the PASID path. [1] https://lore.kernel.org/linux-iommu/20240709182303.GK14050@ziepe.ca/ [2] https://lore.kernel.org/linux-iommu/20240822124433.GD3468552@ziepe.ca/ Link: https://patch.msgid.link/r/20250321171940.7213-11-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit ff3f014ebb1e2fbafd407243e57fbad314472cc1) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 17 +++++++++++++++++ drivers/iommu/iommufd/hw_pagetable.c | 3 +++ drivers/iommu/iommufd/iommufd_private.h | 1 + 3 files changed, 21 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 175f3d39baaad..ba21b81e43bc3 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -395,6 +395,15 @@ static bool iommufd_device_is_attached(struct iommufd_device *idev, return xa_load(&attach->device_array, idev->obj.id); } +static int iommufd_hwpt_pasid_compat(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev, + ioasid_t pasid) +{ + if (pasid != IOMMU_NO_PASID && !hwpt->pasid_compat) + return -EINVAL; + return 0; +} + static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev, ioasid_t pasid) @@ -404,6 +413,10 @@ static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, lockdep_assert_held(&idev->igroup->lock); + rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); + if (rc) + return rc; + handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) return -ENOMEM; @@ -472,6 +485,10 @@ static int iommufd_hwpt_replace_device(struct iommufd_device *idev, WARN_ON(pasid != IOMMU_NO_PASID); + rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); + if (rc) + return rc; + old_handle = iommufd_device_get_attach_handle(idev, pasid); handle = kzalloc(sizeof(*handle), GFP_KERNEL); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index bd9dd26a52950..3724533a23c96 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -136,6 +136,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (IS_ERR(hwpt_paging)) return ERR_CAST(hwpt_paging); hwpt = &hwpt_paging->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; INIT_LIST_HEAD(&hwpt_paging->hwpt_item);
/* Pairs with iommufd_hw_pagetable_destroy() */ @@ -244,6 +245,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, if (IS_ERR(hwpt_nested)) return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; @@ -300,6 +302,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, if (IS_ERR(hwpt_nested)) return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; + hwpt->pasid_compat = flags & IOMMU_HWPT_ALLOC_PASID; hwpt_nested->viommu = viommu; refcount_inc(&viommu->obj.users); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 85467f53bdb29..80e8c76d25f23 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -299,6 +299,7 @@ struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; struct iommufd_fault *fault; + bool pasid_compat : 1; }; struct iommufd_hwpt_paging { From 568f4493f86863ea83b1d8528b67d8ecc498fc81 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:33 -0700 Subject: [PATCH 048/147] iommufd: Support pasid attach/replace This extends the below APIs to support PASID. Device drivers use them to manage pasid attach/replace/detach. int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, u32 *pt_id); int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); The pasid operations share underlying attach/replace/detach infrastructure with the device operations, but still have some different implications: - no reserved region per pasid, otherwise the SVA architecture is already broken (the CPU address space doesn't count device reserved regions); - accordingly no sw_msi trick; Cache coherency enforcement is still applied to pasid operations since it is about memory accesses post page table walking (no matter whether the walk is per RID or per PASID).
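A hypothetical consumer-side flow for the extended API (a sketch only: error unwinding is trimmed, and ioas_id/new_pt_id are placeholder IDs rather than names from this series):

	u32 pt_id = ioas_id;
	int rc;

	rc = iommufd_device_attach(idev, pasid, &pt_id);
	if (rc)
		return rc;

	/* switch the pasid to another page table without a blocked window */
	rc = iommufd_device_replace(idev, pasid, &new_pt_id);
	if (rc)
		return rc;

	/* tear down; the pasid returns to blocked DMA translation */
	iommufd_device_detach(idev, pasid);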
Link: https://patch.msgid.link/r/20250321171940.7213-12-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Signed-off-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 2fb69c602d57f77483b8dcdd12d17408a09f76fe) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 59 ++++++++++++++++++++------------ drivers/iommu/iommufd/selftest.c | 8 ++--- drivers/vfio/iommufd.c | 10 +++--- include/linux/iommufd.h | 9 +++-- 4 files changed, 53 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index ba21b81e43bc3..4cc6de03f76ee 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -428,9 +428,12 @@ static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, } handle->idev = idev; - WARN_ON(pasid != IOMMU_NO_PASID); - rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, - &handle->handle); + if (pasid == IOMMU_NO_PASID) + rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, + &handle->handle); + else + rc = iommu_attach_device_pasid(hwpt->domain, idev->dev, pasid, + &handle->handle); if (rc) goto out_disable_iopf; @@ -464,10 +467,12 @@ static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, { struct iommufd_attach_handle *handle; - WARN_ON(pasid != IOMMU_NO_PASID); - handle = iommufd_device_get_attach_handle(idev, pasid); - iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + if (pasid == IOMMU_NO_PASID) + iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + else + iommu_detach_device_pasid(hwpt->domain, idev->dev, pasid); + if (hwpt->fault) { iommufd_auto_response_faults(hwpt, handle); iommufd_fault_iopf_disable(idev); @@ -483,8 +488,6 @@ static int iommufd_hwpt_replace_device(struct iommufd_device *idev, struct iommufd_attach_handle *handle, *old_handle; int rc; - WARN_ON(pasid != IOMMU_NO_PASID); - rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); if (rc) return rc; @@ -502,8 +505,12 @@ static int iommufd_hwpt_replace_device(struct iommufd_device *idev, } handle->idev = idev; - rc = iommu_replace_group_handle(idev->igroup->group, hwpt->domain, - &handle->handle); + if (pasid == IOMMU_NO_PASID) + rc = iommu_replace_group_handle(idev->igroup->group, + hwpt->domain, &handle->handle); + else + rc = iommu_replace_device_pasid(hwpt->domain, idev->dev, + pasid, &handle->handle); if (rc) goto out_disable_iopf; @@ -904,22 +911,25 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, } /** - * iommufd_device_attach - Connect a device to an iommu_domain + * iommufd_device_attach - Connect a device/pasid to an iommu_domain * @idev: device to attach + * @pasid: pasid to attach * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING * Output the IOMMUFD_OBJ_HWPT_PAGING ID * - * This connects the device to an iommu_domain, either automatically or manually - * selected. Once this completes the device could do DMA. + * This connects the device/pasid to an iommu_domain, either automatically + * or manually selected. Once this completes the device could do DMA with + * @pasid. @pasid is IOMMU_NO_PASID if this attach is for no pasid usage. * * The caller should return the resulting pt_id back to userspace. * This function is undone by calling iommufd_device_detach(). 
*/ -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id) { int rc; - rc = iommufd_device_change_pt(idev, IOMMU_NO_PASID, pt_id, + rc = iommufd_device_change_pt(idev, pasid, pt_id, &iommufd_device_do_attach); if (rc) return rc; @@ -934,8 +944,9 @@ int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD"); /** - * iommufd_device_replace - Change the device's iommu_domain + * iommufd_device_replace - Change the device/pasid's iommu_domain * @idev: device to change + * @pasid: pasid to change * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING * Output the IOMMUFD_OBJ_HWPT_PAGING ID * @@ -946,27 +957,31 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD"); * * If it fails then no change is made to the attachment. The iommu driver may * implement this so there is no disruption in translation. This can only be - * called if iommufd_device_attach() has already succeeded. + * called if iommufd_device_attach() has already succeeded. @pasid is + * IOMMU_NO_PASID for no pasid usage. */ -int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id) +int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id) { - return iommufd_device_change_pt(idev, IOMMU_NO_PASID, pt_id, + return iommufd_device_change_pt(idev, pasid, pt_id, &iommufd_device_do_replace); } EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD"); /** - * iommufd_device_detach - Disconnect a device to an iommu_domain + * iommufd_device_detach - Disconnect a device/pasid to an iommu_domain + * @idev: device to detach + * @pasid: pasid to detach * * Undo iommufd_device_attach(). This disconnects the idev from the previously * attached pt_id. The device returns back to a blocked DMA translation. + * @pasid is IOMMU_NO_PASID for no pasid usage.
*/ -void iommufd_device_detach(struct iommufd_device *idev) +void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid) { struct iommufd_hw_pagetable *hwpt; - hwpt = iommufd_hw_pagetable_detach(idev, IOMMU_NO_PASID); + hwpt = iommufd_hw_pagetable_detach(idev, pasid); iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d55dde28e9bc4..0b3f5cbf242b1 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -945,7 +945,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, } sobj->idev.idev = idev; - rc = iommufd_device_attach(idev, &pt_id); + rc = iommufd_device_attach(idev, IOMMU_NO_PASID, &pt_id); if (rc) goto out_unbind; @@ -960,7 +960,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, return 0; out_detach: - iommufd_device_detach(idev); + iommufd_device_detach(idev, IOMMU_NO_PASID); out_unbind: iommufd_device_unbind(idev); out_mdev: @@ -994,7 +994,7 @@ static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, goto out_dev_obj; } - rc = iommufd_device_replace(sobj->idev.idev, &pt_id); + rc = iommufd_device_replace(sobj->idev.idev, IOMMU_NO_PASID, &pt_id); if (rc) goto out_dev_obj; @@ -1655,7 +1655,7 @@ void iommufd_selftest_destroy(struct iommufd_object *obj) switch (sobj->type) { case TYPE_IDEV: - iommufd_device_detach(sobj->idev.idev); + iommufd_device_detach(sobj->idev.idev, IOMMU_NO_PASID); iommufd_device_unbind(sobj->idev.idev); mock_dev_destroy(sobj->idev.mock_dev); break; diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 516294fd901be..37e1efa2c7bfe 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -128,7 +128,7 @@ void vfio_iommufd_physical_unbind(struct vfio_device *vdev) lockdep_assert_held(&vdev->dev_set->lock); if (vdev->iommufd_attached) { - iommufd_device_detach(vdev->iommufd_device); + iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID); vdev->iommufd_attached = false; } iommufd_device_unbind(vdev->iommufd_device); @@ -146,9 +146,11 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) return -EINVAL; if (vdev->iommufd_attached) - rc = iommufd_device_replace(vdev->iommufd_device, pt_id); + rc = iommufd_device_replace(vdev->iommufd_device, + IOMMU_NO_PASID, pt_id); else - rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + rc = iommufd_device_attach(vdev->iommufd_device, + IOMMU_NO_PASID, pt_id); if (rc) return rc; vdev->iommufd_attached = true; @@ -163,7 +165,7 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev) if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached) return; - iommufd_device_detach(vdev->iommufd_device); + iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID); vdev->iommufd_attached = false; } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 60eff9272551d..34b6e6ca4bfa0 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -54,9 +55,11 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); -int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); -int iommufd_device_replace(struct iommufd_device *idev, u32 *pt_id); -void iommufd_device_detach(struct iommufd_device *idev); +int 
iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid, + u32 *pt_id); +void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); u32 iommufd_device_to_id(struct iommufd_device *idev); From c74a78c6487046fc86404fb5e218f82ca76b5361 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:34 -0700 Subject: [PATCH 049/147] iommufd: Enforce PASID-compatible domain for RID Per the definition of IOMMU_HWPT_ALLOC_PASID, iommufd needs to enforce that the RID uses a PASID-compatible domain if a PASID has been attached, and vice versa. The PASID path already enforces it. This adds the enforcement in the RID path. This enforcement requires a lock across the RID and PASID attach paths; the idev->igroup->lock is used since both the RID and the PASID paths hold it. Link: https://patch.msgid.link/r/20250321171940.7213-13-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 4c3f4f432c2d61ed266c797702bb58659f90bdff) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 4cc6de03f76ee..1605f6c0e1eee 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -399,8 +399,28 @@ static int iommufd_hwpt_pasid_compat(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev, ioasid_t pasid) { - if (pasid != IOMMU_NO_PASID && !hwpt->pasid_compat) - return -EINVAL; + struct iommufd_group *igroup = idev->igroup; + + lockdep_assert_held(&igroup->lock); + + if (pasid == IOMMU_NO_PASID) { + unsigned long start = IOMMU_NO_PASID; + + if (!hwpt->pasid_compat && + xa_find_after(&igroup->pasid_attach, + &start, UINT_MAX, XA_PRESENT)) + return -EINVAL; + } else { + struct iommufd_attach *attach; + + if (!hwpt->pasid_compat) + return -EINVAL; + + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + if (attach && attach->hwpt && !attach->hwpt->pasid_compat) + return -EINVAL; + } + return 0; } @@ -411,8 +431,6 @@ static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, struct iommufd_attach_handle *handle; int rc; - lockdep_assert_held(&idev->igroup->lock); - rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid); if (rc) return rc; From 2461f01f8f523151291f502f2d7361f5000f4979 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:35 -0700 Subject: [PATCH 050/147] iommu/vt-d: Add IOMMU_HWPT_ALLOC_PASID support The Intel iommu driver simply treats it as a nop since Intel VT-d does not have special requirements on domains attached to either the PASID or the RID of a PASID-capable device. 
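Concretely, "treats it as a nop" means the driver only widens its accepted-flags mask and never acts on the flag itself. A minimal sketch of the resulting check (an illustrative restatement of the hunk below, not new driver logic):

	/* Reject unknown flags; IOMMU_HWPT_ALLOC_PASID is accepted
	 * but triggers no VT-d specific behavior.
	 */
	if (flags & ~(IOMMU_HWPT_ALLOC_NEST_PARENT |
		      IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
		      IOMMU_HWPT_ALLOC_PASID))
		return ERR_PTR(-EOPNOTSUPP);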
Link: https://patch.msgid.link/r/20250321171940.7213-14-yi.l.liu@intel.com Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit ce15c13e7a1423cf418f825d33ab1747b151cfd6) Signed-off-by: Nirmoy Das --- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/intel/nested.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6ceed4b3091e7..3457b59e922b6 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3374,7 +3374,8 @@ intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags, bool first_stage; if (flags & - (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_PASID))) return ERR_PTR(-EOPNOTSUPP); if (nested_parent && !nested_supported(iommu)) return ERR_PTR(-EOPNOTSUPP); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index e80e32a0e973f..1e149169ee77b 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -198,7 +198,7 @@ intel_iommu_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, struct dmar_domain *domain; int ret; - if (!nested_supported(iommu) || flags) + if (!nested_supported(iommu) || flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); /* Must be nested domain */ From 3bb382e4803283541b3e0a91fd202ce465f87687 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:36 -0700 Subject: [PATCH 051/147] iommufd: Allow allocating PASID-compatible domain The underlying infrastructure now supports PASID attach and the related enforcement per the requirement of the IOMMU_HWPT_ALLOC_PASID flag. This extends iommufd to support PASID-compatible domains requested by userspace. 
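For illustration, userspace would request a PASID-compatible hwpt roughly as below (a sketch against the iommufd uapi: "iommufd" is assumed to be an open /dev/iommu fd, dev_id/ioas_id are pre-existing object IDs, and error handling is omitted):

	struct iommu_hwpt_alloc cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HWPT_ALLOC_PASID,
		.dev_id = dev_id,
		.pt_id = ioas_id,
	};

	/* On success, cmd.out_hwpt_id names a hwpt that can be attached
	 * to the RID or to any PASID of the device.
	 */
	ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd);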
Link: https://patch.msgid.link/r/20250321171940.7213-15-yi.l.liu@intel.com Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit dbc5f37b4f8ad833132f77c1f67e68bb11ca9b9e) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/hw_pagetable.c | 7 ++++--- include/uapi/linux/iommufd.h | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 3724533a23c96..487779470261a 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -112,7 +112,8 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, { const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_FAULT_ID_VALID; + IOMMU_HWPT_FAULT_ID_VALID | + IOMMU_HWPT_ALLOC_PASID; const struct iommu_ops *ops = dev_iommu_ops(idev->dev); struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; @@ -233,7 +234,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt; int rc; - if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) || + if ((flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) || !user_data->len || !ops->domain_alloc_nested) return ERR_PTR(-EOPNOTSUPP); if (parent->auto_domain || !parent->nest_parent || @@ -290,7 +291,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, struct iommufd_hw_pagetable *hwpt; int rc; - if (flags & ~IOMMU_HWPT_FAULT_ID_VALID) + if (flags & ~(IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID)) return ERR_PTR(-EOPNOTSUPP); if (!user_data->len) return ERR_PTR(-EOPNOTSUPP); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 8719d4f5d6183..6901804ec736a 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -393,6 +393,9 @@ struct iommu_vfio_ioas { * Any domain attached to the non-PASID part of the * device must also be flagged, otherwise attaching a * PASID will be blocked. + * For a user that wants to attach a PASID, attaching + * an ioas is not recommended for either the non-PASID + * part or the PASID part of the device. * If IOMMU does not support PASID it will return * error (-EOPNOTSUPP). */ From cb33fb883fa2b934b1ca8492e706a9cfbc0770e6 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:37 -0700 Subject: [PATCH 052/147] iommufd/selftest: Add set_dev_pasid in mock iommu The callback is needed to make the pasid_attach/detach paths complete for the mock device. A nop is enough for set_dev_pasid. A MOCK_FLAGS_DEVICE_PASID is added to indicate a pasid-capable mock device for the pasid test cases. Other test cases will still create a non-pasid mock device, while the mock iommu itself always pretends to be pasid-capable. 
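For example, the pasid test cases can then create a pasid-capable mock device by passing the new flag, while existing tests keep flags == 0 (a sketch based on the selftest helper exercised later in this series):

	/* Pasid-capable mock device; plain tests pass 0 instead */
	test_cmd_mock_domain_flags(ioas_id, MOCK_FLAGS_DEVICE_PASID,
				   &stdev_id, &hwpt_id, &device_id);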
Link: https://patch.msgid.link/r/20250321171940.7213-16-yi.l.liu@intel.com Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 9eb59204d5197b4add63968c8c5b7633631f9a5a) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_test.h | 4 +++ drivers/iommu/iommufd/selftest.c | 37 ++++++++++++++++++++++++---- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 87e9165cea270..1a066feb8697e 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -49,6 +49,7 @@ enum { enum { MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, + MOCK_FLAGS_DEVICE_PASID = 1 << 2, }; enum { @@ -154,6 +155,9 @@ struct iommu_test_cmd { }; #define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) +/* Mock device/iommu PASID width */ +#define MOCK_PASID_WIDTH 20 + /* Mock structs for IOMMU_DEVICE_GET_HW_INFO ioctl */ #define IOMMU_HW_INFO_TYPE_SELFTEST 0xfeedbeef #define IOMMU_HW_INFO_SELFTEST_REGVAL 0xdeadbeef diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 0b3f5cbf242b1..aa3da0adc4e12 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -223,8 +223,16 @@ static int mock_domain_nop_attach(struct iommu_domain *domain, return 0; } +static int mock_domain_set_dev_pasid_nop(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid, + struct iommu_domain *old) +{ + return 0; +} + static const struct iommu_domain_ops mock_blocking_ops = { .attach_dev = mock_domain_nop_attach, + .set_dev_pasid = mock_domain_set_dev_pasid_nop }; static struct iommu_domain mock_blocking_domain = { @@ -366,7 +374,7 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, struct mock_iommu_domain_nested *mock_nested; struct mock_iommu_domain *mock_parent; - if (flags) + if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); if (!parent || parent->ops != mock_ops.default_domain_ops) return ERR_PTR(-EINVAL); @@ -388,7 +396,8 @@ mock_domain_alloc_paging_flags(struct device *dev, u32 flags, { bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | - IOMMU_HWPT_ALLOC_NEST_PARENT; + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID; struct mock_dev *mdev = to_mock_dev(dev); bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; struct mock_iommu_domain *mock; @@ -608,7 +617,7 @@ mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, struct mock_viommu *mock_viommu = to_mock_viommu(viommu); struct mock_iommu_domain_nested *mock_nested; - if (flags) + if (flags & ~IOMMU_HWPT_ALLOC_PASID) return ERR_PTR(-EOPNOTSUPP); mock_nested = __mock_domain_alloc_nested(user_data); @@ -743,6 +752,7 @@ static const struct iommu_ops mock_ops = { .map_pages = mock_domain_map_pages, .unmap_pages = mock_domain_unmap_pages, .iova_to_phys = mock_domain_iova_to_phys, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, }, }; @@ -803,6 +813,7 @@ static struct iommu_domain_ops domain_nested_ops = { .free = mock_domain_free_nested, .attach_dev = mock_domain_nop_attach, .cache_invalidate_user = mock_domain_cache_invalidate_user, + .set_dev_pasid = mock_domain_set_dev_pasid_nop, }; static inline struct iommufd_hw_pagetable * @@ -862,11 +873,17 @@ static void mock_dev_release(struct device *dev) static struct 
mock_dev *mock_dev_create(unsigned long dev_flags) { + struct property_entry prop[] = { + PROPERTY_ENTRY_U32("pasid-num-bits", 0), + {}, + }; + const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | + MOCK_FLAGS_DEVICE_HUGE_IOVA | + MOCK_FLAGS_DEVICE_PASID; struct mock_dev *mdev; int rc, i; - if (dev_flags & - ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) + if (dev_flags & ~valid_flags) return ERR_PTR(-EINVAL); mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); @@ -890,6 +907,15 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) if (rc) goto err_put; + if (dev_flags & MOCK_FLAGS_DEVICE_PASID) + prop[0] = PROPERTY_ENTRY_U32("pasid-num-bits", MOCK_PASID_WIDTH); + + rc = device_create_managed_software_node(&mdev->dev, prop, NULL); + if (rc) { + dev_err(&mdev->dev, "add pasid-num-bits property failed, rc: %d", rc); + goto err_put; + } + rc = device_add(&mdev->dev); if (rc) goto err_put; @@ -1778,6 +1804,7 @@ int __init iommufd_test_init(void) init_completion(&mock_iommu.complete); mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); + mock_iommu.iommu_dev.max_pasids = (1 << MOCK_PASID_WIDTH); return 0; From e60cbdbd524263e3ea3550620c5717d3ed347ad8 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:38 -0700 Subject: [PATCH 053/147] iommufd/selftest: Add a helper to get test device There is a need to get the selftest device (sobj->type == TYPE_IDEV) in multiple places, so add a helper for it. Link: https://patch.msgid.link/r/20250321171940.7213-17-yi.l.liu@intel.com Reviewed-by: Nicolin Chen Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 068e14025158986842f783147f9e41a59fbc97cd) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/selftest.c | 36 ++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index aa3da0adc4e12..04a4b84f5fa1b 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -996,39 +996,49 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, return rc; } -/* Replace the mock domain with a manually allocated hw_pagetable */ -static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, - unsigned int device_id, u32 pt_id, - struct iommu_test_cmd *cmd) +static struct selftest_obj * +iommufd_test_get_selftest_obj(struct iommufd_ctx *ictx, u32 id) { struct iommufd_object *dev_obj; struct selftest_obj *sobj; - int rc; /* * Prefer to use the OBJ_SELFTEST because the destroy_rwsem will ensure * it doesn't race with detach, which is not allowed. 
*/ - dev_obj = - iommufd_get_object(ucmd->ictx, device_id, IOMMUFD_OBJ_SELFTEST); + dev_obj = iommufd_get_object(ictx, id, IOMMUFD_OBJ_SELFTEST); if (IS_ERR(dev_obj)) - return PTR_ERR(dev_obj); + return ERR_CAST(dev_obj); sobj = to_selftest_obj(dev_obj); if (sobj->type != TYPE_IDEV) { - rc = -EINVAL; - goto out_dev_obj; + iommufd_put_object(ictx, dev_obj); + return ERR_PTR(-EINVAL); } + return sobj; +} + +/* Replace the mock domain with a manually allocated hw_pagetable */ +static int iommufd_test_mock_domain_replace(struct iommufd_ucmd *ucmd, + unsigned int device_id, u32 pt_id, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, device_id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); rc = iommufd_device_replace(sobj->idev.idev, IOMMU_NO_PASID, &pt_id); if (rc) - goto out_dev_obj; + goto out_sobj; cmd->mock_domain_replace.pt_id = pt_id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); -out_dev_obj: - iommufd_put_object(ucmd->ictx, dev_obj); +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); return rc; } From 979bc3ed254766e90c61d8d4f8fc401ceb12037b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:39 -0700 Subject: [PATCH 054/147] iommufd/selftest: Add test ops to test pasid attach/detach This adds 4 test ops for pasid attach/replace/detach testing. There are ops to attach/detach a pasid, and also an op to check the attached hwpt of a pasid. Link: https://patch.msgid.link/r/20250321171940.7213-18-yi.l.liu@intel.com Reviewed-by: Kevin Tian Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit c1b52b0a97aeae22462496cda064323255d10b3b) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_test.h | 26 +++++ drivers/iommu/iommufd/selftest.c | 162 +++++++++++++++++++++++++++ 2 files changed, 188 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 1a066feb8697e..1cd7e83941298 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -25,6 +25,10 @@ enum { IOMMU_TEST_OP_TRIGGER_IOPF, IOMMU_TEST_OP_DEV_CHECK_CACHE, IOMMU_TEST_OP_TRIGGER_VEVENT, + IOMMU_TEST_OP_PASID_ATTACH, + IOMMU_TEST_OP_PASID_REPLACE, + IOMMU_TEST_OP_PASID_DETACH, + IOMMU_TEST_OP_PASID_CHECK_HWPT, }; enum { @@ -62,6 +66,9 @@ enum { MOCK_DEV_CACHE_NUM = 4, }; +/* Reserved for special pasid replace test */ +#define IOMMU_TEST_PASID_RESERVED 1024 + struct iommu_test_cmd { __u32 size; __u32 op; @@ -150,6 +157,25 @@ struct iommu_test_cmd { struct { __u32 dev_id; } trigger_vevent; + struct { + __u32 pasid; + __u32 pt_id; + /* @id is stdev_id */ + } pasid_attach; + struct { + __u32 pasid; + __u32 pt_id; + /* @id is stdev_id */ + } pasid_replace; + struct { + __u32 pasid; + /* @id is stdev_id */ + } pasid_detach; + struct { + __u32 pasid; + __u32 hwpt_id; + /* @id is stdev_id */ + } pasid_check; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 04a4b84f5fa1b..18d9a216eb30d 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -167,6 +167,7 @@ struct mock_dev { unsigned long vdev_id; int id; u32 cache[MOCK_DEV_CACHE_NUM]; + atomic_t pasid_1024_fake_error; }; static inline struct mock_dev *to_mock_dev(struct device *dev) @@ -227,6 +228,34 @@ static int mock_domain_set_dev_pasid_nop(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old) { + struct mock_dev *mdev = 
to_mock_dev(dev); + + /* + * On the first attach with pasid 1024, set + * mdev->pasid_1024_fake_error. Hence the second call of this op + * can fake an error to validate the error path of the core. This + * is helpful to test the case in which the iommu core needs to + * rollback to the old domain due to a driver failure, e.g. replace. + * Be careful with the third call of this op: it shall succeed since + * mdev->pasid_1024_fake_error is cleared in the second call. + */ + if (pasid == 1024) { + if (domain->type == IOMMU_DOMAIN_BLOCKED) { + atomic_set(&mdev->pasid_1024_fake_error, 0); + } else if (atomic_read(&mdev->pasid_1024_fake_error)) { + /* + * Clear the flag, and fake an error to fail the + * replacement. + */ + atomic_set(&mdev->pasid_1024_fake_error, 0); + return -ENOMEM; + } else { + /* Set the flag to fake an error in next call */ + atomic_set(&mdev->pasid_1024_fake_error, 1); + } + } + return 0; } @@ -1685,6 +1714,131 @@ static int iommufd_test_trigger_vevent(struct iommufd_ucmd *ucmd, return rc; } +static inline struct iommufd_hw_pagetable * +iommufd_get_hwpt(struct iommufd_ucmd *ucmd, u32 id) +{ + struct iommufd_object *pt_obj; + + pt_obj = iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return ERR_CAST(pt_obj); + + if (pt_obj->type != IOMMUFD_OBJ_HWPT_NESTED && + pt_obj->type != IOMMUFD_OBJ_HWPT_PAGING) { + iommufd_put_object(ucmd->ictx, pt_obj); + return ERR_PTR(-EINVAL); + } + + return container_of(pt_obj, struct iommufd_hw_pagetable, obj); +} + +static int iommufd_test_pasid_check_hwpt(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + u32 hwpt_id = cmd->pasid_check.hwpt_id; + struct iommu_domain *attached_domain; + struct iommu_attach_handle *handle; + struct iommufd_hw_pagetable *hwpt; + struct selftest_obj *sobj; + struct mock_dev *mdev; + int rc = 0; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + mdev = sobj->idev.mock_dev; + + handle = iommu_attach_handle_get(mdev->dev.iommu_group, + cmd->pasid_check.pasid, 0); + if (IS_ERR(handle)) + attached_domain = NULL; + else + attached_domain = handle->domain; + + /* hwpt_id == 0 means to check if pasid is detached */ + if (!hwpt_id) { + if (attached_domain) + rc = -EINVAL; + goto out_sobj; + } + + hwpt = iommufd_get_hwpt(ucmd, hwpt_id); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_sobj; + } + + if (attached_domain != hwpt->domain) + rc = -EINVAL; + + iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_attach(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_attach(sobj->idev.idev, cmd->pasid_attach.pasid, + &cmd->pasid_attach.pt_id); + if (rc) + goto out_sobj; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + iommufd_device_detach(sobj->idev.idev, + cmd->pasid_attach.pasid); + +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_replace(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + int rc; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + rc = iommufd_device_replace(sobj->idev.idev, cmd->pasid_attach.pasid, + &cmd->pasid_attach.pt_id); + if (rc) + 
goto out_sobj; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_sobj: + iommufd_put_object(ucmd->ictx, &sobj->obj); + return rc; +} + +static int iommufd_test_pasid_detach(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct selftest_obj *sobj; + + sobj = iommufd_test_get_selftest_obj(ucmd->ictx, cmd->id); + if (IS_ERR(sobj)) + return PTR_ERR(sobj); + + iommufd_device_detach(sobj->idev.idev, cmd->pasid_detach.pasid); + iommufd_put_object(ucmd->ictx, &sobj->obj); + return 0; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = to_selftest_obj(obj); @@ -1768,6 +1922,14 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_trigger_iopf(ucmd, cmd); case IOMMU_TEST_OP_TRIGGER_VEVENT: return iommufd_test_trigger_vevent(ucmd, cmd); + case IOMMU_TEST_OP_PASID_ATTACH: + return iommufd_test_pasid_attach(ucmd, cmd); + case IOMMU_TEST_OP_PASID_REPLACE: + return iommufd_test_pasid_replace(ucmd, cmd); + case IOMMU_TEST_OP_PASID_DETACH: + return iommufd_test_pasid_detach(ucmd, cmd); + case IOMMU_TEST_OP_PASID_CHECK_HWPT: + return iommufd_test_pasid_check_hwpt(ucmd, cmd); default: return -EOPNOTSUPP; } From b656d109fd064d479077873eca151d5ab9c0b111 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 10:19:40 -0700 Subject: [PATCH 055/147] iommufd/selftest: Add coverage for iommufd pasid attach/detach This tests iommufd pasid attach/replace/detach. Link: https://patch.msgid.link/r/20250321171940.7213-19-yi.l.liu@intel.com Signed-off-by: Yi Liu Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit d57a1fb3425513ec0b02acb9a9f81e5da99b4b85) Signed-off-by: Nirmoy Das --- tools/testing/selftests/iommu/iommufd.c | 301 ++++++++++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 49 ++- tools/testing/selftests/iommu/iommufd_utils.h | 97 +++++- 3 files changed, 437 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 156c74da53cd7..c39222b9869ba 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2996,4 +2996,305 @@ TEST_F(iommufd_viommu, vdevice_cache) } } +FIXTURE(iommufd_device_pasid) +{ + int fd; + uint32_t ioas_id; + uint32_t hwpt_id; + uint32_t stdev_id; + uint32_t device_id; + uint32_t no_pasid_stdev_id; + uint32_t no_pasid_device_id; +}; + +FIXTURE_VARIANT(iommufd_device_pasid) +{ + bool pasid_capable; +}; + +FIXTURE_SETUP(iommufd_device_pasid) +{ + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + test_ioctl_ioas_alloc(&self->ioas_id); + + test_cmd_mock_domain_flags(self->ioas_id, + MOCK_FLAGS_DEVICE_PASID, + &self->stdev_id, &self->hwpt_id, + &self->device_id); + if (!variant->pasid_capable) + test_cmd_mock_domain_flags(self->ioas_id, 0, + &self->no_pasid_stdev_id, NULL, + &self->no_pasid_device_id); +} + +FIXTURE_TEARDOWN(iommufd_device_pasid) +{ + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(iommufd_device_pasid, no_pasid) +{ + .pasid_capable = false, +}; + +FIXTURE_VARIANT_ADD(iommufd_device_pasid, has_pasid) +{ + .pasid_capable = true, +}; + +TEST_F(iommufd_device_pasid, pasid_attach) +{ + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t nested_hwpt_id[3] = {}; + uint32_t parent_hwpt_id = 0; + uint32_t fault_id, fault_fd; + uint32_t s2_hwpt_id = 0; + uint32_t iopf_hwpt_id; + uint32_t pasid = 100; + uint32_t viommu_id; + + /* Allocate two nested hwpts sharing one common parent hwpt */ + 
test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + &parent_hwpt_id); + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, + IOMMU_HWPT_ALLOC_PASID, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, + &data, sizeof(data)); + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, + IOMMU_HWPT_ALLOC_PASID, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, + &data, sizeof(data)); + + /* Fault related preparation */ + test_ioctl_fault_alloc(&fault_id, &fault_fd); + test_cmd_hwpt_alloc_iopf(self->device_id, parent_hwpt_id, fault_id, + IOMMU_HWPT_FAULT_ID_VALID | IOMMU_HWPT_ALLOC_PASID, + &iopf_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Allocate a regular nested hwpt based on viommu */ + test_cmd_viommu_alloc(self->device_id, parent_hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, + &viommu_id); + test_cmd_hwpt_alloc_nested(self->device_id, viommu_id, + IOMMU_HWPT_ALLOC_PASID, + &nested_hwpt_id[2], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_PASID, + &s2_hwpt_id); + + /* Attach RID to non-pasid compat domain, */ + test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id); + /* then attach to pasid should fail */ + test_err_pasid_attach(EINVAL, pasid, s2_hwpt_id); + + /* Attach RID to pasid compat domain, */ + test_cmd_mock_domain_replace(self->stdev_id, s2_hwpt_id); + /* then attach to pasid should succeed, */ + test_cmd_pasid_attach(pasid, nested_hwpt_id[0]); + /* but attach RID to non-pasid compat domain should fail now. */ + test_err_mock_domain_replace(EINVAL, self->stdev_id, parent_hwpt_id); + /* + * Detach hwpt from pasid 100, and check if the pasid 100 + * has null domain. + */ + test_cmd_pasid_detach(pasid); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + /* RID is attached to pasid-compat domain, pasid path is not used */ + + if (!variant->pasid_capable) { + /* + * PASID-compatible domain can be used by non-PASID-capable + * device. + */ + test_cmd_mock_domain_replace(self->no_pasid_stdev_id, nested_hwpt_id[0]); + test_cmd_mock_domain_replace(self->no_pasid_stdev_id, self->ioas_id); + /* + * Attach hwpt to pasid 100 of non-PASID-capable device, + * should fail, no matter whether the domain is pasid-compat + * or not. + */ + EXPECT_ERRNO(EINVAL, + _test_cmd_pasid_attach(self->fd, self->no_pasid_stdev_id, + pasid, parent_hwpt_id)); + EXPECT_ERRNO(EINVAL, + _test_cmd_pasid_attach(self->fd, self->no_pasid_stdev_id, + pasid, s2_hwpt_id)); + } + + /* + * Attach non pasid compat hwpt to pasid-capable device, should + * fail, and have null domain. + */ + test_err_pasid_attach(EINVAL, pasid, parent_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + + /* + * Attach ioas to pasid 100, should fail, domain should + * be null. + */ + test_err_pasid_attach(EINVAL, pasid, self->ioas_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + + /* + * Attach the s2_hwpt to pasid 100, should succeed, domain should + * be valid. + */ + test_cmd_pasid_attach(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Try to attach pasid 100 with another hwpt, should FAIL + * as attach does not allow overwrite, use REPLACE instead. + */ + test_err_pasid_attach(EBUSY, pasid, nested_hwpt_id[0]); + + /* + * Detach hwpt from pasid 100 for next test, should succeed, + * and have null domain. 
+ */ + test_cmd_pasid_detach(pasid); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, 0)); + + /* + * Attach nested hwpt to pasid 100, should succeed, domain + * should be valid. + */ + test_cmd_pasid_attach(pasid, nested_hwpt_id[0]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, nested_hwpt_id[0])); + + /* Attach to pasid 100 which has been attached, should fail. */ + test_err_pasid_attach(EBUSY, pasid, nested_hwpt_id[0]); + + /* cleanup pasid 100 */ + test_cmd_pasid_detach(pasid); + + /* Replace tests */ + + pasid = 200; + /* + * Replace pasid 200 without attaching it, should fail + * with -EINVAL. + */ + test_err_pasid_replace(EINVAL, pasid, s2_hwpt_id); + + /* + * Attach the s2 hwpt to pasid 200, should succeed, domain should + * be valid. + */ + test_cmd_pasid_attach(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Replace pasid 200 with self->ioas_id, should fail + * and domain should be the prior s2 hwpt. + */ + test_err_pasid_replace(EINVAL, pasid, self->ioas_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Replace a nested hwpt for pasid 200, should succeed, + * and have valid domain. + */ + test_cmd_pasid_replace(pasid, nested_hwpt_id[0]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, nested_hwpt_id[0])); + + /* + * Replace with another nested hwpt for pasid 200, should + * succeed, and have valid domain. + */ + test_cmd_pasid_replace(pasid, nested_hwpt_id[1]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, nested_hwpt_id[1])); + + /* cleanup pasid 200 */ + test_cmd_pasid_detach(pasid); + + /* Negative Tests for pasid replace, use pasid 1024 */ + + /* + * Attach the s2 hwpt to pasid 1024, should succeed, domain should + * be valid. + */ + pasid = 1024; + test_cmd_pasid_attach(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* + * Replace pasid 1024 with nested_hwpt_id[0], should fail, + * but have the old valid domain. This is a designed + * negative case. Normally, this shall succeed. + */ + test_err_pasid_replace(ENOMEM, pasid, nested_hwpt_id[0]); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* cleanup pasid 1024 */ + test_cmd_pasid_detach(pasid); + + /* Attach to iopf-capable hwpt */ + + /* + * Attach an iopf hwpt to pasid 2048, should succeed, domain should + * be valid. + */ + pasid = 2048; + test_cmd_pasid_attach(pasid, iopf_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, iopf_hwpt_id)); + + test_cmd_trigger_iopf_pasid(self->device_id, pasid, fault_fd); + + /* + * Replace with s2_hwpt_id for pasid 2048, should + * succeed, and have valid domain. 
+ */ + test_cmd_pasid_replace(pasid, s2_hwpt_id); + ASSERT_EQ(0, + test_cmd_pasid_check_hwpt(self->fd, self->stdev_id, + pasid, s2_hwpt_id)); + + /* cleanup pasid 2048 */ + test_cmd_pasid_detach(pasid); + + test_ioctl_destroy(iopf_hwpt_id); + close(fault_fd); + test_ioctl_destroy(fault_id); + + /* Detach the s2_hwpt_id from RID */ + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 99a7f7897bb28..8fd6f45000903 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -209,12 +209,16 @@ FIXTURE(basic_fail_nth) { int fd; uint32_t access_id; + uint32_t stdev_id; + uint32_t pasid; }; FIXTURE_SETUP(basic_fail_nth) { self->fd = -1; self->access_id = 0; + self->stdev_id = 0; + self->pasid = 0; //test should use a non-zero value } FIXTURE_TEARDOWN(basic_fail_nth) @@ -226,6 +230,8 @@ FIXTURE_TEARDOWN(basic_fail_nth) rc = _test_cmd_destroy_access(self->access_id); assert(rc == 0); } + if (self->pasid && self->stdev_id) + _test_cmd_pasid_detach(self->fd, self->stdev_id, self->pasid); teardown_iommufd(self->fd, _metadata); } @@ -622,9 +628,9 @@ TEST_FAIL_NTH(basic_fail_nth, device) uint32_t fault_id, fault_fd; uint32_t veventq_id, veventq_fd; uint32_t fault_hwpt_id; + uint32_t test_hwpt_id; uint32_t ioas_id; uint32_t ioas_id2; - uint32_t stdev_id; uint32_t idev_id; uint32_t hwpt_id; uint32_t viommu_id; @@ -655,25 +661,29 @@ TEST_FAIL_NTH(basic_fail_nth, device) fail_nth_enable(); - if (_test_cmd_mock_domain(self->fd, ioas_id, &stdev_id, NULL, - &idev_id)) + if (_test_cmd_mock_domain_flags(self->fd, ioas_id, + MOCK_FLAGS_DEVICE_PASID, + &self->stdev_id, NULL, &idev_id)) return -1; if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, 0, &hwpt_id, + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, + IOMMU_HWPT_ALLOC_PASID, &hwpt_id, IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; - if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) + if (_test_cmd_mock_domain_replace(self->fd, self->stdev_id, ioas_id2, NULL)) return -1; - if (_test_cmd_mock_domain_replace(self->fd, stdev_id, hwpt_id, NULL)) + if (_test_cmd_mock_domain_replace(self->fd, self->stdev_id, hwpt_id, NULL)) return -1; if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, - IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id, + IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_PASID, + &hwpt_id, IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; @@ -699,6 +709,31 @@ TEST_FAIL_NTH(basic_fail_nth, device) return -1; close(veventq_fd); + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, + IOMMU_HWPT_ALLOC_PASID, + &test_hwpt_id, + IOMMU_HWPT_DATA_NONE, 0, 0)) + return -1; + + /* Tests for pasid attach/replace/detach */ + + self->pasid = 200; + + if (_test_cmd_pasid_attach(self->fd, self->stdev_id, + self->pasid, hwpt_id)) { + self->pasid = 0; + return -1; + } + + if (_test_cmd_pasid_replace(self->fd, self->stdev_id, + self->pasid, test_hwpt_id)) + return -1; + + if (_test_cmd_pasid_detach(self->fd, self->stdev_id, self->pasid)) + return -1; + + self->pasid = 0; + return 0; } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 6f2ba2fa8f76c..27794b6f58fc5 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -843,14 +843,15 @@ 
static int _test_ioctl_fault_alloc(int fd, __u32 *fault_id, __u32 *fault_fd) ASSERT_NE(0, *(fault_fd)); \ }) -static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) +static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 pasid, + __u32 fault_fd) { struct iommu_test_cmd trigger_iopf_cmd = { .size = sizeof(trigger_iopf_cmd), .op = IOMMU_TEST_OP_TRIGGER_IOPF, .trigger_iopf = { .dev_id = device_id, - .pasid = 0x1, + .pasid = pasid, .grpid = 0x2, .perm = IOMMU_PGFAULT_PERM_READ | IOMMU_PGFAULT_PERM_WRITE, .addr = 0xdeadbeaf, @@ -881,7 +882,10 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) } #define test_cmd_trigger_iopf(device_id, fault_fd) \ - ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd)) + ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, 0x1, fault_fd)) +#define test_cmd_trigger_iopf_pasid(device_id, pasid, fault_fd) \ + ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, \ + pasid, fault_fd)) static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, __u32 type, __u32 flags, __u32 *viommu_id) @@ -1051,3 +1055,90 @@ static int _test_cmd_read_vevents(int fd, __u32 event_fd, __u32 nvevents, EXPECT_ERRNO(_errno, \ _test_cmd_read_vevents(self->fd, event_fd, nvevents, \ virt_id, prev_seq)) + +static int _test_cmd_pasid_attach(int fd, __u32 stdev_id, __u32 pasid, + __u32 pt_id) +{ + struct iommu_test_cmd test_attach = { + .size = sizeof(test_attach), + .op = IOMMU_TEST_OP_PASID_ATTACH, + .id = stdev_id, + .pasid_attach = { + .pasid = pasid, + .pt_id = pt_id, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_ATTACH), + &test_attach); +} + +#define test_cmd_pasid_attach(pasid, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_pasid_attach(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +#define test_err_pasid_attach(_errno, pasid, hwpt_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_pasid_attach(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +static int _test_cmd_pasid_replace(int fd, __u32 stdev_id, __u32 pasid, + __u32 pt_id) +{ + struct iommu_test_cmd test_replace = { + .size = sizeof(test_replace), + .op = IOMMU_TEST_OP_PASID_REPLACE, + .id = stdev_id, + .pasid_replace = { + .pasid = pasid, + .pt_id = pt_id, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_REPLACE), + &test_replace); +} + +#define test_cmd_pasid_replace(pasid, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_pasid_replace(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +#define test_err_pasid_replace(_errno, pasid, hwpt_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_pasid_replace(self->fd, self->stdev_id, \ + pasid, hwpt_id)) + +static int _test_cmd_pasid_detach(int fd, __u32 stdev_id, __u32 pasid) +{ + struct iommu_test_cmd test_detach = { + .size = sizeof(test_detach), + .op = IOMMU_TEST_OP_PASID_DETACH, + .id = stdev_id, + .pasid_detach = { + .pasid = pasid, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_DETACH), + &test_detach); +} + +#define test_cmd_pasid_detach(pasid) \ + ASSERT_EQ(0, _test_cmd_pasid_detach(self->fd, self->stdev_id, pasid)) + +static int test_cmd_pasid_check_hwpt(int fd, __u32 stdev_id, __u32 pasid, + __u32 hwpt_id) +{ + struct iommu_test_cmd test_pasid_check = { + .size = sizeof(test_pasid_check), + .op = IOMMU_TEST_OP_PASID_CHECK_HWPT, + .id = stdev_id, + .pasid_check = { + .pasid = pasid, + .hwpt_id = hwpt_id, + }, + }; + + return ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_PASID_CHECK_HWPT), + &test_pasid_check); +} From 69d4624230ca906b389907db753087e7db126d59 Mon Sep 17 
00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:39 -0700 Subject: [PATCH 056/147] ida: Add ida_find_first_range() There are no helpers for the user to check if a given ID is allocated or not, nor a helper to loop over all the allocated IDs in an IDA and do something for cleanup. To cover these two needs, add a helper to get the lowest allocated ID of a range and two variants based on it. Caller can check if a given ID is allocated or not by: bool ida_exists(struct ida *ida, unsigned int id) Caller can iterate all allocated IDs by: int id; while ((id = ida_find_first(&pasid_ida)) >= 0) { /* anything to do with the allocated ID */ ida_free(&pasid_ida, id); } Link: https://patch.msgid.link/r/20250321180143.8468-2-yi.l.liu@intel.com Cc: Matthew Wilcox (Oracle) Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: Matthew Wilcox (Oracle) Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 7fe6b987166b901efc5c6fce5fe853c9ebb835be) Signed-off-by: Nirmoy Das --- include/linux/idr.h | 11 +++++++ lib/idr.c | 67 +++++++++++++++++++++++++++++++++++++++++++ lib/test_ida.c | 70 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+) diff --git a/include/linux/idr.h b/include/linux/idr.h index da5f5fa4a3a6a..718f9b1b91afa 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -257,6 +257,7 @@ struct ida { int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); +int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max); /** * ida_alloc() - Allocate an unused ID. @@ -328,4 +329,14 @@ static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } + +static inline bool ida_exists(struct ida *ida, unsigned int id) +{ + return ida_find_first_range(ida, id, id) == id; +} + +static inline int ida_find_first(struct ida *ida) +{ + return ida_find_first_range(ida, 0, ~0); +} #endif /* __IDR_H__ */ diff --git a/lib/idr.c b/lib/idr.c index da36054c3ca02..e2adc457abb4b 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -476,6 +476,73 @@ int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, } EXPORT_SYMBOL(ida_alloc_range); +/** + * ida_find_first_range - Get the lowest used ID. + * @ida: IDA handle. + * @min: Lowest ID to get. + * @max: Highest ID to get. + * + * Get the lowest used ID between @min and @max, inclusive. The returned + * ID will not exceed %INT_MAX, even if @max is larger. + * + * Context: Any context. Takes and releases the xa_lock. + * Return: The lowest used ID, or errno if no used ID is found. 
*/ +int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max) +{ + unsigned long index = min / IDA_BITMAP_BITS; + unsigned int offset = min % IDA_BITMAP_BITS; + unsigned long *addr, size, bit; + unsigned long tmp = 0; + unsigned long flags; + void *entry; + int ret; + + if ((int)min < 0) + return -EINVAL; + if ((int)max < 0) + max = INT_MAX; + + xa_lock_irqsave(&ida->xa, flags); + + entry = xa_find(&ida->xa, &index, max / IDA_BITMAP_BITS, XA_PRESENT); + if (!entry) { + ret = -ENOENT; + goto err_unlock; + } + + if (index > min / IDA_BITMAP_BITS) + offset = 0; + if (index * IDA_BITMAP_BITS + offset > max) { + ret = -ENOENT; + goto err_unlock; + } + + if (xa_is_value(entry)) { + tmp = xa_to_value(entry); + addr = &tmp; + size = BITS_PER_XA_VALUE; + } else { + addr = ((struct ida_bitmap *)entry)->bitmap; + size = IDA_BITMAP_BITS; + } + + bit = find_next_bit(addr, size, offset); + + xa_unlock_irqrestore(&ida->xa, flags); + + if (bit == size || + index * IDA_BITMAP_BITS + bit > max) + return -ENOENT; + + return index * IDA_BITMAP_BITS + bit; + +err_unlock: + xa_unlock_irqrestore(&ida->xa, flags); + return ret; +} +EXPORT_SYMBOL(ida_find_first_range); + /** * ida_free() - Release an allocated ID. * @ida: IDA handle. diff --git a/lib/test_ida.c b/lib/test_ida.c index c80155a1956d2..63078f8dc13f5 100644 --- a/lib/test_ida.c +++ b/lib/test_ida.c @@ -189,6 +189,75 @@ static void ida_check_bad_free(struct ida *ida) IDA_BUG_ON(ida, !ida_is_empty(ida)); } +/* + * Check ida_find_first_range() and variants. + */ +static void ida_check_find_first(struct ida *ida) +{ + /* IDA is empty; none of the IDs below should exist */ + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, ida_exists(ida, 3)); + IDA_BUG_ON(ida, ida_exists(ida, 63)); + IDA_BUG_ON(ida, ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + /* IDA contains a single value entry */ + IDA_BUG_ON(ida, ida_alloc_min(ida, 3, GFP_KERNEL) != 3); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, ida_exists(ida, 63)); + IDA_BUG_ON(ida, ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + IDA_BUG_ON(ida, ida_alloc_min(ida, 63, GFP_KERNEL) != 63); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, !ida_exists(ida, 63)); + IDA_BUG_ON(ida, ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + /* IDA contains a single bitmap */ + IDA_BUG_ON(ida, ida_alloc_min(ida, 1023, GFP_KERNEL) != 1023); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, !ida_exists(ida, 63)); + IDA_BUG_ON(ida, !ida_exists(ida, 1023)); + IDA_BUG_ON(ida, ida_exists(ida, (1 << 20) - 1)); + + /* IDA contains a tree */ + IDA_BUG_ON(ida, ida_alloc_min(ida, (1 << 20) - 1, GFP_KERNEL) != (1 << 20) - 1); + IDA_BUG_ON(ida, ida_exists(ida, 0)); + IDA_BUG_ON(ida, !ida_exists(ida, 3)); + IDA_BUG_ON(ida, !ida_exists(ida, 63)); + IDA_BUG_ON(ida, !ida_exists(ida, 1023)); + IDA_BUG_ON(ida, !ida_exists(ida, (1 << 20) - 1)); + + /* Now try to find first */ + IDA_BUG_ON(ida, ida_find_first(ida) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, -1, 2) != -EINVAL); + IDA_BUG_ON(ida, ida_find_first_range(ida, 0, 2) != -ENOENT); // no used ID + IDA_BUG_ON(ida, ida_find_first_range(ida, 0, 3) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1, 3) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, 3, 3) != 3); + IDA_BUG_ON(ida, ida_find_first_range(ida, 2, 4) != 3); + 
IDA_BUG_ON(ida, ida_find_first_range(ida, 4, 3) != -ENOENT); // min > max, fail + IDA_BUG_ON(ida, ida_find_first_range(ida, 4, 60) != -ENOENT); // no used ID + IDA_BUG_ON(ida, ida_find_first_range(ida, 4, 64) != 63); + IDA_BUG_ON(ida, ida_find_first_range(ida, 63, 63) != 63); + IDA_BUG_ON(ida, ida_find_first_range(ida, 64, 1026) != 1023); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1023, 1023) != 1023); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1023, (1 << 20) - 1) != 1023); + IDA_BUG_ON(ida, ida_find_first_range(ida, 1024, (1 << 20) - 1) != (1 << 20) - 1); + IDA_BUG_ON(ida, ida_find_first_range(ida, (1 << 20), INT_MAX) != -ENOENT); + + ida_free(ida, 3); + ida_free(ida, 63); + ida_free(ida, 1023); + ida_free(ida, (1 << 20) - 1); + + IDA_BUG_ON(ida, !ida_is_empty(ida)); +} + static DEFINE_IDA(ida); static int ida_checks(void) @@ -202,6 +271,7 @@ static int ida_checks(void) ida_check_max(&ida); ida_check_conv(&ida); ida_check_bad_free(&ida); + ida_check_find_first(&ida); printk("IDA: %u of %u tests passed\n", tests_passed, tests_run); return (tests_run != tests_passed) ? 0 : -EINVAL; From 5dbff6fb69362f6c610c02c5f56bef76248ffa7e Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:40 -0700 Subject: [PATCH 057/147] vfio-iommufd: Support pasid [at|de]tach for physical VFIO devices This adds pasid_at|de]tach_ioas ops for attaching hwpt to pasid of a device and the helpers for it. For now, only vfio-pci supports pasid attach/detach. Link: https://patch.msgid.link/r/20250321180143.8468-3-yi.l.liu@intel.com Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Williamson Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 290641346d0d1eaf400c4f968d5b2cd91f483733) Signed-off-by: Nirmoy Das --- drivers/vfio/iommufd.c | 50 +++++++++++++++++++++++++++++++++++++ drivers/vfio/pci/vfio_pci.c | 2 ++ include/linux/vfio.h | 14 +++++++++++ 3 files changed, 66 insertions(+) diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 37e1efa2c7bfe..c8c3a2d53f86e 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -119,14 +119,22 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, if (IS_ERR(idev)) return PTR_ERR(idev); vdev->iommufd_device = idev; + ida_init(&vdev->pasids); return 0; } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind); void vfio_iommufd_physical_unbind(struct vfio_device *vdev) { + int pasid; + lockdep_assert_held(&vdev->dev_set->lock); + while ((pasid = ida_find_first(&vdev->pasids)) >= 0) { + iommufd_device_detach(vdev->iommufd_device, pasid); + ida_free(&vdev->pasids, pasid); + } + if (vdev->iommufd_attached) { iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID); vdev->iommufd_attached = false; @@ -170,6 +178,48 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev) } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id) +{ + int rc; + + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return -EINVAL; + + if (ida_exists(&vdev->pasids, pasid)) + return iommufd_device_replace(vdev->iommufd_device, + pasid, pt_id); + + rc = ida_alloc_range(&vdev->pasids, pasid, pasid, GFP_KERNEL); + if (rc < 0) + return rc; + + rc = iommufd_device_attach(vdev->iommufd_device, pasid, pt_id); + if (rc) + ida_free(&vdev->pasids, pasid); + + return rc; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_attach_ioas); + +void 
vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return; + + if (!ida_exists(&vdev->pasids, pasid)) + return; + + iommufd_device_detach(vdev->iommufd_device, pasid); + ida_free(&vdev->pasids, pasid); +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_detach_ioas); + /* * The emulated standard ops mean that vfio_device is going to use the * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index e727941f589de..6f7ae7e5b7b07 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -144,6 +144,8 @@ static const struct vfio_device_ops vfio_pci_ops = { .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, .detach_ioas = vfio_iommufd_physical_detach_ioas, + .pasid_attach_ioas = vfio_iommufd_physical_pasid_attach_ioas, + .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 000a6cab2d318..707b00772ce1f 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -67,6 +67,7 @@ struct vfio_device { struct inode *inode; #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; + struct ida pasids; u8 iommufd_attached:1; #endif u8 cdev_opened:1; @@ -91,6 +92,8 @@ struct vfio_device { * bound iommufd. Undo in unbind_iommufd if @detach_ioas is not * called. * @detach_ioas: Opposite of attach_ioas + * @pasid_attach_ioas: The pasid variation of attach_ioas + * @pasid_detach_ioas: Opposite of pasid_attach_ioas * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -115,6 +118,9 @@ struct vfio_device_ops { void (*unbind_iommufd)(struct vfio_device *vdev); int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); void (*detach_ioas)(struct vfio_device *vdev); + int (*pasid_attach_ioas)(struct vfio_device *vdev, u32 pasid, + u32 *pt_id); + void (*pasid_detach_ioas)(struct vfio_device *vdev, u32 pasid); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -139,6 +145,10 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id); +void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid); int vfio_iommufd_emulated_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); @@ -166,6 +176,10 @@ vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) #define vfio_iommufd_physical_detach_ioas \ ((void (*)(struct vfio_device *vdev)) NULL) +#define vfio_iommufd_physical_pasid_attach_ioas \ + ((int (*)(struct vfio_device *vdev, u32 pasid, u32 *pt_id)) NULL) +#define vfio_iommufd_physical_pasid_detach_ioas \ + ((void (*)(struct vfio_device *vdev, u32 pasid)) NULL) #define 
vfio_iommufd_emulated_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) From 9fe8139985540ddcb6a90ca0851182948d7b6e1c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:41 -0700 Subject: [PATCH 058/147] vfio: VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT support pasid This extends the VFIO_DEVICE_[AT|DE]TACH_IOMMUFD_PT ioctls to attach/detach a given pasid of a vfio device to/from an IOAS/HWPT. Link: https://patch.msgid.link/r/20250321180143.8468-4-yi.l.liu@intel.com Reviewed-by: Alex Williamson Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit ad744ed5dd8b70e9256fc1ff18aaaffeedf5f21e) Signed-off-by: Nirmoy Das --- drivers/vfio/device_cdev.c | 60 +++++++++++++++++++++++++++++++++----- include/uapi/linux/vfio.h | 29 +++++++++++------- 2 files changed, 71 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index bb1817bd4ff31..281a8dc3ed497 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -162,9 +162,9 @@ void vfio_df_unbind_iommufd(struct vfio_device_file *df) int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, struct vfio_device_attach_iommufd_pt __user *arg) { - struct vfio_device *device = df->device; struct vfio_device_attach_iommufd_pt attach; - unsigned long minsz; + struct vfio_device *device = df->device; + unsigned long minsz, xend = 0; int ret; minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id); @@ -172,11 +172,34 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, if (copy_from_user(&attach, arg, minsz)) return -EFAULT; - if (attach.argsz < minsz || attach.flags) + if (attach.argsz < minsz) return -EINVAL; + if (attach.flags & ~VFIO_DEVICE_ATTACH_PASID) + return -EINVAL; + + if (attach.flags & VFIO_DEVICE_ATTACH_PASID) { + if (!device->ops->pasid_attach_ioas) + return -EOPNOTSUPP; + xend = offsetofend(struct vfio_device_attach_iommufd_pt, pasid); + } + + if (xend) { + if (attach.argsz < xend) + return -EINVAL; + + if (copy_from_user((void *)&attach + minsz, + (void __user *)arg + minsz, xend - minsz)) + return -EFAULT; + } + mutex_lock(&device->dev_set->lock); - ret = device->ops->attach_ioas(device, &attach.pt_id); + if (attach.flags & VFIO_DEVICE_ATTACH_PASID) + ret = device->ops->pasid_attach_ioas(device, + attach.pasid, + &attach.pt_id); + else + ret = device->ops->attach_ioas(device, &attach.pt_id); if (ret) goto out_unlock; @@ -198,20 +221,41 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, struct vfio_device_detach_iommufd_pt __user *arg) { - struct vfio_device *device = df->device; struct vfio_device_detach_iommufd_pt detach; - unsigned long minsz; + struct vfio_device *device = df->device; + unsigned long minsz, xend = 0; minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags); if (copy_from_user(&detach, arg, minsz)) return -EFAULT; - if (detach.argsz < minsz || detach.flags) + if (detach.argsz < minsz) return -EINVAL; + if (detach.flags & ~VFIO_DEVICE_DETACH_PASID) + return -EINVAL; + + if (detach.flags & VFIO_DEVICE_DETACH_PASID) { + if (!device->ops->pasid_detach_ioas) + return -EOPNOTSUPP; + xend = offsetofend(struct vfio_device_detach_iommufd_pt, pasid); + } + + if (xend) { + if (detach.argsz < xend) + return -EINVAL; + + if (copy_from_user((void *)&detach + minsz, + (void __user *)arg + minsz, xend - minsz)) + return -EFAULT; + } + 
mutex_lock(&device->dev_set->lock); - device->ops->detach_ioas(device); + if (detach.flags & VFIO_DEVICE_DETACH_PASID) + device->ops->pasid_detach_ioas(device, detach.pasid); + else + device->ops->detach_ioas(device); mutex_unlock(&device->dev_set->lock); return 0; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index c8dbf8219c4fc..6899da70b929f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -931,29 +931,34 @@ struct vfio_device_bind_iommufd { * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, * struct vfio_device_attach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for attach. * @pt_id: Input the target id which can represent an ioas or a hwpt * allocated via iommufd subsystem. * Output the input ioas id or the attached hwpt id which could * be the specified hwpt itself or a hwpt automatically created * for the specified ioas by kernel during the attachment. + * @pasid: The pasid to be attached, only meaningful when + * VFIO_DEVICE_ATTACH_PASID is set in @flags * * Associate the device with an address space within the bound iommufd. * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only * allowed on cdev fds. * - * If a vfio device is currently attached to a valid hw_pagetable, without doing - * a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl - * passing in another hw_pagetable (hwpt) id is allowed. This action, also known - * as a hw_pagetable replacement, will replace the device's currently attached - * hw_pagetable with a new hw_pagetable corresponding to the given pt_id. + * If a vfio device or a pasid of this device is currently attached to a valid + * hw_pagetable (hwpt), without doing a VFIO_DEVICE_DETACH_IOMMUFD_PT, a second + * VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl passing in another hwpt id is allowed. + * This action, also known as a hw_pagetable replacement, will replace the + * currently attached hwpt of the device or the pasid of this device with a new + * hwpt corresponding to the given pt_id. * * Return: 0 on success, -errno on failure. */ struct vfio_device_attach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_ATTACH_PASID (1 << 0) __u32 pt_id; + __u32 pasid; }; #define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) @@ -962,17 +967,21 @@ struct vfio_device_attach_iommufd_pt { * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, * struct vfio_device_detach_iommufd_pt) * @argsz: User filled size of this data. - * @flags: Must be 0. + * @flags: Flags for detach. + * @pasid: The pasid to be detached, only meaningful when + * VFIO_DEVICE_DETACH_PASID is set in @flags * - * Remove the association of the device and its current associated address - * space. After it, the device should be in a blocking DMA state. This is only - * allowed on cdev fds. + * Remove the association of the device or a pasid of the device and its current + * associated address space. After it, the device or the pasid should be in a + * blocking DMA state. This is only allowed on cdev fds. * * Return: 0 on success, -errno on failure. 
*/ struct vfio_device_detach_iommufd_pt { __u32 argsz; __u32 flags; +#define VFIO_DEVICE_DETACH_PASID (1 << 0) + __u32 pasid; }; #define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) From 567453011383a5b7f1f7af23e11071b152becaf7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:42 -0700 Subject: [PATCH 059/147] iommufd: Extend IOMMU_GET_HW_INFO to report PASID capability PASID usage requires PASID support in both device and IOMMU. Since the iommu drivers always enable the PASID capability for the device if it is supported, this extends the IOMMU_GET_HW_INFO to report the PASID capability to userspace. Also, enhances the selftest accordingly. Link: https://patch.msgid.link/r/20250321180143.8468-5-yi.l.liu@intel.com Cc: Bjorn Helgaas Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Zhangfei Gao #aarch64 platform Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 803f97298e7de9242eb677a1351dcafbbcc9117e) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 34 +++++++++++++++++++++++++++++++++- drivers/pci/ats.c | 33 +++++++++++++++++++++++++++++++++ include/linux/pci-ats.h | 3 +++ include/uapi/linux/iommufd.h | 14 +++++++++++++- 4 files changed, 82 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 1605f6c0e1eee..2307daad65c0f 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include @@ -1455,7 +1456,8 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) void *data; int rc; - if (cmd->flags || cmd->__reserved) + if (cmd->flags || cmd->__reserved[0] || cmd->__reserved[1] || + cmd->__reserved[2]) return -EOPNOTSUPP; idev = iommufd_get_device(ucmd, cmd->dev_id); @@ -1512,6 +1514,36 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + cmd->out_max_pasid_log2 = 0; + /* + * Currently, all iommu drivers enable PASID in the probe_device() + * op if iommu and device supports it. So the max_pasids stored in + * dev->iommu indicates both PASID support and enable status. A + * non-zero dev->iommu->max_pasids means PASID is supported and + * enabled. The iommufd only reports PASID capability to userspace + * if it's enabled. + */ + if (idev->dev->iommu->max_pasids) { + cmd->out_max_pasid_log2 = ilog2(idev->dev->iommu->max_pasids); + + if (dev_is_pci(idev->dev)) { + struct pci_dev *pdev = to_pci_dev(idev->dev); + int ctrl; + + ctrl = pci_pasid_status(pdev); + + WARN_ON_ONCE(ctrl < 0 || + !(ctrl & PCI_PASID_CTRL_ENABLE)); + + if (ctrl & PCI_PASID_CTRL_EXEC) + cmd->out_capabilities |= + IOMMU_HW_CAP_PCI_PASID_EXEC; + if (ctrl & PCI_PASID_CTRL_PRIV) + cmd->out_capabilities |= + IOMMU_HW_CAP_PCI_PASID_PRIV; + } + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kfree(data); diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 043221363f164..00603c2c4ff0e 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -540,4 +540,37 @@ int pci_max_pasids(struct pci_dev *pdev) return (1 << FIELD_GET(PCI_PASID_CAP_WIDTH, supported)); } EXPORT_SYMBOL_GPL(pci_max_pasids); + +/** + * pci_pasid_status - Check the PASID status + * @pdev: PCI device structure + * + * Returns a negative value when no PASID capability is present. + * Otherwise the value of the control register is returned. 
+ * Status reported are:
+ *
+ * PCI_PASID_CTRL_ENABLE - PASID enabled
+ * PCI_PASID_CTRL_EXEC - Execute permission enabled
+ * PCI_PASID_CTRL_PRIV - Privileged mode enabled
+ */
+int pci_pasid_status(struct pci_dev *pdev)
+{
+	int pasid;
+	u16 ctrl;
+
+	if (pdev->is_virtfn)
+		pdev = pci_physfn(pdev);
+
+	pasid = pdev->pasid_cap;
+	if (!pasid)
+		return -EINVAL;
+
+	pci_read_config_word(pdev, pasid + PCI_PASID_CTRL, &ctrl);
+
+	ctrl &= PCI_PASID_CTRL_ENABLE | PCI_PASID_CTRL_EXEC |
+		PCI_PASID_CTRL_PRIV;
+
+	return ctrl;
+}
+EXPORT_SYMBOL_GPL(pci_pasid_status);
 #endif /* CONFIG_PCI_PASID */
diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h
index 0e8b74e63767a..75c6c86cf09dc 100644
--- a/include/linux/pci-ats.h
+++ b/include/linux/pci-ats.h
@@ -42,6 +42,7 @@ int pci_enable_pasid(struct pci_dev *pdev, int features);
 void pci_disable_pasid(struct pci_dev *pdev);
 int pci_pasid_features(struct pci_dev *pdev);
 int pci_max_pasids(struct pci_dev *pdev);
+int pci_pasid_status(struct pci_dev *pdev);
 #else /* CONFIG_PCI_PASID */
 static inline int pci_enable_pasid(struct pci_dev *pdev, int features)
 { return -EINVAL; }
@@ -50,6 +51,8 @@ static inline int pci_pasid_features(struct pci_dev *pdev)
 { return -EINVAL; }
 static inline int pci_max_pasids(struct pci_dev *pdev)
 { return -EINVAL; }
+static inline int pci_pasid_status(struct pci_dev *pdev)
+{ return -EINVAL; }
 #endif /* CONFIG_PCI_PASID */
 #endif /* LINUX_PCI_ATS_H */
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 6901804ec736a..e2c04e58a997d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -612,9 +612,17 @@ enum iommu_hw_info_type {
  *                IOMMU_HWPT_GET_DIRTY_BITMAP
  *                IOMMU_HWPT_SET_DIRTY_TRACKING
  *
+ * @IOMMU_HW_CAP_PCI_PASID_EXEC: Execute Permission Supported, user ignores it
+ *                               when the struct
+ *                               iommu_hw_info::out_max_pasid_log2 is zero.
+ * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it
+ *                               when the struct
+ *                               iommu_hw_info::out_max_pasid_log2 is zero.
  */
 enum iommufd_hw_capabilities {
 	IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0,
+	IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1,
+	IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2,
 };
 
 /**
@@ -630,6 +638,9 @@ enum iommufd_hw_capabilities {
  *                iommu_hw_info_type.
  * @out_capabilities: Output the generic iommu capability info type as defined
  *                    in the enum iommu_hw_capabilities.
+ * @out_max_pasid_log2: Output the width of PASIDs. 0 means no PASID support.
+ *                      PCI devices turn to out_capabilities to check if the
+ *                      specific capabilities is supported or not.
  * @__reserved: Must be 0
  *
  * Query an iommu type specific hardware information data from an iommu behind
@@ -653,7 +664,8 @@ struct iommu_hw_info {
 	__u32 data_len;
 	__aligned_u64 data_uptr;
 	__u32 out_data_type;
-	__u32 __reserved;
+	__u8 out_max_pasid_log2;
+	__u8 __reserved[3];
 	__aligned_u64 out_capabilities;
 };
 #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO)

From 90f069cf789c60a3e0b94692354eac5597e87979 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Mon, 24 Mar 2025 05:00:33 -0700
Subject: [PATCH 060/147] iommufd: Initialize the flags of vevent in iommufd_viommu_report_event()

The vevent->header.flags is not initialized per allocation, hence the
vevent read path may wrongly treat a normal vevent as the
lost_events_header. Use kzalloc() to allocate the memory for a new
vevent.
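For illustration, a reduced sketch of the hazard (not the exact report
path; it assumes the uapi lost-events marker is the
IOMMU_VEVENTQ_FLAG_LOST_EVENTS header flag, which userspace relies on to
tell record types apart):

  struct iommufd_vevent *vevent;

  /* Before: kmalloc() leaves header.flags holding heap garbage, so a
   * stale bit can make a normal event read back as a lost-events record.
   */
  vevent = kmalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC);

  /* After: kzalloc() guarantees header.flags is 0 until the report path
   * deliberately sets it.
   */
  vevent = kzalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC);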
Fixes: e8e1ef9b77a7 ("iommufd/viommu: Add iommufd_viommu_report_event helper")
Link: https://patch.msgid.link/r/20250324120034.5940-2-yi.l.liu@intel.com
Signed-off-by: Yi Liu
Reviewed-by: Nicolin Chen
Tested-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
(cherry picked from commit 41464a4628f3b15988bdc3dcd824c2e91064fc6f)
Signed-off-by: Nirmoy Das
---
 drivers/iommu/iommufd/driver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index a08ff0f37fc6d..922cd1fe7ec20 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -102,7 +102,7 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 		goto out_set_header;
 	}
 
-	vevent = kmalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC);
+	vevent = kzalloc(struct_size(vevent, event_data, data_len), GFP_ATOMIC);
 	if (!vevent) {
 		rc = -ENOMEM;
 		vevent = &veventq->lost_events_header;

From 666066a49817e090cbe566d66e3a127fb1240cc7 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Mon, 24 Mar 2025 05:00:34 -0700
Subject: [PATCH 061/147] iommufd: Balance veventq->num_events inc/dec

iommufd_veventq_fops_read() decrements veventq->num_events when a
vevent is read out. However, the report path only increments
veventq->num_events for normal events. To be balanced, make the read
path decrement num_events only for normal vevents.

Fixes: e36ba5ab808e ("iommufd: Add IOMMUFD_OBJ_VEVENTQ and IOMMUFD_CMD_VEVENTQ_ALLOC")
Link: https://patch.msgid.link/r/20250324120034.5940-3-yi.l.liu@intel.com
Signed-off-by: Yi Liu
Reviewed-by: Nicolin Chen
Tested-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
(cherry picked from commit 6fc85bbbeaeae39c61d230ce279c0b0d0952d3e3)
Signed-off-by: Nirmoy Das
---
 drivers/iommu/iommufd/eventq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c
index 4c43ace8c725d..f39cf07973476 100644
--- a/drivers/iommu/iommufd/eventq.c
+++ b/drivers/iommu/iommufd/eventq.c
@@ -385,7 +385,8 @@ static ssize_t iommufd_veventq_fops_read(struct file *filep, char __user *buf,
 			break;
 		}
 		spin_lock(&eventq->lock);
-		veventq->num_events--;
+		if (!vevent_for_lost_events_header(cur))
+			veventq->num_events--;
 		spin_unlock(&eventq->lock);
 		done += cur->data_len;
 		kfree(cur);

From 3405f8b87b9bdba45364d574b499e026da81a28c Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf
Date: Wed, 26 Mar 2025 22:28:46 -0700
Subject: [PATCH 062/147] iommu: Convert unreachable() to BUG()

Bare unreachable() should be avoided as it generates undefined behavior,
e.g. falling through to the next function. Use BUG() instead so the
error is defined.

Fixes the following warnings:

  drivers/iommu/dma-iommu.o: warning: objtool: iommu_dma_sw_msi+0x92: can't find jump dest instruction at .text+0x54d5
  vmlinux.o: warning: objtool: iommu_dma_get_msi_page() falls through to next function __iommu_dma_unmap()

Link: https://patch.msgid.link/r/0c801ae017ec078cacd39f8f0898fc7780535f85.1743053325.git.jpoimboe@kernel.org
Reported-by: Randy Dunlap
Closes: https://lore.kernel.org/314f8809-cd59-479b-97d7-49356bf1c8d1@infradead.org
Reported-by: Paul E.
McKenney Closes: https://lore.kernel.org/5dd1f35e-8ece-43b7-ad6d-86d02d2718f6@paulmck-laptop Fixes: 6aa63a4ec947 ("iommu: Sort out domain user data") Signed-off-by: Josh Poimboeuf Signed-off-by: Jason Gunthorpe (cherry picked from commit 3a2ffd3f3e1b6df4ed7b35f98565c1ad0fe54840) Signed-off-by: Nirmoy Das --- drivers/iommu/dma-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 2bd9f80a83fe4..8cc5397d7dfc1 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1762,7 +1762,7 @@ static size_t cookie_msi_granule(const struct iommu_domain *domain) case IOMMU_COOKIE_DMA_MSI: return PAGE_SIZE; default: - unreachable(); + BUG(); }; } @@ -1774,7 +1774,7 @@ static struct list_head *cookie_msi_pages(const struct iommu_domain *domain) case IOMMU_COOKIE_DMA_MSI: return &domain->msi_cookie->msi_page_list; default: - unreachable(); + BUG(); }; } From eee9efe81fd53ffa12cab531bca8090b069bd8c0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 28 Mar 2025 06:34:48 -0700 Subject: [PATCH 063/147] iommufd: Test attach before detaching pasid Check if the pasid has been attached before going further in the detach path. This fixes a crash found by syzkaller. Add a selftest as well. Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 1 UID: 0 PID: 668 Comm: repro Not tainted 6.14.0-next-20250325-eb4bc4b07f66 #1 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org4 RIP: 0010:iommufd_hw_pagetable_detach+0x8a/0x4d0 Code: 00 00 00 44 89 ee 48 89 c7 48 89 75 c8 48 89 45 c0 e8 ca 55 17 02 48 89 c2 49 89 c4 48 b8 00 00 00b RSP: 0018:ffff888021b17b78 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff888014b5a000 RCX: ffff888021b17a64 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88801dad07fc RBP: ffff888021b17bc8 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: ffff88801dad0e58 R12: 0000000000000000 R13: 0000000000000001 R14: ffff888021b17e18 R15: ffff8880132d3008 FS: 00007fca52013600(0000) GS:ffff8880e3684000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000200006c0 CR3: 00000000112d0005 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: iommufd_device_detach+0x2a/0x2e0 iommufd_test+0x2f99/0x5cd0 iommufd_fops_ioctl+0x38e/0x520 __x64_sys_ioctl+0x1ba/0x220 x64_sys_call+0x122e/0x2150 do_syscall_64+0x6d/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e Link: https://patch.msgid.link/r/20250328133448.22052-1-yi.l.liu@intel.com Reported-by: Lai Yi Closes: https://lore.kernel.org/linux-iommu/Z+X0tzxhiaupJT7b@ly-workstation Fixes: c0e301b2978d ("iommufd/device: Add pasid_attach array to track per-PASID attach") Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 7be11d34f660bfa6583f3d6e2032d5dcbff56081) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 7 +++++++ tools/testing/selftests/iommu/iommufd.c | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 2307daad65c0f..2111bad72c720 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -644,6 +644,11 @@ iommufd_hw_pagetable_detach(struct 
iommufd_device *idev, ioasid_t pasid) mutex_lock(&igroup->lock); attach = xa_load(&igroup->pasid_attach, pasid); + if (!attach) { + mutex_unlock(&igroup->lock); + return NULL; + } + hwpt = attach->hwpt; hwpt_paging = find_hwpt_paging(hwpt); @@ -1001,6 +1006,8 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid) struct iommufd_hw_pagetable *hwpt; hwpt = iommufd_hw_pagetable_detach(idev, pasid); + if (!hwpt) + return; iommufd_hw_pagetable_put(idev->ictx, hwpt); refcount_dec(&idev->obj.users); } diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index c39222b9869ba..3e0c3f3c53734 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -3056,6 +3056,12 @@ TEST_F(iommufd_device_pasid, pasid_attach) uint32_t pasid = 100; uint32_t viommu_id; + /* + * Negative, detach pasid without attaching, this is not expected. + * But it should not result in failure anyway. + */ + test_cmd_pasid_detach(pasid); + /* Allocate two nested hwpts sharing one common parent hwpt */ test_cmd_hwpt_alloc(self->device_id, self->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT, From 7eba1232cf49a5314708f50830c4d7346024425c Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Wed, 9 Apr 2025 00:33:41 +0300 Subject: [PATCH 064/147] iommu: Fix crash in report_iommu_fault() The following crash is observed while handling an IOMMU fault with a recent kernel: kernel tried to execute NX-protected page - exploit attempt? (uid: 0) BUG: unable to handle page fault for address: ffff8c708299f700 PGD 19ee01067 P4D 19ee01067 PUD 101c10063 PMD 80000001028001e3 Oops: Oops: 0011 [#1] SMP NOPTI CPU: 4 UID: 0 PID: 139 Comm: irq/25-AMD-Vi Not tainted 6.15.0-rc1+ #20 PREEMPT(lazy) Hardware name: LENOVO 21D0/LNVNB161216, BIOS J6CN50WW 09/27/2024 RIP: 0010:0xffff8c708299f700 Call Trace: ? report_iommu_fault+0x78/0xd3 ? amd_iommu_report_page_fault+0x91/0x150 ? amd_iommu_int_thread+0x77/0x180 ? __pfx_irq_thread_fn+0x10/0x10 ? irq_thread_fn+0x23/0x60 ? irq_thread+0xf9/0x1e0 ? __pfx_irq_thread_dtor+0x10/0x10 ? __pfx_irq_thread+0x10/0x10 ? kthread+0xfc/0x240 ? __pfx_kthread+0x10/0x10 ? ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ? ret_from_fork_asm+0x1a/0x30 report_iommu_fault() checks for an installed handler comparing the corresponding field to NULL. It can (and could before) be called for a domain with a different cookie type - IOMMU_COOKIE_DMA_IOVA, specifically. Cookie is represented as a union so we may end up with a garbage value treated there if this happens for a domain with another cookie type. Formerly there were two exclusive cookie types in the union. IOMMU_DOMAIN_SVA has a dedicated iommu_report_device_fault(). Call the fault handler only if the passed domain has a required cookie type. Found by Linux Verification Center (linuxtesting.org). 
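For illustration, a reduced sketch of the aliasing hazard (simplified
field layout, not the exact struct iommu_domain, which carries more
cookie variants):

  struct iommu_domain {
  	enum iommu_cookie_type cookie_type;
  	union {
  		struct iommu_dma_cookie *iova_cookie;	/* IOMMU_COOKIE_DMA_IOVA */
  		struct {				/* IOMMU_COOKIE_FAULT_HANDLER */
  			iommu_fault_handler_t handler;
  			void *handler_token;
  		};
  	};
  };

  /* Buggy: for a DMA-IOVA domain the live iova_cookie pointer aliases
   * handler, passes the NULL check, and is then called as code.
   */
  if (domain->handler)
  	ret = domain->handler(domain, dev, iova, flags, domain->handler_token);

  /* Fixed: gate on the cookie type before trusting the union member. */
  if (domain->cookie_type == IOMMU_COOKIE_FAULT_HANDLER && domain->handler)
  	ret = domain->handler(domain, dev, iova, flags, domain->handler_token);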
Fixes: 6aa63a4ec947 ("iommu: Sort out domain user data") Signed-off-by: Fedor Pchelkin Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250408213342.285955-1-pchelkin@ispras.ru Signed-off-by: Joerg Roedel (cherry picked from commit df4bf3fa1b1e8d03380206fa027f956a62de517b) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5cbcc95f7ba88..651315a748c3d 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2874,7 +2874,8 @@ int report_iommu_fault(struct iommu_domain *domain, struct device *dev, * if upper layers showed interest and installed a fault handler, * invoke it. */ - if (domain->handler) + if (domain->cookie_type == IOMMU_COOKIE_FAULT_HANDLER && + domain->handler) ret = domain->handler(domain, dev, iova, flags, domain->handler_token); From 80b61ca6aaee6b2796beaef7c276a65ed59219d7 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Fri, 21 Mar 2025 11:01:43 -0700 Subject: [PATCH 065/147] iommufd/selftest: Add coverage for reporting max_pasid_log2 via IOMMU_HW_INFO IOMMU_HW_INFO is extended to report max_pasid_log2, hence add coverage for it. Link: https://patch.msgid.link/r/20250321180143.8468-6-yi.l.liu@intel.com Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Signed-off-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 6d9500bb1ff8c7f9c3ce199521c41aa41e8fd994) Signed-off-by: Nirmoy Das --- tools/testing/selftests/iommu/iommufd.c | 18 ++++++++++++++++++ .../testing/selftests/iommu/iommufd_fail_nth.c | 3 ++- tools/testing/selftests/iommu/iommufd_utils.h | 17 +++++++++++++---- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 3e0c3f3c53734..1a8e85afe9aa5 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -342,12 +342,14 @@ FIXTURE(iommufd_ioas) uint32_t hwpt_id; uint32_t device_id; uint64_t base_iova; + uint32_t device_pasid_id; }; FIXTURE_VARIANT(iommufd_ioas) { unsigned int mock_domains; unsigned int memory_limit; + bool pasid_capable; }; FIXTURE_SETUP(iommufd_ioas) @@ -372,6 +374,12 @@ FIXTURE_SETUP(iommufd_ioas) IOMMU_TEST_DEV_CACHE_DEFAULT); self->base_iova = MOCK_APERTURE_START; } + + if (variant->pasid_capable) + test_cmd_mock_domain_flags(self->ioas_id, + MOCK_FLAGS_DEVICE_PASID, + NULL, NULL, + &self->device_pasid_id); } FIXTURE_TEARDOWN(iommufd_ioas) @@ -387,6 +395,7 @@ FIXTURE_VARIANT_ADD(iommufd_ioas, no_domain) FIXTURE_VARIANT_ADD(iommufd_ioas, mock_domain) { .mock_domains = 1, + .pasid_capable = true, }; FIXTURE_VARIANT_ADD(iommufd_ioas, two_mock_domain) @@ -752,6 +761,8 @@ TEST_F(iommufd_ioas, get_hw_info) } buffer_smaller; if (self->device_id) { + uint8_t max_pasid = 0; + /* Provide a zero-size user_buffer */ test_cmd_get_hw_info(self->device_id, NULL, 0); /* Provide a user_buffer with exact size */ @@ -766,6 +777,13 @@ TEST_F(iommufd_ioas, get_hw_info) * the fields within the size range still gets updated. 
*/ test_cmd_get_hw_info(self->device_id, &buffer_smaller, sizeof(buffer_smaller)); + test_cmd_get_hw_info_pasid(self->device_id, &max_pasid); + ASSERT_EQ(0, max_pasid); + if (variant->pasid_capable) { + test_cmd_get_hw_info_pasid(self->device_pasid_id, + &max_pasid); + ASSERT_EQ(MOCK_PASID_WIDTH, max_pasid); + } } else { test_err_get_hw_info(ENOENT, self->device_id, &buffer_exact, sizeof(buffer_exact)); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 8fd6f45000903..e11ec4b121fc3 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -666,7 +666,8 @@ TEST_FAIL_NTH(basic_fail_nth, device) &self->stdev_id, NULL, &idev_id)) return -1; - if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) + if (_test_cmd_get_hw_info(self->fd, idev_id, &info, + sizeof(info), NULL, NULL)) return -1; if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 27794b6f58fc5..72f6636e5d909 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -758,7 +758,8 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata) /* @data can be NULL */ static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, - size_t data_len, uint32_t *capabilities) + size_t data_len, uint32_t *capabilities, + uint8_t *max_pasid) { struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data; struct iommu_hw_info cmd = { @@ -803,6 +804,9 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, assert(!info->flags); } + if (max_pasid) + *max_pasid = cmd.out_max_pasid_log2; + if (capabilities) *capabilities = cmd.out_capabilities; @@ -811,14 +815,19 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, #define test_cmd_get_hw_info(device_id, data, data_len) \ ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \ - data_len, NULL)) + data_len, NULL, NULL)) #define test_err_get_hw_info(_errno, device_id, data, data_len) \ EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \ - data_len, NULL)) + data_len, NULL, NULL)) #define test_cmd_get_hw_capabilities(device_id, caps, mask) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps)) + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, \ + 0, &caps, NULL)) + +#define test_cmd_get_hw_info_pasid(device_id, max_pasid) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, \ + 0, NULL, max_pasid)) static int _test_ioctl_fault_alloc(int fd, __u32 *fault_id, __u32 *fault_fd) { From 90f069cf789c60a3e0b94692354eac5597e87979 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:35:47 -0300 Subject: [PATCH 066/147] iommu/virtio: Break out bypass identity support into a global static To make way for a domain_alloc_paging conversion add the typical global static IDENTITY domain. This supports VMMs that have a VIRTIO_IOMMU_F_BYPASS_CONFIG config. If the VMM does not have support then the domain_alloc path is still used, which creates an IDENTITY domain out of a paging domain. 
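In rough pseudo-code, the two identity strategies described above look
like this (a sketch of the design rather than the exact driver code; the
helper name in the else branch is hypothetical, standing in for the
existing viommu_domain_map_identity() path):

  if (virtio_has_feature(viommu->vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
  	/* VMM supports bypass: endpoints attach to one reserved domain
  	 * ID with VIRTIO_IOMMU_ATTACH_F_BYPASS, so a single global
  	 * static IDENTITY domain is enough.
  	 */
  	domain = &viommu_identity_domain.domain;
  } else {
  	/* No bypass config: build identity out of a paging domain by
  	 * inserting 1:1 mappings for the usable address ranges.
  	 */
  	domain = viommu_alloc_identity_paging_domain(vdev); /* hypothetical */
  }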
Reviewed-by: Jean-Philippe Brucker Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/1-v4-ff5fb6b03bd1+288-iommu_virtio_domains_jgg@nvidia.com Signed-off-by: Joerg Roedel (cherry picked from commit 0d76a6edae9eeae5364285296b69d73dacb152f3) Signed-off-by: Nirmoy Das --- drivers/iommu/virtio-iommu.c | 86 ++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index b85ce6310ddbd..55a2188197c62 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -48,6 +48,7 @@ struct viommu_dev { u64 pgsize_bitmap; u32 first_domain; u32 last_domain; + u32 identity_domain_id; /* Supported MAP flags */ u32 map_flags; u32 probe_size; @@ -70,7 +71,6 @@ struct viommu_domain { struct rb_root_cached mappings; unsigned long nr_endpoints; - bool bypass; }; struct viommu_endpoint { @@ -305,6 +305,22 @@ static int viommu_send_req_sync(struct viommu_dev *viommu, void *buf, return ret; } +static int viommu_send_attach_req(struct viommu_dev *viommu, struct device *dev, + struct virtio_iommu_req_attach *req) +{ + int ret; + unsigned int i; + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + for (i = 0; i < fwspec->num_ids; i++) { + req->endpoint = cpu_to_le32(fwspec->ids[i]); + ret = viommu_send_req_sync(viommu, req, sizeof(*req)); + if (ret) + return ret; + } + return 0; +} + /* * viommu_add_mapping - add a mapping to the internal tree * @@ -687,12 +703,6 @@ static int viommu_domain_finalise(struct viommu_endpoint *vdev, vdomain->viommu = viommu; if (domain->type == IOMMU_DOMAIN_IDENTITY) { - if (virtio_has_feature(viommu->vdev, - VIRTIO_IOMMU_F_BYPASS_CONFIG)) { - vdomain->bypass = true; - return 0; - } - ret = viommu_domain_map_identity(vdev, vdomain); if (ret) { ida_free(&viommu->domain_ids, vdomain->id); @@ -719,10 +729,8 @@ static void viommu_domain_free(struct iommu_domain *domain) static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) { - int i; int ret = 0; struct virtio_iommu_req_attach req; - struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); struct viommu_domain *vdomain = to_viommu_domain(domain); @@ -761,16 +769,9 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) .domain = cpu_to_le32(vdomain->id), }; - if (vdomain->bypass) - req.flags |= cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS); - - for (i = 0; i < fwspec->num_ids; i++) { - req.endpoint = cpu_to_le32(fwspec->ids[i]); - - ret = viommu_send_req_sync(vdomain->viommu, &req, sizeof(req)); - if (ret) - return ret; - } + ret = viommu_send_attach_req(vdomain->viommu, dev, &req); + if (ret) + return ret; if (!vdomain->nr_endpoints) { /* @@ -788,6 +789,40 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) return 0; } +static int viommu_attach_identity_domain(struct iommu_domain *domain, + struct device *dev) +{ + int ret = 0; + struct virtio_iommu_req_attach req; + struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct viommu_domain *vdomain = to_viommu_domain(domain); + + req = (struct virtio_iommu_req_attach) { + .head.type = VIRTIO_IOMMU_T_ATTACH, + .domain = cpu_to_le32(vdev->viommu->identity_domain_id), + .flags = cpu_to_le32(VIRTIO_IOMMU_ATTACH_F_BYPASS), + }; + + ret = viommu_send_attach_req(vdev->viommu, dev, &req); + if (ret) + return ret; + + if (vdev->vdomain) + vdev->vdomain->nr_endpoints--; + vdomain->nr_endpoints++; + 
vdev->vdomain = vdomain; + return 0; +} + +static struct viommu_domain viommu_identity_domain = { + .domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &(const struct iommu_domain_ops) { + .attach_dev = viommu_attach_identity_domain, + }, + }, +}; + static void viommu_detach_dev(struct viommu_endpoint *vdev) { int i; @@ -1061,6 +1096,7 @@ static bool viommu_capable(struct device *dev, enum iommu_cap cap) } static struct iommu_ops viommu_ops = { + .identity_domain = &viommu_identity_domain.domain, .capable = viommu_capable, .domain_alloc = viommu_domain_alloc, .probe_device = viommu_probe_device, @@ -1184,6 +1220,18 @@ static int viommu_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_IOMMU_F_MMIO)) viommu->map_flags |= VIRTIO_IOMMU_MAP_F_MMIO; + /* Reserve an ID to use as the bypass domain */ + if (virtio_has_feature(viommu->vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) { + viommu->identity_domain_id = viommu->first_domain; + viommu->first_domain++; + } else { + /* + * Assume the VMM is sensible and it either supports bypass on + * all instances or no instances. + */ + viommu_ops.identity_domain = NULL; + } + viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap; virtio_device_ready(vdev); From 5f83b7cafdd3c334153fad1092d6ce2b27f4d9b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:35:48 -0300 Subject: [PATCH 067/147] iommu: Add domain_alloc_identity() virtio-iommu has a mode where the IDENTITY domain is actually a paging domain with an identity mapping covering some of the system address space manually created. To support this add a new domain_alloc_identity() op that accepts the struct device so that virtio can allocate and fully finalize a paging domain to return. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/2-v4-ff5fb6b03bd1+288-iommu_virtio_domains_jgg@nvidia.com Signed-off-by: Joerg Roedel (cherry picked from commit 0d609a1450fab636c825c66344ab0ecfc1d3a98c) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 20 ++++++++++++-------- include/linux/iommu.h | 4 ++++ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 651315a748c3d..9df94b32a2add 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1783,15 +1783,19 @@ static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev) if (ops->identity_domain) return ops->identity_domain; - /* Older drivers create the identity domain via ops->domain_alloc() */ - if (!ops->domain_alloc) + if (ops->domain_alloc_identity) { + domain = ops->domain_alloc_identity(dev); + if (IS_ERR(domain)) + return domain; + } else if (ops->domain_alloc) { + domain = ops->domain_alloc(IOMMU_DOMAIN_IDENTITY); + if (!domain) + return ERR_PTR(-ENOMEM); + if (IS_ERR(domain)) + return domain; + } else { return ERR_PTR(-EOPNOTSUPP); - - domain = ops->domain_alloc(IOMMU_DOMAIN_IDENTITY); - if (IS_ERR(domain)) - return domain; - if (!domain) - return ERR_PTR(-ENOMEM); + } iommu_domain_init(domain, IOMMU_DOMAIN_IDENTITY, ops); return domain; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3a8d35d41fdad..91b16c69d0614 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -572,6 +572,9 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * @domain_alloc: allocate and return an iommu domain if success. Otherwise * NULL is returned. The domain is not fully initialized until * the caller iommu_domain_alloc() returns. 
+ * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to + * use identity_domain instead. This should only be used + * if dynamic logic is necessary. * @domain_alloc_paging_flags: Allocate an iommu domain corresponding to the * input parameters as defined in * include/uapi/linux/iommufd.h. The @user_data can be @@ -630,6 +633,7 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); + struct iommu_domain *(*domain_alloc_identity)(struct device *dev); struct iommu_domain *(*domain_alloc_paging_flags)( struct device *dev, u32 flags, const struct iommu_user_data *user_data); From a0f8f720ab61dbdbaaf3842cb11e50b4c98a0ddb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 13:35:49 -0300 Subject: [PATCH 068/147] iommu/virtio: Move to domain_alloc_paging() virtio has the complication that it sometimes wants to return a paging domain for IDENTITY which makes this conversion a little different than other drivers. Add a viommu_domain_alloc_paging() that combines viommu_domain_alloc() and viommu_domain_finalise() to always return a fully initialized and finalized paging domain. Use viommu_domain_alloc_identity() to implement the special non-bypass IDENTITY flow by calling viommu_domain_alloc_paging() then viommu_domain_map_identity(). Remove support for deferred finalize and the vdomain->mutex. Remove core support for domain_alloc() IDENTITY as virtio was the last driver using it. Reviewed-by: Jean-Philippe Brucker Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/3-v4-ff5fb6b03bd1+288-iommu_virtio_domains_jgg@nvidia.com Signed-off-by: Joerg Roedel (cherry picked from commit 07107e74444bedb943ee91ec6072c3baf62f4ae7) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 6 -- drivers/iommu/virtio-iommu.c | 115 +++++++++++++++-------------------- 2 files changed, 50 insertions(+), 71 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9df94b32a2add..8d8c00789e72c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1787,12 +1787,6 @@ static struct iommu_domain *__iommu_alloc_identity_domain(struct device *dev) domain = ops->domain_alloc_identity(dev); if (IS_ERR(domain)) return domain; - } else if (ops->domain_alloc) { - domain = ops->domain_alloc(IOMMU_DOMAIN_IDENTITY); - if (!domain) - return ERR_PTR(-ENOMEM); - if (IS_ERR(domain)) - return domain; } else { return ERR_PTR(-EOPNOTSUPP); } diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 55a2188197c62..ecd41fb03e5a5 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -63,7 +63,6 @@ struct viommu_mapping { struct viommu_domain { struct iommu_domain domain; struct viommu_dev *viommu; - struct mutex mutex; /* protects viommu pointer */ unsigned int id; u32 map_flags; @@ -97,6 +96,8 @@ struct viommu_event { }; }; +static struct viommu_domain viommu_identity_domain; + #define to_viommu_domain(domain) \ container_of(domain, struct viommu_domain, domain) @@ -653,65 +654,45 @@ static void viommu_event_handler(struct virtqueue *vq) /* IOMMU API */ -static struct iommu_domain *viommu_domain_alloc(unsigned type) +static struct iommu_domain *viommu_domain_alloc_paging(struct device *dev) { + struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct viommu_dev *viommu = vdev->viommu; + unsigned long viommu_page_size; struct viommu_domain *vdomain; - - if (type != IOMMU_DOMAIN_UNMANAGED && - type != 
IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_IDENTITY) - return NULL; - - vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL); - if (!vdomain) - return NULL; - - mutex_init(&vdomain->mutex); - spin_lock_init(&vdomain->mappings_lock); - vdomain->mappings = RB_ROOT_CACHED; - - return &vdomain->domain; -} - -static int viommu_domain_finalise(struct viommu_endpoint *vdev, - struct iommu_domain *domain) -{ int ret; - unsigned long viommu_page_size; - struct viommu_dev *viommu = vdev->viommu; - struct viommu_domain *vdomain = to_viommu_domain(domain); viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap); if (viommu_page_size > PAGE_SIZE) { dev_err(vdev->dev, "granule 0x%lx larger than system page size 0x%lx\n", viommu_page_size, PAGE_SIZE); - return -ENODEV; + return ERR_PTR(-ENODEV); } + vdomain = kzalloc(sizeof(*vdomain), GFP_KERNEL); + if (!vdomain) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&vdomain->mappings_lock); + vdomain->mappings = RB_ROOT_CACHED; + ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain, viommu->last_domain, GFP_KERNEL); - if (ret < 0) - return ret; + if (ret < 0) { + kfree(vdomain); + return ERR_PTR(ret); + } - vdomain->id = (unsigned int)ret; + vdomain->id = (unsigned int)ret; - domain->pgsize_bitmap = viommu->pgsize_bitmap; - domain->geometry = viommu->geometry; + vdomain->domain.pgsize_bitmap = viommu->pgsize_bitmap; + vdomain->domain.geometry = viommu->geometry; - vdomain->map_flags = viommu->map_flags; - vdomain->viommu = viommu; + vdomain->map_flags = viommu->map_flags; + vdomain->viommu = viommu; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - ret = viommu_domain_map_identity(vdev, vdomain); - if (ret) { - ida_free(&viommu->domain_ids, vdomain->id); - vdomain->viommu = NULL; - return ret; - } - } - - return 0; + return &vdomain->domain; } static void viommu_domain_free(struct iommu_domain *domain) @@ -727,6 +708,28 @@ static void viommu_domain_free(struct iommu_domain *domain) kfree(vdomain); } +static struct iommu_domain *viommu_domain_alloc_identity(struct device *dev) +{ + struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); + struct iommu_domain *domain; + int ret; + + if (virtio_has_feature(vdev->viommu->vdev, + VIRTIO_IOMMU_F_BYPASS_CONFIG)) + return &viommu_identity_domain.domain; + + domain = viommu_domain_alloc_paging(dev); + if (IS_ERR(domain)) + return domain; + + ret = viommu_domain_map_identity(vdev, to_viommu_domain(domain)); + if (ret) { + viommu_domain_free(domain); + return ERR_PTR(ret); + } + return domain; +} + static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; @@ -734,20 +737,8 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) struct viommu_endpoint *vdev = dev_iommu_priv_get(dev); struct viommu_domain *vdomain = to_viommu_domain(domain); - mutex_lock(&vdomain->mutex); - if (!vdomain->viommu) { - /* - * Properly initialize the domain now that we know which viommu - * owns it. 
-		 */
-		ret = viommu_domain_finalise(vdev, domain);
-	} else if (vdomain->viommu != vdev->viommu) {
-		ret = -EINVAL;
-	}
-	mutex_unlock(&vdomain->mutex);
-
-	if (ret)
-		return ret;
+	if (vdomain->viommu != vdev->viommu)
+		return -EINVAL;
 
 	/*
 	 * In the virtio-iommu device, when attaching the endpoint to a new
@@ -1096,9 +1087,9 @@ static bool viommu_capable(struct device *dev, enum iommu_cap cap)
 }
 
 static struct iommu_ops viommu_ops = {
-	.identity_domain	= &viommu_identity_domain.domain,
 	.capable		= viommu_capable,
-	.domain_alloc		= viommu_domain_alloc,
+	.domain_alloc_identity	= viommu_domain_alloc_identity,
+	.domain_alloc_paging	= viommu_domain_alloc_paging,
 	.probe_device		= viommu_probe_device,
 	.release_device		= viommu_release_device,
 	.device_group		= viommu_device_group,
@@ -1224,12 +1215,6 @@ static int viommu_probe(struct virtio_device *vdev)
 	if (virtio_has_feature(viommu->vdev, VIRTIO_IOMMU_F_BYPASS_CONFIG)) {
 		viommu->identity_domain_id = viommu->first_domain;
 		viommu->first_domain++;
-	} else {
-		/*
-		 * Assume the VMM is sensible and it either supports bypass on
-		 * all instances or no instances.
-		 */
-		viommu_ops.identity_domain = NULL;
 	}
 
 	viommu_ops.pgsize_bitmap = viommu->pgsize_bitmap;

From ec970c520bea37cce04f0afaf18cbc2efc56ffca Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Tue, 8 Apr 2025 13:35:50 -0300
Subject: [PATCH 069/147] iommu: Do not call domain_alloc() in iommu_sva_domain_alloc()

No driver implements SVA under domain_alloc() anymore; this is dead
code.

Reviewed-by: Lu Baolu
Signed-off-by: Jason Gunthorpe
Reviewed-by: Kevin Tian
Link: https://lore.kernel.org/r/4-v4-ff5fb6b03bd1+288-iommu_virtio_domains_jgg@nvidia.com
Signed-off-by: Joerg Roedel
(cherry picked from commit a4672d0fe17dd2b5b2f485ae6c98990164e874eb)
Signed-off-by: Nirmoy Das
---
 drivers/iommu/iommu-sva.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index ab18bc494eefd..9dcf2014da4d9 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -299,15 +299,12 @@ static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
 	const struct iommu_ops *ops = dev_iommu_ops(dev);
 	struct iommu_domain *domain;
 
-	if (ops->domain_alloc_sva) {
-		domain = ops->domain_alloc_sva(dev, mm);
-		if (IS_ERR(domain))
-			return domain;
-	} else {
-		domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
-		if (!domain)
-			return ERR_PTR(-ENOMEM);
-	}
+	if (!ops->domain_alloc_sva)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	domain = ops->domain_alloc_sva(dev, mm);
+	if (IS_ERR(domain))
+		return domain;
 
 	domain->type = IOMMU_DOMAIN_SVA;
 	domain->cookie_type = IOMMU_COOKIE_SVA;

From b69a401e3b7ed59086b2b272e1dd08fa8ccd0fef Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Tue, 8 Apr 2025 13:35:51 -0300
Subject: [PATCH 070/147] iommu: Hide ops.domain_alloc behind CONFIG_FSL_PAMU

fsl_pamu is the last user of domain_alloc(), and it is using it to
create something weird that doesn't really fit into the iommu subsystem
architecture. It is not a paging domain since it doesn't have any
map/unmap ops. It may be some special kind of identity domain.

For now just leave it as is. Wrap its definition in CONFIG_FSL_PAMU to
discourage any new drivers from attempting to use it.
Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/5-v4-ff5fb6b03bd1+288-iommu_virtio_domains_jgg@nvidia.com Signed-off-by: Joerg Roedel (cherry picked from commit 21c03574df19f0d77cb2e4d28bc02c79b21e656a) Signed-off-by: Nirmoy Das --- drivers/iommu/iommu.c | 2 ++ include/linux/iommu.h | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 8d8c00789e72c..c0430bb1c73ab 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2177,8 +2177,10 @@ __iommu_paging_domain_alloc_flags(struct device *dev, unsigned int type, domain = ops->domain_alloc_paging(dev); else if (ops->domain_alloc_paging_flags) domain = ops->domain_alloc_paging_flags(dev, flags, NULL); +#if IS_ENABLED(CONFIG_FSL_PAMU) else if (ops->domain_alloc && !flags) domain = ops->domain_alloc(IOMMU_DOMAIN_UNMANAGED); +#endif else return ERR_PTR(-EOPNOTSUPP); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 91b16c69d0614..1803d5a924ce8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -569,9 +569,7 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * op is allocated in the iommu driver and freed by the caller after * use. The information type is one of enum iommu_hw_info_type defined * in include/uapi/linux/iommufd.h. - * @domain_alloc: allocate and return an iommu domain if success. Otherwise - * NULL is returned. The domain is not fully initialized until - * the caller iommu_domain_alloc() returns. + * @domain_alloc: Do not use in new drivers * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to * use identity_domain instead. This should only be used * if dynamic logic is necessary. 
@@ -632,7 +630,9 @@ struct iommu_ops { void *(*hw_info)(struct device *dev, u32 *length, u32 *type); /* Domain allocation and freeing by the iommu driver */ +#if IS_ENABLED(CONFIG_FSL_PAMU) struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); +#endif struct iommu_domain *(*domain_alloc_identity)(struct device *dev); struct iommu_domain *(*domain_alloc_paging_flags)( struct device *dev, u32 flags, From 7888476636be7d5e27c420bd9d814ac530d9d0c1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:13 -0700 Subject: [PATCH 071/147] iommufd: Apply obvious cosmetic fixes Run clang-format but exclude those not so obvious ones, which leaves us: - Align indentations - Add missing spaces - Remove unnecessary spaces - Remove unnecessary line wrappings Link: https://patch.msgid.link/r/9132e1ab45690ab1959c66bbb51ac5536a635388.1749882255.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit ea92128fe7f6eef6ee5fcaaed521b1b2b5ab7c9a linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 3 +-- drivers/iommu/iommufd/hw_pagetable.c | 6 ++---- drivers/iommu/iommufd/io_pagetable.c | 3 +-- drivers/iommu/iommufd/io_pagetable.h | 2 +- drivers/iommu/iommufd/iommufd_private.h | 6 ++---- drivers/iommu/iommufd/iova_bitmap.c | 1 - drivers/iommu/iommufd/main.c | 6 ++---- drivers/iommu/iommufd/pages.c | 9 ++++----- drivers/iommu/iommufd/selftest.c | 24 +++++++++++------------- include/linux/iommufd.h | 5 +++-- 10 files changed, 27 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 2111bad72c720..16c17ae1f5f29 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -473,8 +473,7 @@ iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid) lockdep_assert_held(&idev->igroup->lock); - handle = - iommu_attach_handle_get(idev->igroup->group, pasid, 0); + handle = iommu_attach_handle_get(idev->igroup->group, pasid, 0); if (IS_ERR(handle)) return NULL; return to_iommufd_handle(handle); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 487779470261a..8565a6f596b23 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -309,10 +309,8 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, refcount_inc(&viommu->obj.users); hwpt_nested->parent = viommu->hwpt; - hwpt->domain = - viommu->ops->alloc_domain_nested(viommu, - flags & ~IOMMU_HWPT_FAULT_ID_VALID, - user_data); + hwpt->domain = viommu->ops->alloc_domain_nested( + viommu, flags & ~IOMMU_HWPT_FAULT_ID_VALID, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 8a790e597e125..13d010f19ed19 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -1410,8 +1410,7 @@ int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) } void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access, - u32 iopt_access_list_id) + struct iommufd_access *access, u32 iopt_access_list_id) { down_write(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 10c928a9a4633..c115a51d93846 100644 --- 
a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -240,7 +240,7 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start, unsigned long last, struct page **out_pages, unsigned int flags); void iopt_area_remove_access(struct iopt_area *area, unsigned long start, - unsigned long last); + unsigned long last); int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, void *data, unsigned long length, unsigned int flags); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 80e8c76d25f23..10899e2cd34a5 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -266,8 +266,7 @@ struct iommufd_ioas { static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx, u32 id) { - return container_of(iommufd_get_object(ictx, id, - IOMMUFD_OBJ_IOAS), + return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS), struct iommufd_ioas, obj); } @@ -455,8 +454,7 @@ struct iommufd_access { int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, - struct iommufd_access *access, - u32 iopt_access_list_id); + struct iommufd_access *access, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); struct iommufd_eventq { diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 39a86a4a1d3af..4514575818fc0 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -407,7 +407,6 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, update_indexes: if (unlikely(!iova_bitmap_mapped_range(mapped, iova, length))) { - /* * The attempt to advance the base index to @iova * may fail if it's out of bounds, or pinning the pages diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 3df468f64e7d9..347c56ef44d83 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -102,9 +102,8 @@ static int iommufd_object_dec_wait_shortterm(struct iommufd_ctx *ictx, return 0; if (wait_event_timeout(ictx->destroy_wait, - refcount_read(&to_destroy->shortterm_users) == - 0, - msecs_to_jiffies(60000))) + refcount_read(&to_destroy->shortterm_users) == 0, + msecs_to_jiffies(60000))) return 0; pr_crit("Time out waiting for iommufd object to become free\n"); @@ -539,7 +538,6 @@ static struct miscdevice iommu_misc_dev = { .mode = 0660, }; - static struct miscdevice vfio_misc_dev = { .minor = VFIO_MINOR, .name = "vfio", diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index cdc115864f383..d6680483fdd71 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1360,8 +1360,7 @@ static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, } static struct iopt_pages *iopt_alloc_pages(unsigned long start_byte, - unsigned long length, - bool writable) + unsigned long length, bool writable) { struct iopt_pages *pages; @@ -1401,7 +1400,7 @@ struct iopt_pages *iopt_alloc_user_pages(void __user *uptr, struct iopt_pages *pages; unsigned long end; void __user *uptr_down = - (void __user *) ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); if (check_add_overflow((unsigned long)uptr, length, &end)) return ERR_PTR(-EOVERFLOW); @@ -2184,8 +2183,8 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, * This should be undone through 
a matching call to iopt_area_remove_access() */ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, - unsigned long last_index, struct page **out_pages, - unsigned int flags) + unsigned long last_index, struct page **out_pages, + unsigned int flags) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 18d9a216eb30d..8cd98a72767d0 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1179,9 +1179,8 @@ static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, return 0; } -static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, - u32 mockpt_id, unsigned int iotlb_id, - u32 iotlb) +static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, u32 mockpt_id, + unsigned int iotlb_id, u32 iotlb) { struct mock_iommu_domain_nested *mock_nested; struct iommufd_hw_pagetable *hwpt; @@ -1454,7 +1453,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, int rc; /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ - if (length > 16*1024*1024) + if (length > 16 * 1024 * 1024) return -ENOMEM; if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ)) @@ -1471,7 +1470,7 @@ static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, if (flags & MOCK_FLAGS_ACCESS_SYZ) iova = iommufd_test_syz_conv_iova(staccess->access, - &cmd->access_pages.iova); + &cmd->access_pages.iova); npages = (ALIGN(iova + length, PAGE_SIZE) - ALIGN_DOWN(iova, PAGE_SIZE)) / @@ -1547,7 +1546,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, int rc; /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ - if (length > 16*1024*1024) + if (length > 16 * 1024 * 1024) return -ENOMEM; if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH | @@ -1573,7 +1572,7 @@ static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, if (flags & MOCK_FLAGS_ACCESS_SYZ) iova = iommufd_test_syz_conv_iova(staccess->access, - &cmd->access_rw.iova); + &cmd->access_rw.iova); rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags); if (rc) @@ -1628,7 +1627,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, goto out_put; } - if (copy_from_user(tmp, uptr,DIV_ROUND_UP(max, BITS_PER_BYTE))) { + if (copy_from_user(tmp, uptr, DIV_ROUND_UP(max, BITS_PER_BYTE))) { rc = -EFAULT; goto out_free; } @@ -1664,7 +1663,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, struct iommu_test_cmd *cmd) { - struct iopf_fault event = { }; + struct iopf_fault event = {}; struct iommufd_device *idev; idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id); @@ -1795,8 +1794,7 @@ static int iommufd_test_pasid_attach(struct iommufd_ucmd *ucmd, rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) - iommufd_device_detach(sobj->idev.idev, - cmd->pasid_attach.pasid); + iommufd_device_detach(sobj->idev.idev, cmd->pasid_attach.pasid); out_sobj: iommufd_put_object(ucmd->ictx, &sobj->obj); @@ -1967,8 +1965,8 @@ int __init iommufd_test_init(void) goto err_bus; rc = iommu_device_register_bus(&mock_iommu.iommu_dev, &mock_ops, - &iommufd_mock_bus_type.bus, - &iommufd_mock_bus_type.nb); + &iommufd_mock_bus_type.bus, + &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 34b6e6ca4bfa0..498c9a7685065 100644 --- a/include/linux/iommufd.h +++ 
b/include/linux/iommufd.h @@ -171,8 +171,9 @@ static inline void iommufd_access_unpin_pages(struct iommufd_access *access, { } -static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, - void *data, size_t len, unsigned int flags) +static inline int iommufd_access_rw(struct iommufd_access *access, + unsigned long iova, void *data, size_t len, + unsigned int flags) { return -EOPNOTSUPP; } From 435a2daab02be9db2061919e1c244c7092f5b3e0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:14 -0700 Subject: [PATCH 072/147] iommufd: Drop unused ictx in struct iommufd_vdevice The core code can always get the ictx pointer via vdev->viommu->ictx, thus drop this unused one. Link: https://patch.msgid.link/r/6cbb65e8df433de45b6c3a4bb2c5df09faca8a7c.1749882255.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit 6e235a77219934d24cc356336a811ed19e439765 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_private.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 10899e2cd34a5..a84aec14d3f3c 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -607,7 +607,6 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj); struct iommufd_vdevice { struct iommufd_object obj; - struct iommufd_ctx *ictx; struct iommufd_viommu *viommu; struct device *dev; u64 id; /* per-vIOMMU virtual ID */ From bdca5a3c8f18e48da5963888a0419f7bbb883a65 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:15 -0700 Subject: [PATCH 073/147] iommufd: Use enum iommu_viommu_type for type in struct iommufd_viommu Replace unsigned int, to make it clear. No functional changes. The viommu_alloc iommu op will be deprecated, so don't change that. Link: https://patch.msgid.link/r/6c6ba5c0cd381594f17ae74355872d78d7a022c0.1749882255.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Pranjal Shrivastava Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit fc9c40e3a4faa09dbd643ae1bdaf8ad006c3bc28 linux-next) Signed-off-by: Nirmoy Das --- include/linux/iommufd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 498c9a7685065..ac98e49e44feb 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -101,7 +101,7 @@ struct iommufd_viommu { struct list_head veventqs; struct rw_semaphore veventqs_rwsem; - unsigned int type; + enum iommu_viommu_type type; }; /** From 974bf399d4f48f7b86928eacc204040b2ab1f5f1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:16 -0700 Subject: [PATCH 074/147] iommufd: Use enum iommu_veventq_type for type in struct iommufd_veventq Replace unsigned int, to make it clear. No functional changes. 
Link: https://patch.msgid.link/r/208a260c100a00667d3799feaad1260745f96c6b.1749882255.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe (cherry picked from commit 62b62a55bd30164f8d256b22a60181085238859d linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_private.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index a84aec14d3f3c..78c49eadaff37 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -532,7 +532,7 @@ struct iommufd_veventq { struct list_head node; /* for iommufd_viommu::veventqs */ struct iommufd_vevent lost_events_header; - unsigned int type; + enum iommu_veventq_type type; unsigned int depth; /* Use common.lock for protection */ @@ -587,7 +587,8 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) } static inline struct iommufd_veventq * -iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, u32 type) +iommufd_viommu_find_veventq(struct iommufd_viommu *viommu, + enum iommu_veventq_type type) { struct iommufd_veventq *veventq, *next; From c9adfa0f643f5dd955c254b7ce9dc041c9202659 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:17 -0700 Subject: [PATCH 075/147] iommufd: Return EOPNOTSUPP for failures due to driver bugs It's more accurate to report EOPNOTSUPP when an ioctl failed due to driver bug, since there is nothing wrong with the user space side. Link: https://patch.msgid.link/r/623bb6f0e8fdd7b9c5745a2f99f280163f9f1f5a.1749882255.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe (cherry picked from commit 0c6e0ae7a7e49fb0e781d3fbf24004e1b6b586d1 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 2 +- drivers/iommu/iommufd/hw_pagetable.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 16c17ae1f5f29..50c0c41d9229b 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1484,7 +1484,7 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) */ if (WARN_ON_ONCE(cmd->out_data_type == IOMMU_HW_INFO_TYPE_NONE)) { - rc = -ENODEV; + rc = -EOPNOTSUPP; goto out_free; } } else { diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 8565a6f596b23..fe789c2dc0c97 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -264,7 +264,7 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { - rc = -EINVAL; + rc = -EOPNOTSUPP; goto out_abort; } return hwpt_nested; @@ -321,7 +321,7 @@ iommufd_viommu_alloc_hwpt_nested(struct iommufd_viommu *viommu, u32 flags, hwpt->domain->cookie_type = IOMMU_COOKIE_IOMMUFD; if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { - rc = -EINVAL; + rc = -EOPNOTSUPP; goto out_abort; } return hwpt_nested; From 23690a3130627e5898debccc6d0781f9072b5c88 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:18 -0700 Subject: [PATCH 076/147] iommu: Introduce get_viommu_size and viommu_init ops So far, a vIOMMU object has been 
allocated by the IOMMU driver and initialized with the driver-level structure, before it returns to the iommufd core for core-level structure initialization. This has required the iommufd core to expose some core structures/helpers in its driver.c file, which results in a size increase of this driver module. Meanwhile, IOMMU drivers now require more vIOMMU-based structures for advanced features, such as the existing vDEVICE and a future HW_QUEUE. Initializing the core structure later than the driver structure gives the for-driver helpers trouble, when the IOMMU driver uses them assuming that the new structure (including the core part) is fully initialized, for example:

   core: viommu = ops->viommu_alloc();
 driver: // my_viommu is successfully allocated
 driver: my_viommu = iommufd_viommu_alloc(...);
 driver: // This may crash if it reads viommu->ictx
 driver: new = iommufd_new_viommu_helper(my_viommu->core ...);
   core: viommu->ictx = ucmd->ictx;
   core: ...

To ease this situation, allow the IOMMU driver to report the size of its vIOMMU structure, let the core allocate a vIOMMU object and initialize the core-level structure first, and then hand it over to the driver to initialize its driver-level structure. Thus, this requires two new iommu ops, get_viommu_size and viommu_init, so the iommufd core can communicate with drivers to replace the viommu_alloc op:

   core: size = ops->get_viommu_size();
 driver: return VIOMMU_STRUCT_SIZE();
   core: viommu->ictx = ucmd->ictx; // and others
   core: rc = ops->viommu_init();
 driver: // This is safe now as viommu->ictx is inited
 driver: new = iommufd_new_viommu_helper(my_viommu->core ...);
   core: ...

This also adds a VIOMMU_STRUCT_SIZE macro for drivers to use, which statically sanitizes the driver structure. Link: https://patch.msgid.link/r/3ab52c5b622dad476c43b1b1f1636c8b902f1692.1749882255.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe (cherry picked from commit 187f146d5de65d50d90a4f49157d381d8ae32939 linux-next) Signed-off-by: Nirmoy Das --- include/linux/iommu.h | 15 +++++++++++++++ include/linux/iommufd.h | 6 ++++++ 2 files changed, 21 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1803d5a924ce8..0e9a8f61f6e12 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -14,6 +14,7 @@ #include #include #include +#include <uapi/linux/iommufd.h> #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) @@ -603,6 +604,16 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * - IOMMU_DOMAIN_DMA: must use a dma domain * - 0: use the default setting * @default_domain_ops: the default ops for domains + * @get_viommu_size: Get the size of a driver-level vIOMMU structure for a given + * @dev corresponding to @viommu_type. Driver should return 0 + * if vIOMMU isn't supported accordingly. It is required for + * driver to use the VIOMMU_STRUCT_SIZE macro to sanitize the + * driver-level vIOMMU structure related to the core one + * @viommu_init: Init the driver-level struct of an iommufd_viommu on a physical + * IOMMU instance @viommu->iommu_dev, as the set of virtualization + * resources shared/passed to user space IOMMU instance. Associate + * it with a nesting @parent_domain.
It is required for driver to + set @viommu->ops pointing to its own viommu_ops * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind * the @dev, as the set of virtualization resources shared/passed * to user space IOMMU instance. And associate it with a nesting @@ -664,6 +675,10 @@ struct iommu_ops { int (*def_domain_type)(struct device *dev); + size_t (*get_viommu_size)(struct device *dev, + enum iommu_viommu_type viommu_type); + int (*viommu_init)(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain); struct iommufd_viommu *(*viommu_alloc)( struct device *dev, struct iommu_domain *parent_domain, struct iommufd_ctx *ictx, unsigned int viommu_type); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index ac98e49e44feb..423e08963d901 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -229,6 +229,12 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, } #endif /* CONFIG_IOMMUFD_DRIVER_CORE */ +#define VIOMMU_STRUCT_SIZE(drv_struct, member) \ + (sizeof(drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ + BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_viommu, \ + ((drv_struct *)NULL)->member))) + /* * Helpers for IOMMU driver to allocate driver structures that will be freed by * the iommufd core. The free op will be called prior to freeing the memory. From 47ee4f49b153b94c99df001a8edc67160e1d00eb Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:19 -0700 Subject: [PATCH 077/147] iommufd/viommu: Support get_viommu_size and viommu_init ops To ease the for-driver iommufd APIs, get_viommu_size and viommu_init ops are introduced to replace the viommu_alloc op. Let the new viommu_init pathway coexist with the old viommu_alloc one. Since the viommu_alloc op and its pathway will soon be deprecated, try to minimize the code difference between them by adding a tentative jump tag. Note that this fails a !viommu->ops case from now on with a WARN_ON_ONCE, since a vIOMMU is expected to support an alloc_domain_nested op for now, or some sort of a viommu op in the foreseeable future. This WARN_ON_ONCE can be lifted if someday there is a use case wanting !viommu->ops.
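For orientation before the per-driver conversions in the following patches, a minimal driver-side counterpart might look like the sketch below (struct my_viommu, MY_VIOMMU_TYPE and my_viommu_ops are hypothetical placeholders, not names from this series):

    struct my_viommu {
    	struct iommufd_viommu core;	/* core part must be the first member */
    	u32 vmid;			/* driver-private state */
    };

    static size_t my_get_viommu_size(struct device *dev,
    				     enum iommu_viommu_type viommu_type)
    {
    	if (viommu_type != MY_VIOMMU_TYPE)
    		return 0;		/* 0 means the type is not supported */
    	return VIOMMU_STRUCT_SIZE(struct my_viommu, core);
    }

    static int my_viommu_init(struct iommufd_viommu *viommu,
    			      struct iommu_domain *parent_domain)
    {
    	struct my_viommu *my_viommu =
    		container_of(viommu, struct my_viommu, core);

    	my_viommu->vmid = 0;		/* viommu->ictx etc. are already valid here */
    	viommu->ops = &my_viommu_ops;	/* leaving this NULL trips the WARN_ON_ONCE */
    	return 0;
    }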
Link: https://patch.msgid.link/r/35c5fa5926be45bda82f5fc87545cd3180ad4c9c.1749882255.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit 63141fa741da27ffe3220a2395229098e7184c98 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/viommu.c | 42 +++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 01df2b985f02a..27a39f524840c 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -21,6 +21,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) struct iommufd_viommu *viommu; struct iommufd_device *idev; const struct iommu_ops *ops; + size_t viommu_size; int rc; if (cmd->flags || cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) @@ -31,11 +32,29 @@ return PTR_ERR(idev); ops = dev_iommu_ops(idev->dev); - if (!ops->viommu_alloc) { + if (!ops->get_viommu_size || !ops->viommu_init) { + if (ops->viommu_alloc) + goto get_hwpt_paging; + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + viommu_size = ops->get_viommu_size(idev->dev, cmd->type); + if (!viommu_size) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + + /* + * It is a driver bug for providing a viommu_size smaller than the core + * vIOMMU structure size + */ + if (WARN_ON_ONCE(viommu_size < sizeof(*viommu))) { rc = -EOPNOTSUPP; goto out_put_idev; } +get_hwpt_paging: hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); @@ -47,8 +66,13 @@ goto out_put_hwpt; } - viommu = ops->viommu_alloc(idev->dev, hwpt_paging->common.domain, - ucmd->ictx, cmd->type); + if (ops->viommu_alloc) + viommu = ops->viommu_alloc(idev->dev, + hwpt_paging->common.domain, + ucmd->ictx, cmd->type); + else + viommu = (struct iommufd_viommu *)_iommufd_object_alloc( + ucmd->ictx, viommu_size, IOMMUFD_OBJ_VIOMMU); if (IS_ERR(viommu)) { rc = PTR_ERR(viommu); goto out_put_hwpt; @@ -68,6 +92,18 @@ */ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); + if (!ops->viommu_alloc) { + rc = ops->viommu_init(viommu, hwpt_paging->common.domain); + if (rc) + goto out_abort; + } + + /* It is a driver bug that viommu->ops isn't filled */ + if (WARN_ON_ONCE(!viommu->ops)) { + rc = -EOPNOTSUPP; + goto out_abort; + } + cmd->out_viommu_id = viommu->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) From aa8b24099677d05100f264a08097adcee93d8801 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:20 -0700 Subject: [PATCH 078/147] iommufd/selftest: Drop parent domain from mock_iommu_domain_nested There is no use of this parent domain. Delete the dead code. Note that the s2_parent in struct mock_viommu will be dead code too. Yet, keep it because it will soon be used by HW queue objects, i.e. there is no point in adding it back and forth in such a short window. Besides, keeping it could cover the majority of vIOMMU use cases, where a driver-level structure will be larger in size than the core structure.
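To illustrate the static sanitization mentioned above, a driver structure that violates the VIOMMU_STRUCT_SIZE assumptions fails to compile; a hypothetical sketch:

    struct bad_viommu {
    	u32 flags;			/* core is not the first member */
    	struct iommufd_viommu core;
    };

    /*
     * VIOMMU_STRUCT_SIZE(struct bad_viommu, core) does not build:
     * BUILD_BUG_ON_ZERO(offsetof(struct bad_viommu, core)) errors out on
     * the non-zero offset, and a wrongly typed member would trip the
     * __same_type() check instead.
     */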
Link: https://patch.msgid.link/r/0f155a7cd71034a498448fe4828fb4aaacdabf95.1749882255.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit 5983d1e7d7586c32605fdffc68e5ff61bf917022 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/selftest.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 8cd98a72767d0..edd2e190fc132 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -135,7 +135,6 @@ to_mock_domain(struct iommu_domain *domain) struct mock_iommu_domain_nested { struct iommu_domain domain; struct mock_viommu *mock_viommu; - struct mock_iommu_domain *parent; u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; }; @@ -415,7 +414,6 @@ mock_domain_alloc_nested(struct device *dev, struct iommu_domain *parent, mock_nested = __mock_domain_alloc_nested(user_data); if (IS_ERR(mock_nested)) return ERR_CAST(mock_nested); - mock_nested->parent = mock_parent; return &mock_nested->domain; } @@ -653,7 +651,6 @@ mock_viommu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, if (IS_ERR(mock_nested)) return ERR_CAST(mock_nested); mock_nested->mock_viommu = mock_viommu; - mock_nested->parent = mock_viommu->s2_parent; return &mock_nested->domain; } From c94dd42b5cf316cff8d198d24c7ef5edbe89e63f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:21 -0700 Subject: [PATCH 079/147] iommufd/selftest: Replace mock_viommu_alloc with mock_viommu_init To ease the for-driver iommufd APIs, get_viommu_size and viommu_init ops are introduced. Sanitize the inputs and report the size of struct mock_viommu on success, in mock_get_viommu_size(). The core will ensure the viommu_type is set to the core vIOMMU object, so simply init the driver part in mock_viommu_init(). Remove the mock_viommu_alloc, completing the replacement. 
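For reference, the core-side ordering that guarantees this is roughly the following (simplified from iommufd_viommu_alloc_ioctl() in the earlier patch; error handling and refcounting omitted):

    viommu_size = ops->get_viommu_size(idev->dev, cmd->type);
    viommu = (struct iommufd_viommu *)_iommufd_object_alloc(
    		ucmd->ictx, viommu_size, IOMMUFD_OBJ_VIOMMU);
    viommu->type = cmd->type;	/* core fields are valid before the driver runs */
    viommu->ictx = ucmd->ictx;
    viommu->hwpt = hwpt_paging;
    viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev);
    rc = ops->viommu_init(viommu, hwpt_paging->common.domain);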
Link: https://patch.msgid.link/r/993beabbb0bc9705d979a92801ea5ed5996a34eb.1749882255.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe (cherry picked from commit 683cff7c3bf4495c2378ede5a3601271958c08fe linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/selftest.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index edd2e190fc132..9de7ae2a6342b 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -730,25 +730,23 @@ static struct iommufd_viommu_ops mock_viommu_ops = { .cache_invalidate = mock_viommu_cache_invalidate, }; -static struct iommufd_viommu *mock_viommu_alloc(struct device *dev, - struct iommu_domain *domain, - struct iommufd_ctx *ictx, - unsigned int viommu_type) +static size_t mock_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type) { - struct mock_iommu_device *mock_iommu = - iommu_get_iommu_dev(dev, struct mock_iommu_device, iommu_dev); - struct mock_viommu *mock_viommu; - if (viommu_type != IOMMU_VIOMMU_TYPE_SELFTEST) - return ERR_PTR(-EOPNOTSUPP); + return 0; + return VIOMMU_STRUCT_SIZE(struct mock_viommu, core); +} - mock_viommu = iommufd_viommu_alloc(ictx, struct mock_viommu, core, - &mock_viommu_ops); - if (IS_ERR(mock_viommu)) - return ERR_CAST(mock_viommu); +static int mock_viommu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain) +{ + struct mock_iommu_device *mock_iommu = container_of( + viommu->iommu_dev, struct mock_iommu_device, iommu_dev); refcount_inc(&mock_iommu->users); - return &mock_viommu->core; + viommu->ops = &mock_viommu_ops; + return 0; } static const struct iommu_ops mock_ops = { @@ -770,7 +768,8 @@ static const struct iommu_ops mock_ops = { .dev_enable_feat = mock_dev_enable_feat, .dev_disable_feat = mock_dev_disable_feat, .user_pasid_table = true, - .viommu_alloc = mock_viommu_alloc, + .get_viommu_size = mock_get_viommu_size, + .viommu_init = mock_viommu_init, .default_domain_ops = &(struct iommu_domain_ops){ .free = mock_domain_free, From c629ee04d5d7a4e9376c9f43c47583e33da375d8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:22 -0700 Subject: [PATCH 080/147] iommu/arm-smmu-v3: Replace arm_vsmmu_alloc with arm_vsmmu_init To ease the for-driver iommufd APIs, get_viommu_size and viommu_init ops are introduced. Sanitize the inputs and report the size of struct arm_vsmmu on success, in arm_smmu_get_viommu_size(). Place the type sanity check last, because there will soon be an impl-level get_viommu_size op, which will require the same sanity tests to run first. It can simply insert a piece of code in front of the IOMMU_VIOMMU_TYPE_ARM_SMMUV3 sanity check. The core will ensure the viommu_type is set to the core vIOMMU object, and pass in the same dev pointer, so arm_vsmmu_init() won't need to repeat the same sanity tests and can simply init the arm_vsmmu struct. Remove arm_vsmmu_alloc, completing the replacement.
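The intended check ordering can be sketched as below; the impl-level hook is hypothetical here, anticipating the tegra241-cmdqv work mentioned above (its signature is an assumption, not taken from this series):

    size_t arm_smmu_get_viommu_size(struct device *dev,
    				    enum iommu_viommu_type viommu_type)
    {
    	/* ... HW capability sanity checks on master/smmu come first ... */

    	/*
    	 * A future impl-level op would slot in right here, e.g.:
    	 * if (smmu->impl_ops && smmu->impl_ops->get_viommu_size)
    	 *	return smmu->impl_ops->get_viommu_size(dev, viommu_type);
    	 */

    	if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3)
    		return 0;
    	return VIOMMU_STRUCT_SIZE(struct arm_vsmmu, core);
    }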
Link: https://patch.msgid.link/r/64e4b4c33acd26e1bd676e077be80e00fb63f17c.1749882255.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Acked-by: Will Deacon Signed-off-by: Jason Gunthorpe (cherry picked from commit 3961f2f5daccf4b54d499d7e155a1b46d17d385a linux-next) Signed-off-by: Nirmoy Das --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 46 ++++++++++--------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 11 +++-- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index e4fd8d522af88..9f59c95a254cc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -382,25 +382,14 @@ static const struct iommufd_viommu_ops arm_vsmmu_ops = { .cache_invalidate = arm_vsmmu_cache_invalidate, }; -struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, - struct iommu_domain *parent, - struct iommufd_ctx *ictx, - unsigned int viommu_type) +size_t arm_smmu_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type) { - struct arm_smmu_device *smmu = - iommu_get_iommu_dev(dev, struct arm_smmu_device, iommu); struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct arm_smmu_domain *s2_parent = to_smmu_domain(parent); - struct arm_vsmmu *vsmmu; - - if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) - return ERR_PTR(-EOPNOTSUPP); + struct arm_smmu_device *smmu = master->smmu; if (!(smmu->features & ARM_SMMU_FEAT_NESTING)) - return ERR_PTR(-EOPNOTSUPP); - - if (s2_parent->smmu != master->smmu) - return ERR_PTR(-EINVAL); + return 0; /* * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW @@ -408,7 +397,7 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, * any change to remove this. 
*/ if (WARN_ON(smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) - return ERR_PTR(-EOPNOTSUPP); + return 0; /* * Must support some way to prevent the VM from bypassing the cache @@ -420,19 +409,32 @@ struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, */ if (!arm_smmu_master_canwbs(master) && !(smmu->features & ARM_SMMU_FEAT_S2FWB)) - return ERR_PTR(-EOPNOTSUPP); + return 0; - vsmmu = iommufd_viommu_alloc(ictx, struct arm_vsmmu, core, - &arm_vsmmu_ops); - if (IS_ERR(vsmmu)) - return ERR_CAST(vsmmu); + if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) + return 0; + + return VIOMMU_STRUCT_SIZE(struct arm_vsmmu, core); +} + +int arm_vsmmu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain) +{ + struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); + struct arm_smmu_device *smmu = + container_of(viommu->iommu_dev, struct arm_smmu_device, iommu); + struct arm_smmu_domain *s2_parent = to_smmu_domain(parent_domain); + + if (s2_parent->smmu != smmu) + return -EINVAL; vsmmu->smmu = smmu; vsmmu->s2_parent = s2_parent; /* FIXME Move VMID allocation from the S2 domain allocation to here */ vsmmu->vmid = s2_parent->s2_cfg.vmid; - return &vsmmu->core; + viommu->ops = &arm_vsmmu_ops; + return 0; } int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0826b6bdf327f..b8a2f17712671 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3674,7 +3674,8 @@ static struct iommu_ops arm_smmu_ops = { .dev_disable_feat = arm_smmu_dev_disable_feature, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, - .viommu_alloc = arm_vsmmu_alloc, + .get_viommu_size = arm_smmu_get_viommu_size, + .viommu_init = arm_vsmmu_init, .user_pasid_table = 1, .pgsize_bitmap = -1UL, /* Restricted during device attach */ .owner = THIS_MODULE, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index dd1ad56ce8639..8e624c7d1b9f3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -1059,18 +1059,19 @@ struct arm_vsmmu { #if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type); -struct iommufd_viommu *arm_vsmmu_alloc(struct device *dev, - struct iommu_domain *parent, - struct iommufd_ctx *ictx, - unsigned int viommu_type); +size_t arm_smmu_get_viommu_size(struct device *dev, + enum iommu_viommu_type viommu_type); +int arm_vsmmu_init(struct iommufd_viommu *viommu, + struct iommu_domain *parent_domain); int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, struct arm_smmu_nested_domain *nested_domain); void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt); #else +#define arm_smmu_get_viommu_size NULL #define arm_smmu_hw_info NULL -#define arm_vsmmu_alloc NULL +#define arm_vsmmu_init NULL static inline int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, From 839d509f31cde69e06cd3e96f470f78019ffee3e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:23 -0700 Subject: [PATCH 081/147] iommu: Deprecate viommu_alloc op To ease the for-driver iommufd APIs, get_viommu_size and viommu_init ops are introduced. 
Now that the existing vIOMMU-supporting drivers implement these two ops, replacing the viommu_alloc one, there is no use of it. Remove it from the headers and the viommu core. Link: https://patch.msgid.link/r/5b32d4499d7ed02a63e57a293c11b642d226ef8d.1749882255.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Reviewed-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit f842ea208e43066c43e5e91e20fe8ce600df7055 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/viommu.c | 20 +++++--------------- include/linux/iommu.h | 11 ----------- include/linux/iommufd.h | 18 ------------------ 3 files changed, 5 insertions(+), 44 deletions(-) diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 27a39f524840c..044e3ef06e0f4 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -33,8 +33,6 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) ops = dev_iommu_ops(idev->dev); if (!ops->get_viommu_size || !ops->viommu_init) { - if (ops->viommu_alloc) - goto get_hwpt_paging; rc = -EOPNOTSUPP; goto out_put_idev; } @@ -54,7 +52,6 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_idev; } -get_hwpt_paging: hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); if (IS_ERR(hwpt_paging)) { rc = PTR_ERR(hwpt_paging); @@ -66,13 +63,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_hwpt; } - if (ops->viommu_alloc) - viommu = ops->viommu_alloc(idev->dev, - hwpt_paging->common.domain, - ucmd->ictx, cmd->type); - else - viommu = (struct iommufd_viommu *)_iommufd_object_alloc( - ucmd->ictx, viommu_size, IOMMUFD_OBJ_VIOMMU); + viommu = (struct iommufd_viommu *)_iommufd_object_alloc( + ucmd->ictx, viommu_size, IOMMUFD_OBJ_VIOMMU); if (IS_ERR(viommu)) { rc = PTR_ERR(viommu); goto out_put_hwpt; @@ -92,11 +84,9 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) */ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); - if (!ops->viommu_alloc) { - rc = ops->viommu_init(viommu, hwpt_paging->common.domain); - if (rc) - goto out_abort; - } + rc = ops->viommu_init(viommu, hwpt_paging->common.domain); + if (rc) + goto out_abort; /* It is a driver bug that viommu->ops isn't filled */ if (WARN_ON_ONCE(!viommu->ops)) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0e9a8f61f6e12..7928cbe56ef64 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -614,14 +614,6 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * resources shared/passed to user space IOMMU instance. Associate * it with a nesting @parent_domain. It is required for driver to * set @viommu->ops pointing to its own viommu_ops - * @viommu_alloc: Allocate an iommufd_viommu on a physical IOMMU instance behind - * the @dev, as the set of virtualization resources shared/passed - * to user space IOMMU instance. And associate it with a nesting - * @parent_domain. The @viommu_type must be defined in the header - * include/uapi/linux/iommufd.h - * It is required to call iommufd_viommu_alloc() helper for - * a bundled allocation of the core and the driver structures, - * using the given @ictx pointer.
* @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops * @identity_domain: An always available, always attachable identity * translation. @@ -679,9 +671,6 @@ struct iommu_ops { enum iommu_viommu_type viommu_type); int (*viommu_init)(struct iommufd_viommu *viommu, struct iommu_domain *parent_domain); - struct iommufd_viommu *(*viommu_alloc)( - struct device *dev, struct iommu_domain *parent_domain, - struct iommufd_ctx *ictx, unsigned int viommu_type); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 423e08963d901..bf41b242b9f68 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -234,22 +234,4 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_viommu, \ ((drv_struct *)NULL)->member))) - -/* - * Helpers for IOMMU driver to allocate driver structures that will be freed by - * the iommufd core. The free op will be called prior to freeing the memory. - */ -#define iommufd_viommu_alloc(ictx, drv_struct, member, viommu_ops) \ - ({ \ - drv_struct *ret; \ - \ - static_assert(__same_type(struct iommufd_viommu, \ - ((drv_struct *)NULL)->member)); \ - static_assert(offsetof(drv_struct, member.obj) == 0); \ - ret = (drv_struct *)_iommufd_object_alloc( \ - ictx, sizeof(drv_struct), IOMMUFD_OBJ_VIOMMU); \ - if (!IS_ERR(ret)) \ - ret->member.ops = viommu_ops; \ - ret; \ - }) #endif From 0439869f26526f14850cdd43998bb5b08298869c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:24 -0700 Subject: [PATCH 082/147] iommufd: Move _iommufd_object_alloc out of driver.c Now all driver structures will be allocated by the core, i.e. there is no longer a need for drivers to call _iommufd_object_alloc. Thus, move it back.
Before:
   text    data     bss     dec     hex filename
   3024     180       0    3204     c84 drivers/iommu/iommufd/driver.o
   9074     610      64    9748    2614 drivers/iommu/iommufd/main.o
After:
   text    data     bss     dec     hex filename
   2665     164       0    2829     b0d drivers/iommu/iommufd/driver.o
   9410     618      64   10092    276c drivers/iommu/iommufd/main.o
Link: https://patch.msgid.link/r/79e630c7b911930cf36e3c8a775a04e66c528d65.1749882255.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit 17a93473a552fc0ffdfb04e69a26946afd4a046a linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/driver.c | 33 ------------------------- drivers/iommu/iommufd/iommufd_private.h | 4 +++ drivers/iommu/iommufd/main.c | 32 ++++++++++++++++++++++++ include/linux/iommufd.h | 10 -------- 4 files changed, 36 insertions(+), 43 deletions(-) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 922cd1fe7ec20..2fee399a148ef 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -3,39 +3,6 @@ */ #include "iommufd_private.h" -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type) -{ - struct iommufd_object *obj; - int rc; - - obj = kzalloc(size, GFP_KERNEL_ACCOUNT); - if (!obj) - return ERR_PTR(-ENOMEM); - obj->type = type; - /* Starts out bias'd by 1 until it is removed from the xarray */ - refcount_set(&obj->shortterm_users, 1); - refcount_set(&obj->users, 1); - - /* - * Reserve an ID in the xarray but do not publish the pointer yet since - * the caller hasn't initialized it yet. Once the pointer is published - * in the xarray and visible to other threads we can't reliably destroy - * it anymore, so the caller must complete all errorable operations - * before calling iommufd_object_finalize().
- */ - rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, - GFP_KERNEL_ACCOUNT); - if (rc) - goto out_free; - return obj; -out_free: - kfree(obj); - return ERR_PTR(rc); -} -EXPORT_SYMBOL_NS_GPL(_iommufd_object_alloc, "IOMMUFD"); - /* Caller should xa_lock(&viommu->vdevs) to protect the return value */ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 78c49eadaff37..5f3542c0222cb 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -230,6 +230,10 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, iommufd_object_remove(ictx, obj, obj->id, 0); } +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); + #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 347c56ef44d83..85ad2853da0b7 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -29,6 +29,38 @@ struct iommufd_object_ops { static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, xa_limit_31b, + GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} + /* * Allow concurrent access to the object. 
* diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index bf41b242b9f68..2d1bf2f97ee31 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -190,9 +190,6 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) #endif /* CONFIG_IOMMUFD */ #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE) -struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type); struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, @@ -201,13 +198,6 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu, enum iommu_veventq_type type, void *event_data, size_t data_len); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ -static inline struct iommufd_object * -_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, - enum iommufd_object_type type) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline struct device * iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { From 485a99cc0082b9ebeabbb0ef13db96a48b193bc3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:25 -0700 Subject: [PATCH 083/147] iommufd: Introduce iommufd_object_alloc_ucmd helper An object allocator needs to call either iommufd_object_finalize() upon a success or iommufd_object_abort_and_destroy() upon an error code. To reduce duplication, store a new_obj in the struct iommufd_ucmd and call iommufd_object_finalize/iommufd_object_abort_and_destroy() accordingly in the main function. Similar to iommufd_object_alloc() and __iommufd_object_alloc(), add a pair of helpers: __iommufd_object_alloc_ucmd() and iommufd_object_alloc_ucmd(). Link: https://patch.msgid.link/r/e7206d4227844887cc8dbf0cc7b0242580fafd9d.1749882255.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Acked-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe (cherry picked from commit c0d498a1b99d5417f19c35ecd98ac0ff73c351a4 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_private.h | 26 +++++++++++++++++++++++++ drivers/iommu/iommufd/main.c | 25 ++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 5f3542c0222cb..70013d270e603 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -135,6 +135,7 @@ struct iommufd_ucmd { void __user *ubuffer; u32 user_size; void *cmd; + struct iommufd_object *new_obj; }; int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, @@ -230,6 +231,11 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx, iommufd_object_remove(ictx, obj, obj->id, 0); } +/* + * Callers of these normal object allocators must call iommufd_object_finalize() + * to finalize the object, or call iommufd_object_abort_and_destroy() to revert + * the allocation. + */ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); @@ -246,6 +252,26 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, #define iommufd_object_alloc(ictx, ptr, type) \ __iommufd_object_alloc(ictx, ptr, type, obj) +/* + * Callers of these _ucmd allocators should not call iommufd_object_finalize() + * or iommufd_object_abort_and_destroy(), as the core automatically does that. 
+ */ +struct iommufd_object * +_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, size_t size, + enum iommufd_object_type type); + +#define __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) \ + container_of(_iommufd_object_alloc_ucmd( \ + ucmd, \ + sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ + offsetof(typeof(*(ptr)), \ + obj) != 0), \ + type), \ + typeof(*(ptr)), obj) + +#define iommufd_object_alloc_ucmd(ucmd, ptr, type) \ + __iommufd_object_alloc_ucmd(ucmd, ptr, type, obj) + /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 85ad2853da0b7..778694d7c2075 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -61,6 +61,24 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, return ERR_PTR(rc); } +struct iommufd_object *_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *new_obj; + + /* Something is coded wrong if this is hit */ + if (WARN_ON(ucmd->new_obj)) + return ERR_PTR(-EBUSY); + + new_obj = _iommufd_object_alloc(ucmd->ictx, size, type); + if (IS_ERR(new_obj)) + return new_obj; + + ucmd->new_obj = new_obj; + return new_obj; +} + /* * Allow concurrent access to the object. * @@ -448,6 +466,13 @@ static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, if (ret) return ret; ret = op->execute(&ucmd); + + if (ucmd.new_obj) { + if (ret) + iommufd_object_abort_and_destroy(ictx, ucmd.new_obj); + else + iommufd_object_finalize(ictx, ucmd.new_obj); + } return ret; } From b9f5a7de4f595d0bd98b6e2b5d7ab5c06a67571e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 13 Jun 2025 23:35:26 -0700 Subject: [PATCH 084/147] iommufd: Apply the new iommufd_object_alloc_ucmd helper Now that the new ucmd-based object allocator eases the finalize/abort routine, apply it to all existing allocators that aren't protected by any lock. Upgrade the for-driver vIOMMU allocator too, passing the ucmd down to it accordingly. Note that __iommufd_object_alloc_ucmd() builds in some static tests that cover both static_asserts in the iommufd_viommu_alloc(). Thus drop them.
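The conversion pattern is mechanical; sketched on a hypothetical IOMMUFD_OBJ_FOO allocator (foo and cmd are placeholder names):

    /* Before: the allocator finalizes or aborts explicitly */
    foo = iommufd_object_alloc(ucmd->ictx, foo, IOMMUFD_OBJ_FOO);
    if (IS_ERR(foo))
    	return PTR_ERR(foo);
    rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
    if (rc)
    	goto out_abort;
    iommufd_object_finalize(ucmd->ictx, &foo->obj);
    return 0;
    out_abort:
    iommufd_object_abort_and_destroy(ucmd->ictx, &foo->obj);
    return rc;

    /* After: iommufd_fops_ioctl() finalizes or aborts via ucmd->new_obj */
    foo = iommufd_object_alloc_ucmd(ucmd, foo, IOMMUFD_OBJ_FOO);
    if (IS_ERR(foo))
    	return PTR_ERR(foo);
    return iommufd_ucmd_respond(ucmd, sizeof(*cmd));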
Link: https://patch.msgid.link/r/107b24a3b791091bb09c92ffb0081c56c413b26d.1749882255.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit 3e2a9811f6a9cefd310cc33cab73d5435b4a4caa linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/eventq.c | 14 ++++---------- drivers/iommu/iommufd/viommu.c | 24 ++++++------------------ 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/iommufd/eventq.c b/drivers/iommu/iommufd/eventq.c index f39cf07973476..269b667152b78 100644 --- a/drivers/iommu/iommufd/eventq.c +++ b/drivers/iommu/iommufd/eventq.c @@ -473,8 +473,8 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) if (cmd->flags) return -EOPNOTSUPP; - fault = __iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT, - common.obj); + fault = __iommufd_object_alloc_ucmd(ucmd, fault, IOMMUFD_OBJ_FAULT, + common.obj); if (IS_ERR(fault)) return PTR_ERR(fault); @@ -483,10 +483,8 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) fdno = iommufd_eventq_init(&fault->common, "[iommufd-pgfault]", ucmd->ictx, &iommufd_fault_fops); - if (fdno < 0) { - rc = fdno; - goto out_abort; - } + if (fdno < 0) + return fdno; cmd->out_fault_id = fault->common.obj.id; cmd->out_fault_fd = fdno; @@ -494,7 +492,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_put_fdno; - iommufd_object_finalize(ucmd->ictx, &fault->common.obj); fd_install(fdno, fault->common.filep); @@ -502,9 +499,6 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) out_put_fdno: put_unused_fd(fdno); fput(fault->common.filep); -out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &fault->common.obj); - return rc; } diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 044e3ef06e0f4..25ac08fbb52a7 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -63,8 +63,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_hwpt; } - viommu = (struct iommufd_viommu *)_iommufd_object_alloc( - ucmd->ictx, viommu_size, IOMMUFD_OBJ_VIOMMU); + viommu = (struct iommufd_viommu *)_iommufd_object_alloc_ucmd( + ucmd, viommu_size, IOMMUFD_OBJ_VIOMMU); if (IS_ERR(viommu)) { rc = PTR_ERR(viommu); goto out_put_hwpt; @@ -86,23 +86,17 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) rc = ops->viommu_init(viommu, hwpt_paging->common.domain); if (rc) - goto out_abort; + goto out_put_hwpt; /* It is a driver bug that viommu->ops isn't filled */ if (WARN_ON_ONCE(!viommu->ops)) { rc = -EOPNOTSUPP; - goto out_abort; + goto out_put_hwpt; } cmd->out_viommu_id = viommu->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); - if (rc) - goto out_abort; - iommufd_object_finalize(ucmd->ictx, &viommu->obj); - goto out_put_hwpt; -out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj); out_put_hwpt: iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); out_put_idev: @@ -150,7 +144,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_idev; } - vdev = iommufd_object_alloc(ucmd->ictx, vdev, IOMMUFD_OBJ_VDEVICE); + vdev = iommufd_object_alloc_ucmd(ucmd, vdev, IOMMUFD_OBJ_VDEVICE); if (IS_ERR(vdev)) { rc = PTR_ERR(vdev); goto out_put_idev; @@ -165,18 +159,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) curr = xa_cmpxchg(&viommu->vdevs, virt_id, NULL, vdev, GFP_KERNEL); if (curr) { 
rc = xa_err(curr) ?: -EEXIST; - goto out_abort; + goto out_put_idev; } cmd->out_vdevice_id = vdev->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); - if (rc) - goto out_abort; - iommufd_object_finalize(ucmd->ictx, &vdev->obj); - goto out_put_idev; -out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &vdev->obj); out_put_idev: iommufd_put_object(ucmd->ictx, &idev->obj); out_put_viommu: From 27cd044322e9d1ab7293f791ae365560b5f2c6b0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:53 -0700 Subject: [PATCH 085/147] iommufd: Report unmapped bytes in the error path of iopt_unmap_iova_range There are callers that read the unmapped bytes even when rc != 0. Thus, do not forget to report it in the error path too. Fixes: 8d40205f6093 ("iommufd: Add kAPI toward external drivers for kernel access") Link: https://patch.msgid.link/r/e2b61303bbc008ba1a4e2d7c2a2894749b59fdac.1752126748.git.nicolinc@nvidia.com Cc: stable@vger.kernel.org Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit b23e09f9997771b4b739c1c694fa832b5fa2de02 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/io_pagetable.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 13d010f19ed19..22fc3a12109f0 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -743,8 +743,10 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, iommufd_access_notify_unmap(iopt, area_first, length); /* Something is not responding to unmap requests. */ tries++; - if (WARN_ON(tries > 100)) - return -EDEADLOCK; + if (WARN_ON(tries > 100)) { + rc = -EDEADLOCK; + goto out_unmapped; + } goto again; } @@ -766,6 +768,7 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, out_unlock_iova: up_write(&iopt->iova_rwsem); up_read(&iopt->domains_rwsem); +out_unmapped: if (unmapped) *unmapped = unmapped_bytes; return rc; From b082173f9342de172d4765a5131c7c929d49b6c9 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:54 -0700 Subject: [PATCH 086/147] iommufd: Correct virt_id kdoc at struct iommu_vdevice_alloc The userspace-api iommufd.rst has described it correctly but the uAPI doc remained uncorrected. Thus, fix it. Link: https://patch.msgid.link/r/2cdcecaf2babee16fda7545ccad4e5bed7a5032d.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit fca02263f27eee093379844ac0fb280bf70e6aed linux-next) Signed-off-by: Nirmoy Das --- include/uapi/linux/iommufd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index e2c04e58a997d..00335df82b1fe 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -995,7 +995,7 @@ struct iommu_viommu_alloc { * @dev_id: The physical device to allocate a virtual instance on the vIOMMU * @out_vdevice_id: Object handle for the vDevice. Pass to IOMMU_DESTORY * @virt_id: Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID - * of AMD IOMMU, and vRID of a nested Intel VT-d to a Context Table + * of AMD IOMMU, and vRID of Intel VT-d * * Allocate a virtual device instance (for a physical device) against a vIOMMU. * This instance holds the device's information (related to its vIOMMU) in a VM.
From 8c55209d3ba6235da4777df6b20d404303342235 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:55 -0700 Subject: [PATCH 087/147] iommufd/viommu: Explicitly define vdev->virt_id The "id" is too general to get its meaning easily. Rename it explicitly to "virt_id" and update the kdocs for readability. No functional changes. Link: https://patch.msgid.link/r/1fac22d645e6ee555675726faf3798a68315b044.1752126748.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit c50a5de2c465232f30c731bc98f564ddb6229377 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/driver.c | 2 +- drivers/iommu/iommufd/iommufd_private.h | 7 ++++++- drivers/iommu/iommufd/viommu.c | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index 2fee399a148ef..887719016804c 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -30,7 +30,7 @@ int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, xa_lock(&viommu->vdevs); xa_for_each(&viommu->vdevs, index, vdev) { if (vdev->dev == dev) { - *vdev_id = vdev->id; + *vdev_id = vdev->virt_id; rc = 0; break; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 70013d270e603..810a25411e4cf 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -640,7 +640,12 @@ struct iommufd_vdevice { struct iommufd_object obj; struct iommufd_viommu *viommu; struct device *dev; - u64 id; /* per-vIOMMU virtual ID */ + + /* + * Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID of + * AMD IOMMU, and vRID of Intel VT-d + */ + u64 virt_id; }; #ifdef CONFIG_IOMMUFD_TEST diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 25ac08fbb52a7..bc8796e6684e3 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -111,7 +111,7 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj) struct iommufd_viommu *viommu = vdev->viommu; /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ - xa_cmpxchg(&viommu->vdevs, vdev->id, vdev, NULL, GFP_KERNEL); + xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL); refcount_dec(&viommu->obj.users); put_device(vdev->dev); } @@ -150,7 +150,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_idev; } - vdev->id = virt_id; + vdev->virt_id = virt_id; vdev->dev = idev->dev; get_device(idev->dev); vdev->viommu = viommu; From 9a8a025a04d5938c454abf5bf2c8221e067d783f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:56 -0700 Subject: [PATCH 088/147] iommu: Use enum iommu_hw_info_type for type in hw_info op Replace u32 to make it clear. No functional changes. Also simplify the kdoc since the type itself is clear enough. 
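With the typed parameter, the converted mock body looks roughly like this (reconstructed for illustration from the upstream selftest, since the hunk below only shows the signature change):

    static void *mock_domain_hw_info(struct device *dev, u32 *length,
    				     enum iommu_hw_info_type *type)
    {
    	struct iommu_test_hw_info *info;

    	info = kzalloc(sizeof(*info), GFP_KERNEL);
    	if (!info)
    		return ERR_PTR(-ENOMEM);

    	info->test_reg = IOMMU_HW_INFO_SELFTEST_REGVAL;
    	*length = sizeof(*info);
    	*type = IOMMU_HW_INFO_TYPE_SELFTEST;	/* now an enum, not a bare u32 */

    	return info;
    }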
Link: https://patch.msgid.link/r/651c50dee8ab900f691202ef0204cd5a43fdd6a2.1752126748.git.nicolinc@nvidia.com Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 4b57c057f9e6668ae442b19902dab8a73fe7b209 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 ++- drivers/iommu/intel/iommu.c | 3 ++- drivers/iommu/iommufd/selftest.c | 3 ++- include/linux/iommu.h | 6 +++--- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 9f59c95a254cc..69bbe39e28de8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -7,7 +7,8 @@ #include "arm-smmu-v3.h" -void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) +void *arm_smmu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct iommu_hw_info_arm_smmuv3 *info; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 8e624c7d1b9f3..ff2d1e227115b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -1058,7 +1058,8 @@ struct arm_vsmmu { }; #if IS_ENABLED(CONFIG_ARM_SMMU_V3_IOMMUFD) -void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type); +void *arm_smmu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type); size_t arm_smmu_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type); int arm_vsmmu_init(struct iommufd_viommu *viommu, diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3457b59e922b6..3dea3647d4e3f 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4231,7 +4231,8 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, return ret; } -static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) +static void *intel_iommu_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 9de7ae2a6342b..5abbd8d9b15cd 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -268,7 +268,8 @@ static struct iommu_domain mock_blocking_domain = { .ops = &mock_blocking_ops, }; -static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) +static void *mock_domain_hw_info(struct device *dev, u32 *length, + enum iommu_hw_info_type *type) { struct iommu_test_hw_info *info; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7928cbe56ef64..8a39cee7c62c4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -568,8 +568,7 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, * @capable: check capability * @hw_info: report iommu hardware information. The data buffer returned by this * op is allocated in the iommu driver and freed by the caller after - * use. The information type is one of enum iommu_hw_info_type defined - * in include/uapi/linux/iommufd.h. + * use. 
* @domain_alloc: Do not use in new drivers * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to * use identity_domain instead. This should only be used @@ -630,7 +629,8 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, */ struct iommu_ops { bool (*capable)(struct device *dev, enum iommu_cap); - void *(*hw_info)(struct device *dev, u32 *length, u32 *type); + void *(*hw_info)(struct device *dev, u32 *length, + enum iommu_hw_info_type *type); /* Domain allocation and freeing by the iommu driver */ #if IS_ENABLED(CONFIG_FSL_PAMU) From 7e5e8d9e272fba534f46361eedebb397300f4ad2 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:57 -0700 Subject: [PATCH 089/147] iommu: Add iommu_copy_struct_to_user helper Similar to the iommu_copy_struct_from_user helper receiving data from the user space, add an iommu_copy_struct_to_user helper to report output data back to the user space data pointer. Link: https://patch.msgid.link/r/fa292c2a730aadd77085ec3a8272360c96eabb9c.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 3fcf56a2393b399f289a473181ce6b19f716b59d linux-next) Signed-off-by: Nirmoy Das --- include/linux/iommu.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8a39cee7c62c4..396c7e426c2f7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -563,6 +563,46 @@ iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, return 0; } +/** + * __iommu_copy_struct_to_user - Report iommu driver specific user space data + * @dst_data: Pointer to a struct iommu_user_data for user space data location + * @src_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @data_type: The data type of the @src_data. Must match with @dst_data.type + * @data_len: Length of current user data structure, i.e. sizeof(struct _src) + * @min_len: Initial length of user data structure for backward compatibility. + * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int +__iommu_copy_struct_to_user(const struct iommu_user_data *dst_data, + void *src_data, unsigned int data_type, + size_t data_len, size_t min_len) +{ + if (WARN_ON(!dst_data || !src_data)) + return -EINVAL; + if (dst_data->type != data_type) + return -EINVAL; + if (dst_data->len < min_len || data_len < dst_data->len) + return -EINVAL; + return copy_struct_to_user(dst_data->uptr, dst_data->len, src_data, + data_len, NULL); +} + +/** + * iommu_copy_struct_to_user - Report iommu driver specific user space data + * @user_data: Pointer to a struct iommu_user_data for user space data location + * @ksrc: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @data_type: The data type of the @ksrc. Must match with @user_data->type + * @min_last: The last member of the data structure @ksrc points in the initial + * version. + * Return 0 for success, otherwise -error. 
+ */ +#define iommu_copy_struct_to_user(user_data, ksrc, data_type, min_last) \ + __iommu_copy_struct_to_user(user_data, ksrc, data_type, sizeof(*ksrc), \ + offsetofend(typeof(*ksrc), min_last)) + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability From 9db0eedb32fcdc4cd8de563f0c376e7fc32fefa2 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:58 -0700 Subject: [PATCH 090/147] iommu: Pass in a driver-level user data structure to viommu_init op The new type of vIOMMU for tegra241-cmdqv allows user space VM to use one of its virtual command queue HW resources exclusively. This requires user space to mmap the corresponding MMIO page from kernel space for direct HW control. To forward the mmap info (offset and length), iommufd should add a driver specific data structure to the IOMMUFD_CMD_VIOMMU_ALLOC ioctl, for driver to output the info during the vIOMMU initialization back to user space. Similar to the existing ioctls and their IOMMU handlers, add a user_data to viommu_init op to bridge between iommufd and drivers. Link: https://patch.msgid.link/r/90bd5637dab7f5507c7a64d2c4826e70431e45a4.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit c3436d42f812faffac94f8fb3fb246ab43ffdffe linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 ++- drivers/iommu/iommufd/selftest.c | 3 ++- drivers/iommu/iommufd/viommu.c | 2 +- include/linux/iommu.h | 3 ++- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 69bbe39e28de8..170d691628487 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -419,7 +419,8 @@ size_t arm_smmu_get_viommu_size(struct device *dev, } int arm_vsmmu_init(struct iommufd_viommu *viommu, - struct iommu_domain *parent_domain) + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data) { struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); struct arm_smmu_device *smmu = diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index ff2d1e227115b..f0b0aae9b2fb9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -1063,7 +1063,8 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, size_t arm_smmu_get_viommu_size(struct device *dev, enum iommu_viommu_type viommu_type); int arm_vsmmu_init(struct iommufd_viommu *viommu, - struct iommu_domain *parent_domain); + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data); int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, struct arm_smmu_nested_domain *nested_domain); void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 5abbd8d9b15cd..17655fc95c264 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -740,7 +740,8 @@ static size_t mock_get_viommu_size(struct device *dev, } static int mock_viommu_init(struct iommufd_viommu *viommu, - struct iommu_domain *parent_domain) + struct 
iommu_domain *parent_domain, + const struct iommu_user_data *user_data) { struct mock_iommu_device *mock_iommu = container_of( viommu->iommu_dev, struct mock_iommu_device, iommu_dev); diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index bc8796e6684e3..2009a421efae2 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -84,7 +84,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) */ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); - rc = ops->viommu_init(viommu, hwpt_paging->common.domain); + rc = ops->viommu_init(viommu, hwpt_paging->common.domain, NULL); if (rc) goto out_put_hwpt; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 396c7e426c2f7..5ce812fff4ee3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -710,7 +710,8 @@ struct iommu_ops { size_t (*get_viommu_size)(struct device *dev, enum iommu_viommu_type viommu_type); int (*viommu_init)(struct iommufd_viommu *viommu, - struct iommu_domain *parent_domain); + struct iommu_domain *parent_domain, + const struct iommu_user_data *user_data); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; From 605fddd1f2ed0585d47b6712f263c36085e94375 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:58:59 -0700 Subject: [PATCH 091/147] iommufd/viommu: Allow driver-specific user data for a vIOMMU object The new type of vIOMMU for tegra241-cmdqv driver needs a driver-specific user data. So, add data_len/uptr to the iommu_viommu_alloc uAPI and pass it in via the viommu_init iommu op. Link: https://patch.msgid.link/r/2315b0e164b355746387e960745ac9154caec124.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Acked-by: Pranjal Shrivastava Acked-by: Alok Tiwari Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 1976cdf61ce9b6f97b5212676a3b9f74c68f6073 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/viommu.c | 8 +++++++- include/uapi/linux/iommufd.h | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 2009a421efae2..c0365849f8491 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -17,6 +17,11 @@ void iommufd_viommu_destroy(struct iommufd_object *obj) int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) { struct iommu_viommu_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_viommu *viommu; struct iommufd_device *idev; @@ -84,7 +89,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) */ viommu->iommu_dev = __iommu_get_iommu_dev(idev->dev); - rc = ops->viommu_init(viommu, hwpt_paging->common.domain, NULL); + rc = ops->viommu_init(viommu, hwpt_paging->common.domain, + user_data.len ? 
&user_data : NULL); if (rc) goto out_put_hwpt; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 00335df82b1fe..5eac0430b9dd2 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -965,6 +965,9 @@ enum iommu_viommu_type { * @dev_id: The device's physical IOMMU will be used to back the virtual IOMMU * @hwpt_id: ID of a nesting parent HWPT to associate to * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * @data_len: Length of the type specific data + * @__reserved: Must be 0 + * @data_uptr: User pointer to a driver-specific virtual IOMMU data * * Allocate a virtual IOMMU object, representing the underlying physical IOMMU's * virtualization support that is a security-isolated slice of the real IOMMU HW @@ -985,6 +988,9 @@ struct iommu_viommu_alloc { __u32 dev_id; __u32 hwpt_id; __u32 out_viommu_id; + __u32 data_len; + __u32 __reserved; + __aligned_u64 data_uptr; }; #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) From 6579bd655cc959f4d1a370c2a48c3f7261b5a90c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:00 -0700 Subject: [PATCH 092/147] iommufd/selftest: Support user_data in mock_viommu_alloc Add a simple user_data for an input-to-output loopback test. Link: https://patch.msgid.link/r/cae4632bb3d98a1efb3b77488fbf81814f2041c6.1752126748.git.nicolinc@nvidia.com Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit afeaf592c1d435b8773471880c6c349506569cac linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_test.h | 13 +++++++++++++ drivers/iommu/iommufd/selftest.c | 15 +++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 1cd7e83941298..fbf9ecb35a137 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -227,6 +227,19 @@ struct iommu_hwpt_invalidate_selftest { #define IOMMU_VIOMMU_TYPE_SELFTEST 0xdeadbeef +/** + * struct iommu_viommu_selftest - vIOMMU data for Mock driver + * (IOMMU_VIOMMU_TYPE_SELFTEST) + * @in_data: Input random data from user space + * @out_data: Output data (matching @in_data) to user space + * + * Simply set @out_data=@in_data for a loopback test + */ +struct iommu_viommu_selftest { + __u32 in_data; + __u32 out_data; +}; + /* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */ #define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef #define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 17655fc95c264..020160060e18d 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -745,6 +745,21 @@ static int mock_viommu_init(struct iommufd_viommu *viommu, { struct mock_iommu_device *mock_iommu = container_of( viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + struct iommu_viommu_selftest data; + int rc; + + if (user_data) { + rc = iommu_copy_struct_from_user( + &data, user_data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data); + if (rc) + return rc; + + data.out_data = data.in_data; + rc = iommu_copy_struct_to_user( + user_data, &data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data); + if (rc) + return rc; + } refcount_inc(&mock_iommu->users); viommu->ops = &mock_viommu_ops; From c6527961665f43ebe6571246ff182bab61d99c98 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 
Jul 2025 22:59:01 -0700 Subject: [PATCH 093/147] iommufd/selftest: Add coverage for viommu data Extend the existing test_cmd/err_viommu_alloc helpers to accept optional user data. And add a TEST_F for a loopback test. Link: https://patch.msgid.link/r/8ceb64d30e9953f29270a7d341032ca439317271.1752126748.git.nicolinc@nvidia.com Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 0e3e0b0c08e388cd9e05bb4d17534bd36bedc9fe linux-next) Signed-off-by: Nirmoy Das --- tools/testing/selftests/iommu/iommufd.c | 31 ++++++++++++++----- .../selftests/iommu/iommufd_fail_nth.c | 5 +-- tools/testing/selftests/iommu/iommufd_utils.h | 21 ++++++++----- 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 1a8e85afe9aa5..a9dfcce5e1b22 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2688,7 +2688,7 @@ FIXTURE_SETUP(iommufd_viommu) /* Allocate a vIOMMU taking refcount of the parent hwpt */ test_cmd_viommu_alloc(self->device_id, self->hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, &self->viommu_id); /* Allocate a regular nested hwpt */ @@ -2727,24 +2727,27 @@ TEST_F(iommufd_viommu, viommu_negative_tests) if (self->device_id) { /* Negative test -- invalid hwpt (hwpt_id=0) */ test_err_viommu_alloc(ENOENT, device_id, 0, - IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, + NULL); /* Negative test -- not a nesting parent hwpt */ test_cmd_hwpt_alloc(device_id, ioas_id, 0, &hwpt_id); test_err_viommu_alloc(EINVAL, device_id, hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, + NULL); test_ioctl_destroy(hwpt_id); /* Negative test -- unsupported viommu type */ test_err_viommu_alloc(EOPNOTSUPP, device_id, self->hwpt_id, - 0xdead, NULL); + 0xdead, NULL, 0, NULL); EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, self->hwpt_id)); EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, self->viommu_id)); } else { test_err_viommu_alloc(ENOENT, self->device_id, self->hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, NULL); + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, + NULL); } } @@ -2791,6 +2794,21 @@ TEST_F(iommufd_viommu, viommu_alloc_nested_iopf) } } +TEST_F(iommufd_viommu, viommu_alloc_with_data) +{ + struct iommu_viommu_selftest data = { + .in_data = 0xbeef, + }; + + if (!self->device_id) + SKIP(return, "Skipping test for variant no_viommu"); + + test_cmd_viommu_alloc(self->device_id, self->hwpt_id, + IOMMU_VIOMMU_TYPE_SELFTEST, &data, sizeof(data), + &self->viommu_id); + ASSERT_EQ(data.out_data, data.in_data); +} + TEST_F(iommufd_viommu, vdevice_alloc) { uint32_t viommu_id = self->viommu_id; @@ -3105,8 +3123,7 @@ TEST_F(iommufd_device_pasid, pasid_attach) /* Allocate a regular nested hwpt based on viommu */ test_cmd_viommu_alloc(self->device_id, parent_hwpt_id, - IOMMU_VIOMMU_TYPE_SELFTEST, - &viommu_id); + IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0, &viommu_id); test_cmd_hwpt_alloc_nested(self->device_id, viommu_id, IOMMU_HWPT_ALLOC_PASID, &nested_hwpt_id[2], diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index e11ec4b121fc3..f7ccf18221089 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -688,8 +688,9 @@ TEST_FAIL_NTH(basic_fail_nth, device) IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; - if 
(_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id,
-				   IOMMU_VIOMMU_TYPE_SELFTEST, 0, &viommu_id))
+	if (_test_cmd_viommu_alloc(self->fd, idev_id, hwpt_id, 0,
+				   IOMMU_VIOMMU_TYPE_SELFTEST, NULL, 0,
+				   &viommu_id))
 		return -1;
 
 	if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id))
diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h
index 72f6636e5d909..a5d4cbd089bae 100644
--- a/tools/testing/selftests/iommu/iommufd_utils.h
+++ b/tools/testing/selftests/iommu/iommufd_utils.h
@@ -897,7 +897,8 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 pasid,
 					  pasid, fault_fd))
 
 static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id,
-				  __u32 type, __u32 flags, __u32 *viommu_id)
+				  __u32 flags, __u32 type, void *data,
+				  __u32 data_len, __u32 *viommu_id)
 {
 	struct iommu_viommu_alloc cmd = {
 		.size = sizeof(cmd),
@@ -905,6 +906,8 @@ static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id,
 		.type = type,
 		.dev_id = device_id,
 		.hwpt_id = hwpt_id,
+		.data_uptr = (uint64_t)data,
+		.data_len = data_len,
 	};
 	int ret;
 
@@ -916,13 +919,15 @@ static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id,
 	return 0;
 }
 
-#define test_cmd_viommu_alloc(device_id, hwpt_id, type, viommu_id)           \
-	ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id,    \
-					    type, 0, viommu_id))
-#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, viommu_id)   \
-	EXPECT_ERRNO(_errno,                                                  \
-		     _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id,    \
-					    type, 0, viommu_id))
+#define test_cmd_viommu_alloc(device_id, hwpt_id, type, data, data_len,      \
+			      viommu_id)                                      \
+	ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, 0, \
+					    type, data, data_len, viommu_id))
+#define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, data,        \
+			      data_len, viommu_id)                            \
+	EXPECT_ERRNO(_errno,                                                  \
+		     _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, 0, \
+					    type, data, data_len, viommu_id))
 
 static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id,
 				   __u64 virt_id, __u32 *vdev_id)

From 10572ba490f7588f2b087b38024aefc2844502a9 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 9 Jul 2025 22:59:02 -0700
Subject: [PATCH 094/147] iommufd/access: Add internal APIs for HW queue to use

The new HW queue object, as an internal iommufd object, wants to reuse
the struct iommufd_access to pin some iova range in the iopt.

However, an access generally takes the refcount of an ictx. So, in such
an internal case, a deadlock could happen when the release of the ictx
has to wait for the release of the access first when releasing a hw_queue
object, which could wait for the release of the ictx that is refcounted:

         ictx --releases--> hw_queue --releases--> access
           ^                                          |
           |_________________releases________________v

To address this, add a set of lightweight internal APIs to unlink the
ictx and the access, i.e. no ictx refcounting by the access:

         ictx --releases--> hw_queue --releases--> access

Then, there's no point in setting the access->ictx. So simply define
!ictx as a flag for internal use and add an inline helper.
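
As a rough sketch (not part of the diff below) of how an internal
consumer such as the upcoming HW queue object is expected to use these
APIs; the "my_pin_queue" helper and its arguments are illustrative only,
with the ictx and ioas assumed to be held by the caller:

	static int my_pin_queue(struct iommufd_ctx *ictx,
				struct iommufd_ioas *ioas)
	{
		struct iommufd_access *access;
		int rc;

		/* No ictx refcount is taken; access->ictx stays NULL, so
		 * iommufd_access_is_internal(access) returns true */
		access = iommufd_access_create_internal(ictx);
		if (IS_ERR(access))
			return PTR_ERR(access);

		rc = iommufd_access_attach_internal(access, ioas);
		if (rc) {
			iommufd_access_destroy_internal(ictx, access);
			return rc;
		}

		/* ... iommufd_access_pin_pages() on the wanted range,
		 * then later unpin/detach/destroy in reverse order ... */
		return 0;
	}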
Link: https://patch.msgid.link/r/d8d84bf99cbebec56034b57b966a3d431385b90d.1752126748.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 1c26c3bbdee11f3fad0c74b8f09aef488dcf4b62 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 59 +++++++++++++++++++++---- drivers/iommu/iommufd/iommufd_private.h | 23 ++++++++++ 2 files changed, 73 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 50c0c41d9229b..2c457046f1807 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1085,7 +1085,39 @@ void iommufd_access_destroy_object(struct iommufd_object *obj) if (access->ioas) WARN_ON(iommufd_access_change_ioas(access, NULL)); mutex_unlock(&access->ioas_lock); - iommufd_ctx_put(access->ictx); + if (!iommufd_access_is_internal(access)) + iommufd_ctx_put(access->ictx); +} + +static struct iommufd_access *__iommufd_access_create(struct iommufd_ctx *ictx) +{ + struct iommufd_access *access; + + /* + * There is no uAPI for the access object, but to keep things symmetric + * use the object infrastructure anyhow. + */ + access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + if (IS_ERR(access)) + return access; + + /* The calling driver is a user until iommufd_access_destroy() */ + refcount_inc(&access->obj.users); + mutex_init(&access->ioas_lock); + return access; +} + +struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx) +{ + struct iommufd_access *access; + + access = __iommufd_access_create(ictx); + if (IS_ERR(access)) + return access; + access->iova_alignment = PAGE_SIZE; + + iommufd_object_finalize(ictx, &access->obj); + return access; } /** @@ -1107,11 +1139,7 @@ iommufd_access_create(struct iommufd_ctx *ictx, { struct iommufd_access *access; - /* - * There is no uAPI for the access object, but to keep things symmetric - * use the object infrastructure anyhow. 
- */ - access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + access = __iommufd_access_create(ictx); if (IS_ERR(access)) return access; @@ -1123,13 +1151,10 @@ iommufd_access_create(struct iommufd_ctx *ictx, else access->iova_alignment = 1; - /* The calling driver is a user until iommufd_access_destroy() */ - refcount_inc(&access->obj.users); access->ictx = ictx; iommufd_ctx_get(ictx); iommufd_object_finalize(ictx, &access->obj); *id = access->obj.id; - mutex_init(&access->ioas_lock); return access; } EXPORT_SYMBOL_NS_GPL(iommufd_access_create, "IOMMUFD"); @@ -1174,6 +1199,22 @@ int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) } EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, "IOMMUFD"); +int iommufd_access_attach_internal(struct iommufd_access *access, + struct iommufd_ioas *ioas) +{ + int rc; + + mutex_lock(&access->ioas_lock); + if (WARN_ON(access->ioas)) { + mutex_unlock(&access->ioas_lock); + return -EINVAL; + } + + rc = iommufd_access_change_ioas(access, ioas); + mutex_unlock(&access->ioas_lock); + return rc; +} + int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id) { int rc; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 810a25411e4cf..f30a535015de8 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -487,6 +487,29 @@ void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); +/* iommufd_access for internal use */ +static inline bool iommufd_access_is_internal(struct iommufd_access *access) +{ + return !access->ictx; +} + +struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx); + +static inline void +iommufd_access_destroy_internal(struct iommufd_ctx *ictx, + struct iommufd_access *access) +{ + iommufd_object_destroy_user(ictx, &access->obj); +} + +int iommufd_access_attach_internal(struct iommufd_access *access, + struct iommufd_ioas *ioas); + +static inline void iommufd_access_detach_internal(struct iommufd_access *access) +{ + iommufd_access_detach(access); +} + struct iommufd_eventq { struct iommufd_object obj; struct iommufd_ctx *ictx; From 550924e0e04604f8b186a12ecffde6c40ad2fc5d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:03 -0700 Subject: [PATCH 095/147] iommufd/access: Bypass access->ops->unmap for internal use The access object has been used externally by VFIO mdev devices, allowing them to pin/unpin physical pages (via needs_pin_pages). Meanwhile, a racy unmap can occur in this case, so these devices usually implement an unmap handler, invoked by iommufd_access_notify_unmap(). The new HW queue object will need the same pin/unpin feature, although it (unlike the mdev case) wants to reject any unmap attempt, during its life cycle. Instead, it would not implement an unmap handler. Thus, bypass any access->ops->unmap access call when the access is marked as internal. Also, an area being pinned by an internal access should reject any unmap request. This cannot be done inside iommufd_access_notify_unmap() as it's a per-iopt action. Add a "num_locks" counter in the struct iopt_area, set that in iopt_area_add_access() when the caller is an internal access. 
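
A minimal sketch of the intended pairing; the calls are the existing ones
from io_pagetable.h, only the new boolean arguments are added here:

	/* An internal access passes lock_area=true when pinning ... */
	rc = iopt_area_add_access(area, start_index, last_index, out_pages,
				  flags, true /* lock_area */);

	/* ... and must pass the matching unlock_area=true when unpinning */
	iopt_area_remove_access(area, start_index, last_index,
				true /* unlock_area */);

While area->num_locks is non-zero, iopt_unmap_iova_range() rejects the
unmap request with -EBUSY.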
Link: https://patch.msgid.link/r/6df9a43febf79c0379091ec59747276ce9d2493b.1752126748.git.nicolinc@nvidia.com Suggested-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 27b77ea5feaa8fcf385ea99ce757982b0ac9d1f0 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 18 ++++++++++++------ drivers/iommu/iommufd/io_pagetable.c | 6 ++++++ drivers/iommu/iommufd/io_pagetable.h | 5 +++-- drivers/iommu/iommufd/pages.c | 14 ++++++++++++-- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 2c457046f1807..4589bbfcc4acb 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1049,7 +1049,7 @@ static int iommufd_access_change_ioas(struct iommufd_access *access, } if (cur_ioas) { - if (access->ops->unmap) { + if (!iommufd_access_is_internal(access) && access->ops->unmap) { mutex_unlock(&access->ioas_lock); access->ops->unmap(access->data, 0, ULONG_MAX); mutex_lock(&access->ioas_lock); @@ -1256,7 +1256,8 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, xa_lock(&ioas->iopt.access_list); xa_for_each(&ioas->iopt.access_list, index, access) { - if (!iommufd_lock_obj(&access->obj)) + if (!iommufd_lock_obj(&access->obj) || + iommufd_access_is_internal(access)) continue; xa_unlock(&ioas->iopt.access_list); @@ -1280,6 +1281,7 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, void iommufd_access_unpin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length) { + bool internal = iommufd_access_is_internal(access); struct iopt_area_contig_iter iter; struct io_pagetable *iopt; unsigned long last_iova; @@ -1306,7 +1308,8 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, area, iopt_area_iova_to_index(area, iter.cur_iova), iopt_area_iova_to_index( area, - min(last_iova, iopt_area_last_iova(area)))); + min(last_iova, iopt_area_last_iova(area))), + internal); WARN_ON(!iopt_area_contig_done(&iter)); up_read(&iopt->iova_rwsem); mutex_unlock(&access->ioas_lock); @@ -1355,6 +1358,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length, struct page **out_pages, unsigned int flags) { + bool internal = iommufd_access_is_internal(access); struct iopt_area_contig_iter iter; struct io_pagetable *iopt; unsigned long last_iova; @@ -1363,7 +1367,8 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, /* Driver's ops don't support pin_pages */ if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && - WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) + WARN_ON(access->iova_alignment != PAGE_SIZE || + (!internal && !access->ops->unmap))) return -EINVAL; if (!length) @@ -1397,7 +1402,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, } rc = iopt_area_add_access(area, index, last_index, out_pages, - flags); + flags, internal); if (rc) goto err_remove; out_pages += last_index - index + 1; @@ -1420,7 +1425,8 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, iopt_area_iova_to_index(area, iter.cur_iova), iopt_area_iova_to_index( area, min(last_iova, - iopt_area_last_iova(area)))); + iopt_area_last_iova(area))), + internal); } up_read(&iopt->iova_rwsem); mutex_unlock(&access->ioas_lock); diff --git 
a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 22fc3a12109f0..abf4aadca96c0 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -719,6 +719,12 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, goto out_unlock_iova; } + /* The area is locked by an object that has not been destroyed */ + if (area->num_locks) { + rc = -EBUSY; + goto out_unlock_iova; + } + if (area_first < start || area_last > last) { rc = -ENOENT; goto out_unlock_iova; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index c115a51d93846..b6064f4ce4af9 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -48,6 +48,7 @@ struct iopt_area { int iommu_prot; bool prevent_access : 1; unsigned int num_accesses; + unsigned int num_locks; }; struct iopt_allowed { @@ -238,9 +239,9 @@ void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, int iopt_area_add_access(struct iopt_area *area, unsigned long start, unsigned long last, struct page **out_pages, - unsigned int flags); + unsigned int flags, bool lock_area); void iopt_area_remove_access(struct iopt_area *area, unsigned long start, - unsigned long last); + unsigned long last, bool unlock_area); int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, void *data, unsigned long length, unsigned int flags); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index d6680483fdd71..939be83e3b3f7 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -2176,6 +2176,7 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, * @last_index: Inclusive last page index * @out_pages: Output list of struct page's representing the PFNs * @flags: IOMMUFD_ACCESS_RW_* flags + * @lock_area: Fail userspace munmap on this area * * Record that an in-kernel access will be accessing the pages, ensure they are * pinned, and return the PFNs as a simple list of 'struct page *'. @@ -2184,7 +2185,7 @@ iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, */ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, unsigned long last_index, struct page **out_pages, - unsigned int flags) + unsigned int flags, bool lock_area) { struct iopt_pages *pages = area->pages; struct iopt_pages_access *access; @@ -2197,6 +2198,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, access = iopt_pages_get_exact_access(pages, start_index, last_index); if (access) { area->num_accesses++; + if (lock_area) + area->num_locks++; access->users++; iopt_pages_fill_from_xarray(pages, start_index, last_index, out_pages); @@ -2218,6 +2221,8 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, access->node.last = last_index; access->users = 1; area->num_accesses++; + if (lock_area) + area->num_locks++; interval_tree_insert(&access->node, &pages->access_itree); mutex_unlock(&pages->mutex); return 0; @@ -2234,12 +2239,13 @@ int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, * @area: The source of PFNs * @start_index: First page index * @last_index: Inclusive last page index + * @unlock_area: Must match the matching iopt_area_add_access()'s lock_area * * Undo iopt_area_add_access() and unpin the pages if necessary. The caller * must stop using the PFNs before calling this. 
*/
 void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index,
-			     unsigned long last_index)
+			     unsigned long last_index, bool unlock_area)
 {
 	struct iopt_pages *pages = area->pages;
 	struct iopt_pages_access *access;
@@ -2250,6 +2256,10 @@ void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index,
 		goto out_unlock;
 
 	WARN_ON(area->num_accesses == 0 || access->users == 0);
+	if (unlock_area) {
+		WARN_ON(area->num_locks == 0);
+		area->num_locks--;
+	}
 	area->num_accesses--;
 	access->users--;
 	if (access->users)

From 01ec3964706fd84337e82e21ac30e557ddb3707b Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 9 Jul 2025 22:59:04 -0700
Subject: [PATCH 096/147] iommufd/viommu: Add driver-defined vDEVICE support

NVIDIA VCMDQ driver will have a driver-defined vDEVICE structure and do
some HW configurations with that. To allow IOMMU drivers to define their
own vDEVICE structures, move the struct iommufd_vdevice to the public
header and provide a pair of viommu ops, similar to get_viommu_size and
viommu_init.

Doing this, however, creates a new window between the vDEVICE allocation
and its driver-level initialization, during which an abort could happen
but it can't invoke a driver destroy function from the struct viommu_ops
since the driver structure isn't initialized yet. The vIOMMU object
doesn't have this problem, since its destroy op is set via the viommu_ops
by the driver viommu_init function. Thus, vDEVICE should do something
similar: add a destroy function pointer inside the struct iommufd_vdevice
instead of the struct iommufd_viommu_ops.

Note that there is unlikely to be a use case for a type-dependent
vDEVICE, so a static vdevice_size is probably enough for the near term
instead of a get_vdevice_size function op.

Link: https://patch.msgid.link/r/1e751c01da7863c669314d8e27fdb89eabcf5605.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Jason Gunthorpe
Reviewed-by: Kevin Tian
Reviewed-by: Pranjal Shrivastava
Reviewed-by: Lu Baolu
Reviewed-by: Vasant Hegde
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
(cherry picked from commit ed42eee797ff3dc889ade63c1dd7c4f430699e23 linux-next)
Signed-off-by: Nirmoy Das
---
 drivers/iommu/iommufd/iommufd_private.h | 12 ----------
 drivers/iommu/iommufd/viommu.c          | 26 ++++++++++++++++++++-
 include/linux/iommufd.h                 | 31 +++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f30a535015de8..00dcf7b915438 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -659,18 +659,6 @@ void iommufd_viommu_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_vdevice_destroy(struct iommufd_object *obj);
 
-struct iommufd_vdevice {
-	struct iommufd_object obj;
-	struct iommufd_viommu *viommu;
-	struct device *dev;
-
-	/*
-	 * Virtual device ID per vIOMMU, e.g.
vSID of ARM SMMUv3, vDeviceID of - * AMD IOMMU, and vRID of Intel VT-d - */ - u64 virt_id; -}; - #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index c0365849f8491..081ee6697a11b 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -116,6 +116,8 @@ void iommufd_vdevice_destroy(struct iommufd_object *obj) container_of(obj, struct iommufd_vdevice, obj); struct iommufd_viommu *viommu = vdev->viommu; + if (vdev->destroy) + vdev->destroy(vdev); /* xa_cmpxchg is okay to fail if alloc failed xa_cmpxchg previously */ xa_cmpxchg(&viommu->vdevs, vdev->virt_id, vdev, NULL, GFP_KERNEL); refcount_dec(&viommu->obj.users); @@ -126,6 +128,7 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) { struct iommu_vdevice_alloc *cmd = ucmd->cmd; struct iommufd_vdevice *vdev, *curr; + size_t vdev_size = sizeof(*vdev); struct iommufd_viommu *viommu; struct iommufd_device *idev; u64 virt_id = cmd->virt_id; @@ -150,7 +153,22 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_idev; } - vdev = iommufd_object_alloc_ucmd(ucmd, vdev, IOMMUFD_OBJ_VDEVICE); + if (viommu->ops && viommu->ops->vdevice_size) { + /* + * It is a driver bug for: + * - ops->vdevice_size smaller than the core structure size + * - not implementing a pairing ops->vdevice_init op + */ + if (WARN_ON_ONCE(viommu->ops->vdevice_size < vdev_size || + !viommu->ops->vdevice_init)) { + rc = -EOPNOTSUPP; + goto out_put_idev; + } + vdev_size = viommu->ops->vdevice_size; + } + + vdev = (struct iommufd_vdevice *)_iommufd_object_alloc_ucmd( + ucmd, vdev_size, IOMMUFD_OBJ_VDEVICE); if (IS_ERR(vdev)) { rc = PTR_ERR(vdev); goto out_put_idev; @@ -168,6 +186,12 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) goto out_put_idev; } + if (viommu->ops && viommu->ops->vdevice_init) { + rc = viommu->ops->vdevice_init(vdev); + if (rc) + goto out_put_idev; + } + cmd->out_vdevice_id = vdev->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 2d1bf2f97ee31..bdd10a85eeefa 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -104,6 +104,21 @@ struct iommufd_viommu { enum iommu_viommu_type type; }; +struct iommufd_vdevice { + struct iommufd_object obj; + struct iommufd_viommu *viommu; + struct device *dev; + + /* + * Virtual device ID per vIOMMU, e.g. vSID of ARM SMMUv3, vDeviceID of + * AMD IOMMU, and vRID of Intel VT-d + */ + u64 virt_id; + + /* Clean up all driver-specific parts of an iommufd_vdevice */ + void (*destroy)(struct iommufd_vdevice *vdev); +}; + /** * struct iommufd_viommu_ops - vIOMMU specific operations * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory @@ -120,6 +135,14 @@ struct iommufd_viommu { * array->entry_num to report the number of handled requests. * The data structure of the array entry must be defined in * include/uapi/linux/iommufd.h + * @vdevice_size: Size of the driver-defined vDEVICE structure per this vIOMMU + * @vdevice_init: Initialize the driver-level structure of a vDEVICE object, or + * related HW procedure. 
@vdev is already initialized by iommufd
+ *                core: vdev->dev and vdev->viommu pointers; vdev->id carries a
+ *                per-vIOMMU virtual ID (refer to struct iommu_vdevice_alloc in
+ *                include/uapi/linux/iommufd.h)
+ *                If driver has a deinit function to revert what vdevice_init op
+ *                does, it should set it to the @vdev->destroy function pointer
  */
 struct iommufd_viommu_ops {
 	void (*destroy)(struct iommufd_viommu *viommu);
@@ -128,6 +151,8 @@ struct iommufd_viommu_ops {
 			const struct iommu_user_data *user_data);
 	int (*cache_invalidate)(struct iommufd_viommu *viommu,
 				struct iommu_user_data_array *array);
+	const size_t vdevice_size;
+	int (*vdevice_init)(struct iommufd_vdevice *vdev);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
@@ -224,4 +249,10 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
 	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +              \
 	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_viommu,          \
 					((drv_struct *)NULL)->member)))
+
+#define VDEVICE_STRUCT_SIZE(drv_struct, member)                         \
+	(sizeof(drv_struct) +                                           \
+	 BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) +              \
+	 BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_vdevice,         \
+					((drv_struct *)NULL)->member)))
 #endif

From 8985ab6281ebabc9e7a27783f216e4a90840c27f Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 9 Jul 2025 22:59:05 -0700
Subject: [PATCH 097/147] iommufd/viommu: Introduce IOMMUFD_OBJ_HW_QUEUE and its related struct

Add IOMMUFD_OBJ_HW_QUEUE with an iommufd_hw_queue structure, representing
a HW-accelerated queue type of IOMMU's physical queue that can be passed
through to a user space VM for direct hardware control, such as:
 - NVIDIA's Virtual Command Queue
 - AMD vIOMMU's Command Buffer, Event Log Buffers, and PPR Log Buffers

Add new viommu ops for iommufd to communicate with IOMMU drivers to fetch
supported HW queue structure size and to forward user space ioctls to the
IOMMU drivers for initialization/destroy.

As for the existing HWs, NVIDIA's VCMDQs access the guest memory via
physical addresses, while AMD's Buffers access the guest memory via guest
physical addresses (i.e. iova of the nesting parent HWPT). Separate two
mutually exclusive hw_queue_init and hw_queue_init_phys ops to indicate
whether a vIOMMU HW accesses the guest queue in the guest physical space
(via iova) or the host physical space (via pa). In the latter case, the
iommufd core will validate the physical pages of a given guest queue, to
ensure the underlying physical pages are contiguous and pinned.

Since this is introduced with NVIDIA's VCMDQs, add hw_queue_init_phys for
now, and leave some notes for hw_queue_init in the near future (for AMD).

Both NVIDIA's and AMD's HW follow a multi-queue model: NVIDIA's will be
only one type in enum iommu_hw_queue_type, while AMD's will be three
different types (two of which will have multi queues). Compared to
letting the core manage multiple queues with three types per vIOMMU
object, it'd be easier for the driver to manage that by having three
different driver-structure arrays per vIOMMU object. Thus, pass in the
index to the init op.
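
As a rough driver-side sketch of the two new ops (the "my_" names and
MY_HW_QUEUE_TYPE are hypothetical, not from this series), a driver opting
into the physical-address model would wire them up like this:

	struct my_hw_queue {
		struct iommufd_hw_queue core;	/* must be the first member */
		u32 hw_index;			/* driver-specific state */
	};

	static size_t my_get_hw_queue_size(struct iommufd_viommu *viommu,
					   enum iommu_hw_queue_type queue_type)
	{
		/* Return 0 for any type this vIOMMU does not support */
		if (queue_type != MY_HW_QUEUE_TYPE)
			return 0;
		return HW_QUEUE_STRUCT_SIZE(struct my_hw_queue, core);
	}

	static int my_hw_queue_init_phys(struct iommufd_hw_queue *hw_queue,
					 u32 index, phys_addr_t base_addr_pa)
	{
		struct my_hw_queue *my =
			container_of(hw_queue, struct my_hw_queue, core);

		/* Program the HW with base_addr_pa and hw_queue->length */
		my->hw_index = index;
		return 0;
	}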
Link: https://patch.msgid.link/r/6939b73699e278e60ce167e911b3d9be68882bad.1752126748.git.nicolinc@nvidia.com Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit e2e9360022585c21dc30d2b19f5866c252f40806 linux-next) Signed-off-by: Nirmoy Das --- include/linux/iommufd.h | 42 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/iommufd.h | 9 ++++++++ 2 files changed, 51 insertions(+) diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index bdd10a85eeefa..f13f3ca6adb5e 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -37,6 +37,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_VIOMMU, IOMMUFD_OBJ_VDEVICE, IOMMUFD_OBJ_VEVENTQ, + IOMMUFD_OBJ_HW_QUEUE, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -119,6 +120,19 @@ struct iommufd_vdevice { void (*destroy)(struct iommufd_vdevice *vdev); }; +struct iommufd_hw_queue { + struct iommufd_object obj; + struct iommufd_viommu *viommu; + + u64 base_addr; /* in guest physical address space */ + size_t length; + + enum iommu_hw_queue_type type; + + /* Clean up all driver-specific parts of an iommufd_hw_queue */ + void (*destroy)(struct iommufd_hw_queue *hw_queue); +}; + /** * struct iommufd_viommu_ops - vIOMMU specific operations * @destroy: Clean up all driver-specific parts of an iommufd_viommu. The memory @@ -143,6 +157,22 @@ struct iommufd_vdevice { * include/uapi/linux/iommufd.h) * If driver has a deinit function to revert what vdevice_init op * does, it should set it to the @vdev->destroy function pointer + * @get_hw_queue_size: Get the size of a driver-defined HW queue structure for a + * given @viommu corresponding to @queue_type. Driver should + * return 0 if HW queue aren't supported accordingly. It is + * required for driver to use the HW_QUEUE_STRUCT_SIZE macro + * to sanitize the driver-level HW queue structure related + * to the core one + * @hw_queue_init_phys: Initialize the driver-level structure of a HW queue that + * is initialized with its core-level structure that holds + * all the info about a guest queue memory. + * Driver providing this op indicates that HW accesses the + * guest queue memory via physical addresses. + * @index carries the logical HW QUEUE ID per vIOMMU in a + * guest VM, for a multi-queue model. 
@base_addr_pa carries + * the physical location of the guest queue + * If driver has a deinit function to revert what this op + * does, it should set it to the @hw_queue->destroy pointer */ struct iommufd_viommu_ops { void (*destroy)(struct iommufd_viommu *viommu); @@ -153,6 +183,11 @@ struct iommufd_viommu_ops { struct iommu_user_data_array *array); const size_t vdevice_size; int (*vdevice_init)(struct iommufd_vdevice *vdev); + size_t (*get_hw_queue_size)(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type); + /* AMD's HW will add hw_queue_init simply using @hw_queue->base_addr */ + int (*hw_queue_init_phys)(struct iommufd_hw_queue *hw_queue, u32 index, + phys_addr_t base_addr_pa); }; #if IS_ENABLED(CONFIG_IOMMUFD) @@ -255,4 +290,11 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_vdevice, \ ((drv_struct *)NULL)->member))) + +#define HW_QUEUE_STRUCT_SIZE(drv_struct, member) \ + (sizeof(drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(drv_struct, member)) + \ + BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_hw_queue, \ + ((drv_struct *)NULL)->member))) + #endif diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 5eac0430b9dd2..6eb4bf855f782 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1140,4 +1140,13 @@ struct iommu_veventq_alloc { __u32 __reserved; }; #define IOMMU_VEVENTQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VEVENTQ_ALLOC) + +/** + * enum iommu_hw_queue_type - HW Queue Type + * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use + */ +enum iommu_hw_queue_type { + IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, +}; + #endif From 44f52d85c538ff21a4a05ef9487c5ae6a7dd6800 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:06 -0700 Subject: [PATCH 098/147] iommufd/viommu: Add IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl Introduce a new IOMMUFD_CMD_HW_QUEUE_ALLOC ioctl for user space to allocate a HW QUEUE object for a vIOMMU specific HW-accelerated queue, e.g.: - NVIDIA's Virtual Command Queue - AMD vIOMMU's Command Buffer, Event Log Buffers, and PPR Log Buffers Since this is introduced with NVIDIA's VCMDQs that access the guest memory in the physical address space, add an iommufd_hw_queue_alloc_phys() helper that will create an access object to the queue memory in the IOAS, to avoid the mappings of the guest memory from being unmapped, during the life cycle of the HW queue object. AMD's HW will need an hw_queue_init op that is mutually exclusive with the hw_queue_init_phys op, and their case will bypass the access part, i.e. no iommufd_hw_queue_alloc_phys() call. 
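
From the user space side, a minimal allocation call would look roughly
like this (a hypothetical snippet; iommufd, viommu_id, queue_type, iova
and queue_len are assumed to be set up already, with queue_type coming
from the specific vIOMMU driver):

	struct iommu_hw_queue_alloc cmd = {
		.size = sizeof(cmd),
		.viommu_id = viommu_id,
		.type = queue_type,
		.index = 0,			/* first queue of this type */
		.nesting_parent_iova = iova,	/* guest queue base address */
		.length = queue_len,
	};
	uint32_t hw_queue_id;

	if (ioctl(iommufd, IOMMU_HW_QUEUE_ALLOC, &cmd))
		return -errno;
	hw_queue_id = cmd.out_hw_queue_id;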
Link: https://patch.msgid.link/r/dab4ace747deb46c1fe70a5c663307f46990ae56.1752126748.git.nicolinc@nvidia.com Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 2238ddc2b0560734c2dabb1c1fb4b342b5193625 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_private.h | 2 + drivers/iommu/iommufd/main.c | 6 + drivers/iommu/iommufd/viommu.c | 180 ++++++++++++++++++++++++ include/linux/iommufd.h | 1 + include/uapi/linux/iommufd.h | 34 +++++ 5 files changed, 223 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 00dcf7b915438..1c12e5ae01f9c 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -658,6 +658,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_vdevice_destroy(struct iommufd_object *obj); +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_hw_queue_destroy(struct iommufd_object *obj); #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 778694d7c2075..4e8dbbfac8900 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -354,6 +354,7 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_fault_alloc fault; struct iommu_hw_info info; + struct iommu_hw_queue_alloc hw_queue; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_invalidate cache; @@ -396,6 +397,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), + IOCTL_OP(IOMMU_HW_QUEUE_ALLOC, iommufd_hw_queue_alloc_ioctl, + struct iommu_hw_queue_alloc, length), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, @@ -559,6 +562,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, }, + [IOMMUFD_OBJ_HW_QUEUE] = { + .destroy = iommufd_hw_queue_destroy, + }, [IOMMUFD_OBJ_HWPT_PAGING] = { .destroy = iommufd_hwpt_paging_destroy, .abort = iommufd_hwpt_paging_abort, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 081ee6697a11b..91339f7999161 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -201,3 +201,183 @@ int iommufd_vdevice_alloc_ioctl(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &viommu->obj); return rc; } + +static void iommufd_hw_queue_destroy_access(struct iommufd_ctx *ictx, + struct iommufd_access *access, + u64 base_iova, size_t length) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(base_iova); + u64 offset = base_iova - aligned_iova; + + iommufd_access_unpin_pages(access, aligned_iova, + PAGE_ALIGN(length + offset)); + iommufd_access_detach_internal(access); + iommufd_access_destroy_internal(ictx, access); +} + +void iommufd_hw_queue_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_queue *hw_queue = + container_of(obj, struct iommufd_hw_queue, obj); + + if (hw_queue->destroy) + hw_queue->destroy(hw_queue); + if (hw_queue->access) + 
iommufd_hw_queue_destroy_access(hw_queue->viommu->ictx, + hw_queue->access, + hw_queue->base_addr, + hw_queue->length); + if (hw_queue->viommu) + refcount_dec(&hw_queue->viommu->obj.users); +} + +/* + * When the HW accesses the guest queue via physical addresses, the underlying + * physical pages of the guest queue must be contiguous. Also, for the security + * concern that IOMMUFD_CMD_IOAS_UNMAP could potentially remove the mappings of + * the guest queue from the nesting parent iopt while the HW is still accessing + * the guest queue memory physically, such a HW queue must require an access to + * pin the underlying pages and prevent that from happening. + */ +static struct iommufd_access * +iommufd_hw_queue_alloc_phys(struct iommu_hw_queue_alloc *cmd, + struct iommufd_viommu *viommu, phys_addr_t *base_pa) +{ + u64 aligned_iova = PAGE_ALIGN_DOWN(cmd->nesting_parent_iova); + u64 offset = cmd->nesting_parent_iova - aligned_iova; + struct iommufd_access *access; + struct page **pages; + size_t max_npages; + size_t length; + size_t i; + int rc; + + /* max_npages = DIV_ROUND_UP(offset + cmd->length, PAGE_SIZE) */ + if (check_add_overflow(offset, cmd->length, &length)) + return ERR_PTR(-ERANGE); + if (check_add_overflow(length, PAGE_SIZE - 1, &length)) + return ERR_PTR(-ERANGE); + max_npages = length / PAGE_SIZE; + /* length needs to be page aligned too */ + length = max_npages * PAGE_SIZE; + + /* + * Use kvcalloc() to avoid memory fragmentation for a large page array. + * Set __GFP_NOWARN to avoid syzkaller blowups + */ + pages = kvcalloc(max_npages, sizeof(*pages), GFP_KERNEL | __GFP_NOWARN); + if (!pages) + return ERR_PTR(-ENOMEM); + + access = iommufd_access_create_internal(viommu->ictx); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_free; + } + + rc = iommufd_access_attach_internal(access, viommu->hwpt->ioas); + if (rc) + goto out_destroy; + + rc = iommufd_access_pin_pages(access, aligned_iova, length, pages, 0); + if (rc) + goto out_detach; + + /* Validate if the underlying physical pages are contiguous */ + for (i = 1; i < max_npages; i++) { + if (page_to_pfn(pages[i]) == page_to_pfn(pages[i - 1]) + 1) + continue; + rc = -EFAULT; + goto out_unpin; + } + + *base_pa = (page_to_pfn(pages[0]) << PAGE_SHIFT) + offset; + kfree(pages); + return access; + +out_unpin: + iommufd_access_unpin_pages(access, aligned_iova, length); +out_detach: + iommufd_access_detach_internal(access); +out_destroy: + iommufd_access_destroy_internal(viommu->ictx, access); +out_free: + kfree(pages); + return ERR_PTR(rc); +} + +int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_hw_queue_alloc *cmd = ucmd->cmd; + struct iommufd_hw_queue *hw_queue; + struct iommufd_viommu *viommu; + struct iommufd_access *access; + size_t hw_queue_size; + phys_addr_t base_pa; + u64 last; + int rc; + + if (cmd->flags || cmd->type == IOMMU_HW_QUEUE_TYPE_DEFAULT) + return -EOPNOTSUPP; + if (!cmd->length) + return -EINVAL; + if (check_add_overflow(cmd->nesting_parent_iova, cmd->length - 1, + &last)) + return -EOVERFLOW; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + if (!viommu->ops || !viommu->ops->get_hw_queue_size || + !viommu->ops->hw_queue_init_phys) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue_size = viommu->ops->get_hw_queue_size(viommu, cmd->type); + if (!hw_queue_size) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + /* + * It is a driver bug for providing a hw_queue_size smaller than the + * core HW queue 
structure size + */ + if (WARN_ON_ONCE(hw_queue_size < sizeof(*hw_queue))) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + hw_queue = (struct iommufd_hw_queue *)_iommufd_object_alloc_ucmd( + ucmd, hw_queue_size, IOMMUFD_OBJ_HW_QUEUE); + if (IS_ERR(hw_queue)) { + rc = PTR_ERR(hw_queue); + goto out_put_viommu; + } + + access = iommufd_hw_queue_alloc_phys(cmd, viommu, &base_pa); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_viommu; + } + + hw_queue->viommu = viommu; + refcount_inc(&viommu->obj.users); + hw_queue->access = access; + hw_queue->type = cmd->type; + hw_queue->length = cmd->length; + hw_queue->base_addr = cmd->nesting_parent_iova; + + rc = viommu->ops->hw_queue_init_phys(hw_queue, cmd->index, base_pa); + if (rc) + goto out_put_viommu; + + cmd->out_hw_queue_id = hw_queue->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index f13f3ca6adb5e..ce4011a2fc270 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -123,6 +123,7 @@ struct iommufd_vdevice { struct iommufd_hw_queue { struct iommufd_object obj; struct iommufd_viommu *viommu; + struct iommufd_access *access; u64 base_addr; /* in guest physical address space */ size_t length; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 6eb4bf855f782..687e4b91f762a 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -56,6 +56,7 @@ enum { IOMMUFD_CMD_VDEVICE_ALLOC = 0x91, IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, + IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94, }; /** @@ -1149,4 +1150,37 @@ enum iommu_hw_queue_type { IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, }; +/** + * struct iommu_hw_queue_alloc - ioctl(IOMMU_HW_QUEUE_ALLOC) + * @size: sizeof(struct iommu_hw_queue_alloc) + * @flags: Must be 0 + * @viommu_id: Virtual IOMMU ID to associate the HW queue with + * @type: One of enum iommu_hw_queue_type + * @index: The logical index to the HW queue per virtual IOMMU for a multi-queue + * model + * @out_hw_queue_id: The ID of the new HW queue + * @nesting_parent_iova: Base address of the queue memory in the guest physical + * address space + * @length: Length of the queue memory + * + * Allocate a HW queue object for a vIOMMU-specific HW-accelerated queue, which + * allows HW to access a guest queue memory described using @nesting_parent_iova + * and @length. + * + * A vIOMMU can allocate multiple queues, but it must use a different @index per + * type to separate each allocation, e.g:: + * + * Type1 HW queue0, Type1 HW queue1, Type2 HW queue0, ... + */ +struct iommu_hw_queue_alloc { + __u32 size; + __u32 flags; + __u32 viommu_id; + __u32 type; + __u32 index; + __u32 out_hw_queue_id; + __aligned_u64 nesting_parent_iova; + __aligned_u64 length; +}; +#define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC) #endif From 2e5546b6e4521ff619a0183c784f1317c3178353 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:07 -0700 Subject: [PATCH 099/147] iommufd/driver: Add iommufd_hw_queue_depend/undepend() helpers NVIDIA Virtual Command Queue is one of the iommufd users exposing vIOMMU features to user space VMs. Its hardware has a strict rule when mapping and unmapping multiple global CMDQVs to/from a VM-owned VINTF, requiring mappings in ascending order and unmappings in descending order. 
The tegra241-cmdqv driver can apply the rule for a mapping in the LVCMDQ
allocation handler. However, it can't do the same for an unmapping since
user space could start random destroy calls breaking the rule, while the
destroy op in the driver level can't reject a destroy call as it returns
void.

Add iommufd_hw_queue_depend/undepend for-driver helpers, allowing LVCMDQ
allocator to refcount_inc() a sibling LVCMDQ object and LVCMDQ destroyer
to refcount_dec(), so that iommufd core will help block a random destroy
call that breaks the rule.

This is a bit of a compromise, because a driver might end up abusing the
API and deadlocking the objects. So restrict the API to a dependency
between two driver-allocated objects of the same type, as iommufd would
unlikely build any core-level dependency in this case. And encourage the
use of the macro version, which currently supports the HW QUEUE objects
only.

Link: https://patch.msgid.link/r/2735c32e759c82f2e6c87cb32134eaf09b7589b5.1752126748.git.nicolinc@nvidia.com
Reviewed-by: Lu Baolu
Reviewed-by: Pranjal Shrivastava
Reviewed-by: Jason Gunthorpe
Signed-off-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
(cherry picked from commit 0b37d892d0425811618a737037b0212884cc25ae linux-next)
Signed-off-by: Nirmoy Das
---
 drivers/iommu/iommufd/driver.c | 28 ++++++++++++++++++++++
 include/linux/iommufd.h        | 44 ++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c
index 887719016804c..e578ef32d30c0 100644
--- a/drivers/iommu/iommufd/driver.c
+++ b/drivers/iommu/iommufd/driver.c
@@ -3,6 +3,34 @@
  */
 #include "iommufd_private.h"
 
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+int _iommufd_object_depend(struct iommufd_object *obj_dependent,
+			   struct iommufd_object *obj_depended)
+{
+	/* Reject self dependency that deadlocks */
+	if (obj_dependent == obj_depended)
+		return -EINVAL;
+	/* Only support dependency between two objects of the same type */
+	if (obj_dependent->type != obj_depended->type)
+		return -EINVAL;
+
+	refcount_inc(&obj_depended->users);
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_depend, "IOMMUFD");
+
+/* Driver should use a per-structure helper in include/linux/iommufd.h */
+void _iommufd_object_undepend(struct iommufd_object *obj_dependent,
+			      struct iommufd_object *obj_depended)
+{
+	if (WARN_ON_ONCE(obj_dependent == obj_depended ||
+			 obj_dependent->type != obj_depended->type))
+		return;
+
+	refcount_dec(&obj_depended->users);
+}
+EXPORT_SYMBOL_NS_GPL(_iommufd_object_undepend, "IOMMUFD");
+
 /* Caller should xa_lock(&viommu->vdevs) to protect the return value */
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id)
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index ce4011a2fc270..fa23439fa483b 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -251,6 +251,10 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
 #endif /* CONFIG_IOMMUFD */
 
 #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER_CORE)
+int _iommufd_object_depend(struct iommufd_object *obj_dependent,
+			   struct iommufd_object *obj_depended);
+void _iommufd_object_undepend(struct iommufd_object *obj_dependent,
+			      struct iommufd_object *obj_depended);
 struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu,
 				       unsigned long vdev_id);
 int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu,
@@ -259,6 +263,18 @@ int iommufd_viommu_report_event(struct iommufd_viommu *viommu,
enum iommu_veventq_type type, void *event_data, size_t data_len); #else /* !CONFIG_IOMMUFD_DRIVER_CORE */ +static inline int _iommufd_object_depend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) +{ + return -EOPNOTSUPP; +} + +static inline void +_iommufd_object_undepend(struct iommufd_object *obj_dependent, + struct iommufd_object *obj_depended) +{ +} + static inline struct device * iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { @@ -298,4 +314,32 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, BUILD_BUG_ON_ZERO(!__same_type(struct iommufd_hw_queue, \ ((drv_struct *)NULL)->member))) +/* + * Helpers for IOMMU driver to build/destroy a dependency between two sibling + * structures created by one of the allocators above + */ +#define iommufd_hw_queue_depend(dependent, depended, member) \ + ({ \ + int ret = -EINVAL; \ + \ + static_assert(__same_type(struct iommufd_hw_queue, \ + dependent->member)); \ + static_assert(__same_type(typeof(*dependent), *depended)); \ + if (!WARN_ON_ONCE(dependent->member.viommu != \ + depended->member.viommu)) \ + ret = _iommufd_object_depend(&dependent->member.obj, \ + &depended->member.obj); \ + ret; \ + }) + +#define iommufd_hw_queue_undepend(dependent, depended, member) \ + ({ \ + static_assert(__same_type(struct iommufd_hw_queue, \ + dependent->member)); \ + static_assert(__same_type(typeof(*dependent), *depended)); \ + WARN_ON_ONCE(dependent->member.viommu != \ + depended->member.viommu); \ + _iommufd_object_undepend(&dependent->member.obj, \ + &depended->member.obj); \ + }) #endif From 3c049ac2caa35a6171e40f18d5e4056c2494901e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:08 -0700 Subject: [PATCH 100/147] iommufd/selftest: Add coverage for IOMMUFD_CMD_HW_QUEUE_ALLOC Some simple tests for IOMMUFD_CMD_HW_QUEUE_ALLOC infrastructure covering the new iommufd_hw_queue_depend/undepend() helpers. 
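
The usage pattern being exercised is roughly the following, assuming
driver structures that embed a struct iommufd_hw_queue member named
"core", as the mock driver below does:

	/* In the init path: queue N keeps queue N-1 alive */
	rc = iommufd_hw_queue_depend(new_queue, prev_queue, core);

	/* In the destroy path: drop the reference in reverse order */
	iommufd_hw_queue_undepend(new_queue, prev_queue, core);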
Link: https://patch.msgid.link/r/e8a194d187d7ef445f43e4a3c04fb39472050afd.1752126748.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe (cherry picked from commit 20896914da8ad24df0a77e24887912d87754fb83 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_test.h | 3 + drivers/iommu/iommufd/selftest.c | 97 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 59 +++++++++++ .../selftests/iommu/iommufd_fail_nth.c | 6 ++ tools/testing/selftests/iommu/iommufd_utils.h | 31 ++++++ 5 files changed, 196 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index fbf9ecb35a137..51cd744a354f4 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -265,4 +265,7 @@ struct iommu_viommu_event_selftest { __u32 virt_id; }; +#define IOMMU_HW_QUEUE_TYPE_SELFTEST 0xdeadbeef +#define IOMMU_TEST_HW_QUEUE_MAX 2 + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 020160060e18d..75e0941cc1e49 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -147,6 +147,8 @@ to_mock_nested(struct iommu_domain *domain) struct mock_viommu { struct iommufd_viommu core; struct mock_iommu_domain *s2_parent; + struct mock_hw_queue *hw_queue[IOMMU_TEST_HW_QUEUE_MAX]; + struct mutex queue_mutex; }; static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) @@ -154,6 +156,19 @@ static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) return container_of(viommu, struct mock_viommu, core); } +struct mock_hw_queue { + struct iommufd_hw_queue core; + struct mock_viommu *mock_viommu; + struct mock_hw_queue *prev; + u16 index; +}; + +static inline struct mock_hw_queue * +to_mock_hw_queue(struct iommufd_hw_queue *hw_queue) +{ + return container_of(hw_queue, struct mock_hw_queue, core); +} + enum selftest_obj_type { TYPE_IDEV, }; @@ -631,9 +646,11 @@ static void mock_viommu_destroy(struct iommufd_viommu *viommu) { struct mock_iommu_device *mock_iommu = container_of( viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); if (refcount_dec_and_test(&mock_iommu->users)) complete(&mock_iommu->complete); + mutex_destroy(&mock_viommu->queue_mutex); /* iommufd core frees mock_viommu and viommu */ } @@ -725,10 +742,86 @@ static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, return rc; } +static size_t mock_viommu_get_hw_queue_size(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type queue_type) +{ + if (queue_type != IOMMU_HW_QUEUE_TYPE_SELFTEST) + return 0; + return HW_QUEUE_STRUCT_SIZE(struct mock_hw_queue, core); +} + +static void mock_hw_queue_destroy(struct iommufd_hw_queue *hw_queue) +{ + struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue); + struct mock_viommu *mock_viommu = mock_hw_queue->mock_viommu; + + mutex_lock(&mock_viommu->queue_mutex); + mock_viommu->hw_queue[mock_hw_queue->index] = NULL; + if (mock_hw_queue->prev) + iommufd_hw_queue_undepend(mock_hw_queue, mock_hw_queue->prev, + core); + mutex_unlock(&mock_viommu->queue_mutex); +} + +/* Test iommufd_hw_queue_depend/undepend() */ +static int mock_hw_queue_init_phys(struct iommufd_hw_queue *hw_queue, u32 index, + phys_addr_t base_addr_pa) +{ + struct mock_viommu *mock_viommu = to_mock_viommu(hw_queue->viommu); + struct mock_hw_queue *mock_hw_queue = to_mock_hw_queue(hw_queue); + struct 
mock_hw_queue *prev = NULL; + int rc = 0; + + if (index >= IOMMU_TEST_HW_QUEUE_MAX) + return -EINVAL; + + mutex_lock(&mock_viommu->queue_mutex); + + if (mock_viommu->hw_queue[index]) { + rc = -EEXIST; + goto unlock; + } + + if (index) { + prev = mock_viommu->hw_queue[index - 1]; + if (!prev) { + rc = -EIO; + goto unlock; + } + } + + /* + * Test to catch a kernel bug if the core converted the physical address + * incorrectly. Let mock_domain_iova_to_phys() WARN_ON if it fails. + */ + if (base_addr_pa != iommu_iova_to_phys(&mock_viommu->s2_parent->domain, + hw_queue->base_addr)) { + rc = -EFAULT; + goto unlock; + } + + if (prev) { + rc = iommufd_hw_queue_depend(mock_hw_queue, prev, core); + if (rc) + goto unlock; + } + + mock_hw_queue->prev = prev; + mock_hw_queue->mock_viommu = mock_viommu; + mock_viommu->hw_queue[index] = mock_hw_queue; + + hw_queue->destroy = &mock_hw_queue_destroy; +unlock: + mutex_unlock(&mock_viommu->queue_mutex); + return rc; +} + static struct iommufd_viommu_ops mock_viommu_ops = { .destroy = mock_viommu_destroy, .alloc_domain_nested = mock_viommu_alloc_domain_nested, .cache_invalidate = mock_viommu_cache_invalidate, + .get_hw_queue_size = mock_viommu_get_hw_queue_size, + .hw_queue_init_phys = mock_hw_queue_init_phys, }; static size_t mock_get_viommu_size(struct device *dev, @@ -745,6 +838,7 @@ static int mock_viommu_init(struct iommufd_viommu *viommu, { struct mock_iommu_device *mock_iommu = container_of( viommu->iommu_dev, struct mock_iommu_device, iommu_dev); + struct mock_viommu *mock_viommu = to_mock_viommu(viommu); struct iommu_viommu_selftest data; int rc; @@ -762,6 +856,9 @@ static int mock_viommu_init(struct iommufd_viommu *viommu, } refcount_inc(&mock_iommu->users); + mutex_init(&mock_viommu->queue_mutex); + mock_viommu->s2_parent = to_mock_domain(parent_domain); + viommu->ops = &mock_viommu_ops; return 0; } diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index a9dfcce5e1b22..73426de77675a 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -3032,6 +3032,65 @@ TEST_F(iommufd_viommu, vdevice_cache) } } +TEST_F(iommufd_viommu, hw_queue) +{ + __u64 iova = MOCK_APERTURE_START, iova2; + uint32_t viommu_id = self->viommu_id; + uint32_t hw_queue_id[2]; + + if (!viommu_id) + SKIP(return, "Skipping test for variant no_viommu"); + + /* Fail IOMMU_HW_QUEUE_TYPE_DEFAULT */ + test_err_hw_queue_alloc(EOPNOTSUPP, viommu_id, + IOMMU_HW_QUEUE_TYPE_DEFAULT, 0, iova, PAGE_SIZE, + &hw_queue_id[0]); + /* Fail queue addr and length */ + test_err_hw_queue_alloc(EINVAL, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + 0, iova, 0, &hw_queue_id[0]); + test_err_hw_queue_alloc(EOVERFLOW, viommu_id, + IOMMU_HW_QUEUE_TYPE_SELFTEST, 0, ~(uint64_t)0, + PAGE_SIZE, &hw_queue_id[0]); + /* Fail missing iova */ + test_err_hw_queue_alloc(ENOENT, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + 0, iova, PAGE_SIZE, &hw_queue_id[0]); + + /* Map iova */ + test_ioctl_ioas_map(buffer, PAGE_SIZE, &iova); + test_ioctl_ioas_map(buffer + PAGE_SIZE, PAGE_SIZE, &iova2); + + /* Fail index=1 and =MAX; must start from index=0 */ + test_err_hw_queue_alloc(EIO, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, 1, + iova, PAGE_SIZE, &hw_queue_id[0]); + test_err_hw_queue_alloc(EINVAL, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + IOMMU_TEST_HW_QUEUE_MAX, iova, PAGE_SIZE, + &hw_queue_id[0]); + + /* Allocate index=0, declare ownership of the iova */ + test_cmd_hw_queue_alloc(viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, 0, + iova, PAGE_SIZE, 
&hw_queue_id[0]); + /* Fail duplicated index */ + test_err_hw_queue_alloc(EEXIST, viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, + 0, iova, PAGE_SIZE, &hw_queue_id[0]); + /* Fail unmap, due to iova ownership */ + test_err_ioctl_ioas_unmap(EBUSY, iova, PAGE_SIZE); + /* The 2nd page is not pinned, so it can be unmapped */ + test_ioctl_ioas_unmap(iova2, PAGE_SIZE); + + /* Allocate index=1, with an unaligned case */ + test_cmd_hw_queue_alloc(viommu_id, IOMMU_HW_QUEUE_TYPE_SELFTEST, 1, + iova + PAGE_SIZE / 2, PAGE_SIZE / 2, + &hw_queue_id[1]); + /* Fail to destroy, due to dependency */ + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hw_queue_id[0])); + + /* Destroy in descending order */ + test_ioctl_destroy(hw_queue_id[1]); + test_ioctl_destroy(hw_queue_id[0]); + /* Now it can unmap the first page */ + test_ioctl_ioas_unmap(iova, PAGE_SIZE); +} + FIXTURE(iommufd_device_pasid) { int fd; diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index f7ccf18221089..41c685bbd2522 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -634,6 +634,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) uint32_t idev_id; uint32_t hwpt_id; uint32_t viommu_id; + uint32_t hw_queue_id; uint32_t vdev_id; __u64 iova; @@ -696,6 +697,11 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, 0, &vdev_id)) return -1; + if (_test_cmd_hw_queue_alloc(self->fd, viommu_id, + IOMMU_HW_QUEUE_TYPE_SELFTEST, 0, iova, + PAGE_SIZE, &hw_queue_id)) + return -1; + if (_test_ioctl_fault_alloc(self->fd, &fault_id, &fault_fd)) return -1; close(fault_fd); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index a5d4cbd089bae..9a556f99d9924 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -956,6 +956,37 @@ static int _test_cmd_vdevice_alloc(int fd, __u32 viommu_id, __u32 idev_id, _test_cmd_vdevice_alloc(self->fd, viommu_id, idev_id, \ virt_id, vdev_id)) +static int _test_cmd_hw_queue_alloc(int fd, __u32 viommu_id, __u32 type, + __u32 idx, __u64 base_addr, __u64 length, + __u32 *hw_queue_id) +{ + struct iommu_hw_queue_alloc cmd = { + .size = sizeof(cmd), + .viommu_id = viommu_id, + .type = type, + .index = idx, + .nesting_parent_iova = base_addr, + .length = length, + }; + int ret; + + ret = ioctl(fd, IOMMU_HW_QUEUE_ALLOC, &cmd); + if (ret) + return ret; + if (hw_queue_id) + *hw_queue_id = cmd.out_hw_queue_id; + return 0; +} + +#define test_cmd_hw_queue_alloc(viommu_id, type, idx, base_addr, len, out_qid) \ + ASSERT_EQ(0, _test_cmd_hw_queue_alloc(self->fd, viommu_id, type, idx, \ + base_addr, len, out_qid)) +#define test_err_hw_queue_alloc(_errno, viommu_id, type, idx, base_addr, len, \ + out_qid) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_hw_queue_alloc(self->fd, viommu_id, type, idx, \ + base_addr, len, out_qid)) + static int _test_cmd_veventq_alloc(int fd, __u32 viommu_id, __u32 type, __u32 *veventq_id, __u32 *veventq_fd) { From f78a245e2997fa34c167bbe6d9dbaae88e5ed087 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:09 -0700 Subject: [PATCH 101/147] iommufd: Add mmap interface For vIOMMU passing through HW resources to user space (VMs), allowing a VM to control the passed-through HW directly by accessing hardware registers, add an mmap infrastructure to map the physical MMIO pages to user space.
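From the VMM side this ends up as a plain mmap() on the iommufd file descriptor, roughly as below (a sketch; out_mmap_offset and out_mmap_length stand for whatever offset/length pair the owning object's allocation uAPI reported back):

	void *mmio = mmap(NULL, out_mmap_length, PROT_READ | PROT_WRITE,
			  MAP_SHARED, iommufd_fd, out_mmap_offset);
	if (mmio == MAP_FAILED)
		/* e.g. ENXIO when the offset/length pair is not an exact match */;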
Maintain a maple tree per ictx as a translation table managing mmappable regions, from an allocated, user-facing mmap offset to an iommufd_mmap struct that stores the real physical address range for io_remap_pfn_range(). Keep track of the lifecycle of the mmappable region by taking a refcount on its owner, forcing user space to unmap the region before it can destroy the owner object. To allow an IOMMU driver to add and delete mmappable regions onto/from the maple tree, add iommufd_viommu_alloc/destroy_mmap helpers. Link: https://patch.msgid.link/r/9a888a326b12aa5fe940083eae1156304e210fe0.1752126748.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Lu Baolu Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 56e9a0d8e53f56f313d332888a32a44a71f3a9ab linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/driver.c | 52 ++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 14 ++++++ drivers/iommu/iommufd/main.c | 63 +++++++++++++++++++++++++ include/linux/iommufd.h | 42 +++++++++++++++++ 4 files changed, 171 insertions(+) diff --git a/drivers/iommu/iommufd/driver.c b/drivers/iommu/iommufd/driver.c index e578ef32d30c0..e4eae20bcd4e8 100644 --- a/drivers/iommu/iommufd/driver.c +++ b/drivers/iommu/iommufd/driver.c @@ -31,6 +31,58 @@ void _iommufd_object_undepend(struct iommufd_object *obj_dependent, } EXPORT_SYMBOL_NS_GPL(_iommufd_object_undepend, "IOMMUFD"); +/* + * Allocate an @offset to return to user space to use for an mmap() syscall + * + * Driver should use a per-structure helper in include/linux/iommufd.h + */ +int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner, + phys_addr_t mmio_addr, size_t length, + unsigned long *offset) +{ + struct iommufd_mmap *immap; + unsigned long startp; + int rc; + + if (!PAGE_ALIGNED(mmio_addr)) + return -EINVAL; + if (!length || !PAGE_ALIGNED(length)) + return -EINVAL; + + immap = kzalloc(sizeof(*immap), GFP_KERNEL); + if (!immap) + return -ENOMEM; + immap->owner = owner; + immap->length = length; + immap->mmio_addr = mmio_addr; + + /* Skip the first page to help the caller identify the returned offset */ + rc = mtree_alloc_range(&ictx->mt_mmap, &startp, immap, immap->length, + PAGE_SIZE, ULONG_MAX, GFP_KERNEL); + if (rc < 0) { + kfree(immap); + return rc; + } + + /* mmap() syscall will right-shift the offset in vma->vm_pgoff too */ + immap->vm_pgoff = startp >> PAGE_SHIFT; + *offset = startp; + return 0; +} +EXPORT_SYMBOL_NS_GPL(_iommufd_alloc_mmap, "IOMMUFD"); + +/* Driver should use a per-structure helper in include/linux/iommufd.h */ +void _iommufd_destroy_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, unsigned long offset) +{ + struct iommufd_mmap *immap; + + immap = mtree_erase(&ictx->mt_mmap, offset); + WARN_ON_ONCE(!immap || immap->owner != owner); + kfree(immap); +} +EXPORT_SYMBOL_NS_GPL(_iommufd_destroy_mmap, "IOMMUFD"); + /* Caller should xa_lock(&viommu->vdevs) to protect the return value */ struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 1c12e5ae01f9c..ee026fe6f382a 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ struct iommufd_ctx { struct xarray groups; wait_queue_head_t
destroy_wait; struct rw_semaphore ioas_creation_lock; + struct maple_tree mt_mmap; struct mutex sw_msi_lock; struct list_head sw_msi_list; @@ -55,6 +57,18 @@ struct iommufd_ctx { struct iommufd_ioas *vfio_ioas; }; +/* Entry for iommufd_ctx::mt_mmap */ +struct iommufd_mmap { + struct iommufd_object *owner; + + /* Page-shifted start position in mt_mmap to validate vma->vm_pgoff */ + unsigned long vm_pgoff; + + /* Physical range for io_remap_pfn_range() */ + phys_addr_t mmio_addr; + size_t length; +}; + /* * The IOVA to PFN map. The map automatically copies the PFNs into multiple * domains and permits sharing of PFNs between io_pagetable instances. This diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 4e8dbbfac8900..0fb81a905cb13 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -275,6 +275,7 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); ictx->file = filp; + mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE); init_waitqueue_head(&ictx->destroy_wait); mutex_init(&ictx->sw_msi_lock); INIT_LIST_HEAD(&ictx->sw_msi_list); @@ -479,11 +480,73 @@ static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, return ret; } +static void iommufd_fops_vma_open(struct vm_area_struct *vma) +{ + struct iommufd_mmap *immap = vma->vm_private_data; + + refcount_inc(&immap->owner->users); +} + +static void iommufd_fops_vma_close(struct vm_area_struct *vma) +{ + struct iommufd_mmap *immap = vma->vm_private_data; + + refcount_dec(&immap->owner->users); +} + +static const struct vm_operations_struct iommufd_vma_ops = { + .open = iommufd_fops_vma_open, + .close = iommufd_fops_vma_close, +}; + +/* The vm_pgoff must be pre-allocated from mt_mmap, and given to user space */ +static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct iommufd_ctx *ictx = filp->private_data; + size_t length = vma->vm_end - vma->vm_start; + struct iommufd_mmap *immap; + int rc; + + if (!PAGE_ALIGNED(length)) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + if (vma->vm_flags & VM_EXEC) + return -EPERM; + + /* vma->vm_pgoff carries a page-shifted start position to an immap */ + immap = mtree_load(&ictx->mt_mmap, vma->vm_pgoff << PAGE_SHIFT); + if (!immap) + return -ENXIO; + /* + * mtree_load() returns the immap for any contained mmio_addr, so only + * allow the exact immap thing to be mapped + */ + if (vma->vm_pgoff != immap->vm_pgoff || length != immap->length) + return -ENXIO; + + vma->vm_pgoff = 0; + vma->vm_private_data = immap; + vma->vm_ops = &iommufd_vma_ops; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + rc = io_remap_pfn_range(vma, vma->vm_start, + immap->mmio_addr >> PAGE_SHIFT, length, + vma->vm_page_prot); + if (rc) + return rc; + + /* vm_ops.open won't be called for mmap itself. 
*/ + refcount_inc(&immap->owner->users); + return rc; +} + static const struct file_operations iommufd_fops = { .owner = THIS_MODULE, .open = iommufd_fops_open, .release = iommufd_fops_release, .unlocked_ioctl = iommufd_fops_ioctl, + .mmap = iommufd_fops_mmap, }; /** diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index fa23439fa483b..e3a0cd47384d0 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -255,6 +255,11 @@ int _iommufd_object_depend(struct iommufd_object *obj_dependent, struct iommufd_object *obj_depended); void _iommufd_object_undepend(struct iommufd_object *obj_dependent, struct iommufd_object *obj_depended); +int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, struct iommufd_object *owner, + phys_addr_t mmio_addr, size_t length, + unsigned long *offset); +void _iommufd_destroy_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, unsigned long offset); struct device *iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id); int iommufd_viommu_get_vdev_id(struct iommufd_viommu *viommu, @@ -275,6 +280,20 @@ _iommufd_object_undepend(struct iommufd_object *obj_dependent, { } +static inline int _iommufd_alloc_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, + phys_addr_t mmio_addr, size_t length, + unsigned long *offset) +{ + return -EOPNOTSUPP; +} + +static inline void _iommufd_destroy_mmap(struct iommufd_ctx *ictx, + struct iommufd_object *owner, + unsigned long offset) +{ +} + static inline struct device * iommufd_viommu_find_dev(struct iommufd_viommu *viommu, unsigned long vdev_id) { @@ -342,4 +361,27 @@ static inline int iommufd_viommu_report_event(struct iommufd_viommu *viommu, _iommufd_object_undepend(&dependent->member.obj, \ &depended->member.obj); \ }) + +/* + * Helpers for IOMMU driver to alloc/destroy an mmappable area for a structure. + * + * To support an mmappable MMIO region, the kernel driver must first register it + * with the iommufd core to allocate an @offset, during a driver-structure + * initialization (e.g. viommu_init op). Then, it should report this @offset and + * the @length of the MMIO region to user space for the mmap syscall. + */ +static inline int iommufd_viommu_alloc_mmap(struct iommufd_viommu *viommu, + phys_addr_t mmio_addr, + size_t length, + unsigned long *offset) +{ + return _iommufd_alloc_mmap(viommu->ictx, &viommu->obj, mmio_addr, + length, offset); +} + +static inline void iommufd_viommu_destroy_mmap(struct iommufd_viommu *viommu, + unsigned long offset) +{ + _iommufd_destroy_mmap(viommu->ictx, &viommu->obj, offset); +} #endif From e310ce6a44967e554126a4c12489ad089d15228a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:10 -0700 Subject: [PATCH 102/147] iommufd/selftest: Add coverage for the new mmap interface Extend the loopback test to the new mmap page.
Link: https://patch.msgid.link/r/b02b1220c955c3cf9ea5dd9fe9349ab1b4f8e20b.1752126748.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe (cherry picked from commit 80478a2b450e984b5d270d0d7088912d64e84303 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/iommufd_test.h | 4 +++ drivers/iommu/iommufd/selftest.c | 33 ++++++++++++++++++- tools/testing/selftests/iommu/iommufd.c | 19 +++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 4 +++ 4 files changed, 59 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 51cd744a354f4..8fc618b2bcf96 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -232,12 +232,16 @@ struct iommu_hwpt_invalidate_selftest { * (IOMMU_VIOMMU_TYPE_SELFTEST) * @in_data: Input random data from user space * @out_data: Output data (matching @in_data) to user space + * @out_mmap_offset: The offset argument for mmap syscall + * @out_mmap_length: The length argument for mmap syscall * * Simply set @out_data=@in_data for a loopback test */ struct iommu_viommu_selftest { __u32 in_data; __u32 out_data; + __aligned_u64 out_mmap_offset; + __aligned_u64 out_mmap_length; }; /* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */ diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 75e0941cc1e49..8f04784b8faaa 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -149,6 +149,9 @@ struct mock_viommu { struct mock_iommu_domain *s2_parent; struct mock_hw_queue *hw_queue[IOMMU_TEST_HW_QUEUE_MAX]; struct mutex queue_mutex; + + unsigned long mmap_offset; + u32 *page; /* Mmap page to test u32 type of in_data */ }; static inline struct mock_viommu *to_mock_viommu(struct iommufd_viommu *viommu) @@ -650,6 +653,10 @@ static void mock_viommu_destroy(struct iommufd_viommu *viommu) if (refcount_dec_and_test(&mock_iommu->users)) complete(&mock_iommu->complete); + if (mock_viommu->mmap_offset) + iommufd_viommu_destroy_mmap(&mock_viommu->core, + mock_viommu->mmap_offset); + free_page((unsigned long)mock_viommu->page); mutex_destroy(&mock_viommu->queue_mutex); /* iommufd core frees mock_viommu and viommu */ @@ -848,11 +855,28 @@ static int mock_viommu_init(struct iommufd_viommu *viommu, if (rc) return rc; + /* Allocate two pages */ + mock_viommu->page = + (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!mock_viommu->page) + return -ENOMEM; + + rc = iommufd_viommu_alloc_mmap(&mock_viommu->core, + __pa(mock_viommu->page), + PAGE_SIZE * 2, + &mock_viommu->mmap_offset); + if (rc) + goto err_free_page; + + /* For loopback tests on both the page and out_data */ + *mock_viommu->page = data.in_data; data.out_data = data.in_data; + data.out_mmap_length = PAGE_SIZE * 2; + data.out_mmap_offset = mock_viommu->mmap_offset; rc = iommu_copy_struct_to_user( user_data, &data, IOMMU_VIOMMU_TYPE_SELFTEST, out_data); if (rc) - return rc; + goto err_destroy_mmap; } refcount_inc(&mock_iommu->users); @@ -861,6 +885,13 @@ static int mock_viommu_init(struct iommufd_viommu *viommu, viommu->ops = &mock_viommu_ops; return 0; + +err_destroy_mmap: + iommufd_viommu_destroy_mmap(&mock_viommu->core, + mock_viommu->mmap_offset); +err_free_page: + free_page((unsigned long)mock_viommu->page); + return rc; } static const struct iommu_ops mock_ops = { diff --git a/tools/testing/selftests/iommu/iommufd.c 
b/tools/testing/selftests/iommu/iommufd.c index 73426de77675a..0b21c095ca5e0 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -2799,6 +2799,7 @@ TEST_F(iommufd_viommu, viommu_alloc_with_data) struct iommu_viommu_selftest data = { .in_data = 0xbeef, }; + uint32_t *test; if (!self->device_id) SKIP(return, "Skipping test for variant no_viommu"); @@ -2807,6 +2808,24 @@ TEST_F(iommufd_viommu, viommu_alloc_with_data) IOMMU_VIOMMU_TYPE_SELFTEST, &data, sizeof(data), &self->viommu_id); ASSERT_EQ(data.out_data, data.in_data); + + /* Negative mmap tests -- offset and length cannot be changed */ + test_err_mmap(ENXIO, data.out_mmap_length, + data.out_mmap_offset + PAGE_SIZE); + test_err_mmap(ENXIO, data.out_mmap_length, + data.out_mmap_offset + PAGE_SIZE * 2); + test_err_mmap(ENXIO, data.out_mmap_length / 2, data.out_mmap_offset); + test_err_mmap(ENXIO, data.out_mmap_length * 2, data.out_mmap_offset); + + /* Now do a correct mmap for a loopback test */ + test = mmap(NULL, data.out_mmap_length, PROT_READ | PROT_WRITE, + MAP_SHARED, self->fd, data.out_mmap_offset); + ASSERT_NE(MAP_FAILED, test); + ASSERT_EQ(data.in_data, *test); + + /* The owner of the mmap region should be blocked */ + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, self->viommu_id)); + munmap(test, data.out_mmap_length); } TEST_F(iommufd_viommu, vdevice_alloc) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 9a556f99d9924..4a1b2bade0188 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -56,6 +56,10 @@ static unsigned long PAGE_SIZE; #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) +#define test_err_mmap(_errno, length, offset) \ + EXPECT_ERRNO(_errno, (long)mmap(NULL, length, PROT_READ | PROT_WRITE, \ + MAP_SHARED, self->fd, offset)) + static inline void *memfd_mmap(size_t length, int prot, int flags, int *mfd_p) { int mfd_flags = (flags & MAP_HUGETLB) ? MFD_HUGETLB : 0; From f73b78444dbf3654fbbf3a802bada581fc639cfe Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:11 -0700 Subject: [PATCH 103/147] Documentation: userspace-api: iommufd: Update HW QUEUE With the introduction of the new object and its infrastructure, update the doc to reflect that. Link: https://patch.msgid.link/r/caa3ddc0d9bacf05c5b3e02c5f306ff3172cc54d.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Lu Baolu Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 035c9211f05befe3fa2765c00356d32974176a94 linux-next) Signed-off-by: Nirmoy Das --- Documentation/userspace-api/iommufd.rst | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Documentation/userspace-api/iommufd.rst b/Documentation/userspace-api/iommufd.rst index b0df15865dec6..03f7510384d26 100644 --- a/Documentation/userspace-api/iommufd.rst +++ b/Documentation/userspace-api/iommufd.rst @@ -124,6 +124,17 @@ Following IOMMUFD objects are exposed to userspace: used to allocate a vEVENTQ. Each vIOMMU can support multiple types of vEVENTS, but is confined to one vEVENTQ per vEVENTQ type. +- IOMMUFD_OBJ_HW_QUEUE, representing a hardware accelerated queue, as a subset + of IOMMU's virtualization features, for the IOMMU HW to directly read or write + the virtual queue memory owned by a guest OS. 
This HW-acceleration feature + allows a VM to work with the IOMMU HW directly without a VM Exit, reducing the + overhead of hypercalls. Along with the HW QUEUE object, iommufd provides + user space an mmap interface for the VMM to mmap a physical MMIO region from the + host physical address space to the guest physical address space, allowing the + guest OS to directly control the allocated HW QUEUE. Thus, when allocating a + HW QUEUE, the VMM must request a pair of mmap info (offset/length) and pass them + exactly to an mmap syscall via its offset and length arguments. + All user-visible objects are destroyed via the IOMMU_DESTROY uAPI. The diagrams below show relationships between user-visible objects and kernel @@ -270,6 +281,7 @@ User visible objects are backed by following datastructures: - iommufd_viommu for IOMMUFD_OBJ_VIOMMU. - iommufd_vdevice for IOMMUFD_OBJ_VDEVICE. - iommufd_veventq for IOMMUFD_OBJ_VEVENTQ. +- iommufd_hw_queue for IOMMUFD_OBJ_HW_QUEUE. Several terminologies when looking at these datastructures: From 22712488530495c256a919145ecc56e6b96e117b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:12 -0700 Subject: [PATCH 104/147] iommu: Allow an input type in hw_info op The hw_info uAPI will support a bidirectional data_type field that can be used as an input field for user space to request a specific info data type. To prepare for the uAPI update, change the iommu layer first: - Add a new IOMMU_HW_INFO_TYPE_DEFAULT as an input, for which a driver can output its only (or first) supported type - Update the kdoc accordingly - Roll out the type validation in the existing drivers Link: https://patch.msgid.link/r/00f4a2d3d930721f61367014717b3ba2d1e82a81.1752126748.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 62622a8753fa6af3c104f9552863e6473b92fb31 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 4 ++++ drivers/iommu/intel/iommu.c | 4 ++++ drivers/iommu/iommufd/device.c | 3 +++ drivers/iommu/iommufd/selftest.c | 4 ++++ include/linux/iommu.h | 3 ++- include/uapi/linux/iommufd.h | 4 +++- 6 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 170d691628487..eb9fe1f6311a0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -15,6 +15,10 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 __iomem *base_idr; unsigned int i; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + return ERR_PTR(-EOPNOTSUPP); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3dea3647d4e3f..16cd6dff4c513 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4238,6 +4238,10 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, struct intel_iommu *iommu = info->iommu; struct iommu_hw_info_vtd *vtd; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_INTEL_VTD) + return ERR_PTR(-EOPNOTSUPP); + vtd = kzalloc(sizeof(*vtd), GFP_KERNEL); if (!vtd) return ERR_PTR(-ENOMEM); diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index
4589bbfcc4acb..b58c6b60c59bb 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1513,6 +1513,9 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) cmd->__reserved[2]) return -EOPNOTSUPP; + /* Clear the type field since drivers don't support a random input */ + cmd->out_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; + idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 8f04784b8faaa..6213f43f22580 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -291,6 +291,10 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, { struct iommu_test_hw_info *info; + if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && + *type != IOMMU_HW_INFO_TYPE_SELFTEST) + return ERR_PTR(-EOPNOTSUPP); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5ce812fff4ee3..5d743cdec881f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -608,7 +608,8 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data, * @capable: check capability * @hw_info: report iommu hardware information. The data buffer returned by this * op is allocated in the iommu driver and freed by the caller after - * use. + * use. @type can input a requested type and output a supported type. * Driver should reject an unsupported data @type input * @domain_alloc: Do not use in new drivers * @domain_alloc_identity: allocate an IDENTITY domain. Drivers should prefer to * use identity_domain instead. This should only be used diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 687e4b91f762a..4ff84f95acd29 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -593,13 +593,15 @@ struct iommu_hw_info_arm_smmuv3 { /** * enum iommu_hw_info_type - IOMMU Hardware Info Types - * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware + * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware * info + * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request a default type * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_DEFAULT = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, }; From 9e1fbb5cbd02ba0917e1d6f8c98cf4ede3e60ad1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:13 -0700 Subject: [PATCH 105/147] iommufd: Allow an input data_type via iommu_hw_info The iommu_hw_info uAPI can output the vendor data type from a driver via the out_data_type field, but this only allows a driver to report one data type. Now, with SMMUv3 having a Tegra241 CMDQV implementation, it has two sets of types and data structs to report. One way to support that is to use the same type field bidirectionally. Reuse the same field by adding an "in_data_type", allowing user space to request a specific type and get the corresponding data. For backward compatibility, since the ioctl handler has never checked an input value, add an IOMMU_HW_INFO_FLAG_INPUT_TYPE to switch between the old output-only field and the new bidirectional field.
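In user-space terms the new flag works roughly like this (a sketch; the fd and dev_id are assumed to be set up already, and the SMMUv3 info struct is just an example of a driver-specific buffer):

	struct iommu_hw_info_arm_smmuv3 info;
	struct iommu_hw_info cmd = {
		.size = sizeof(cmd),
		.flags = IOMMU_HW_INFO_FLAG_INPUT_TYPE,
		.dev_id = dev_id,
		.data_len = sizeof(info),
		.data_uptr = (uintptr_t)&info,
		.in_data_type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3,
	};

	/* Fails with EOPNOTSUPP if the driver cannot provide this type */
	int ret = ioctl(iommufd, IOMMU_GET_HW_INFO, &cmd);

Without the flag, anything written to the shared type field is ignored and the kernel behaves as if IOMMU_HW_INFO_TYPE_DEFAULT had been requested.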
Link: https://patch.msgid.link/r/887378a7167e1786d9d13cde0c36263ed61823d7.1752126748.git.nicolinc@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit a9f10bab2e5084d6746391fccd7bef6ac87620b8 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/device.c | 9 ++++++--- include/uapi/linux/iommufd.h | 20 +++++++++++++++++++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index b58c6b60c59bb..72916f2fd38e6 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1500,6 +1500,7 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD"); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) { + const u32 SUPPORTED_FLAGS = IOMMU_HW_INFO_FLAG_INPUT_TYPE; struct iommu_hw_info *cmd = ucmd->cmd; void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr); const struct iommu_ops *ops; @@ -1509,12 +1510,14 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) void *data; int rc; - if (cmd->flags || cmd->__reserved[0] || cmd->__reserved[1] || - cmd->__reserved[2]) + if (cmd->flags & ~SUPPORTED_FLAGS) + return -EOPNOTSUPP; + if (cmd->__reserved[0] || cmd->__reserved[1] || cmd->__reserved[2]) return -EOPNOTSUPP; /* Clear the type field since drivers don't support a random input */ - cmd->out_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; + if (!(cmd->flags & IOMMU_HW_INFO_FLAG_INPUT_TYPE)) + cmd->in_data_type = IOMMU_HW_INFO_TYPE_DEFAULT; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 4ff84f95acd29..2279af83451f2 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -628,6 +628,15 @@ enum iommufd_hw_capabilities { IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, }; +/** + * enum iommufd_hw_info_flags - Flags for iommu_hw_info + * @IOMMU_HW_INFO_FLAG_INPUT_TYPE: If set, @in_data_type carries an input type + * for user space to request a specific info type + */ +enum iommufd_hw_info_flags { + IOMMU_HW_INFO_FLAG_INPUT_TYPE = 1 << 0, +}; + /** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) @@ -637,6 +646,12 @@ enum iommufd_hw_capabilities { * data that kernel supports * @data_uptr: User pointer to a user-space buffer used by the kernel to fill * the iommu type specific hardware information data + * @in_data_type: This shares the same field with @out_data_type, making it + * a bidirectional field. When IOMMU_HW_INFO_FLAG_INPUT_TYPE is + * set, an input type carried via this @in_data_type field will + * be valid, requesting the info data of the given type. If + * IOMMU_HW_INFO_FLAG_INPUT_TYPE is unset, any input value will + * be seen as IOMMU_HW_INFO_TYPE_DEFAULT * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type.
* @out_capabilities: Output the generic iommu capability info type as defined @@ -666,7 +681,10 @@ struct iommu_hw_info { __u32 dev_id; __u32 data_len; __aligned_u64 data_uptr; - __u32 out_data_type; + union { + __u32 in_data_type; + __u32 out_data_type; + }; __u8 out_max_pasid_log2; __u8 __reserved[3]; __aligned_u64 out_capabilities; From 03a251bccd1867b2f5ab133e8ed2b5d19edeb7e3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:14 -0700 Subject: [PATCH 106/147] iommufd/selftest: Update hw_info coverage for an input data_type Test both IOMMU_HW_INFO_TYPE_DEFAULT and IOMMU_HW_INFO_TYPE_SELFTEST, and add a negative test for an unsupported type. Also drop the unused mask in test_cmd_get_hw_capabilities(), since checkpatch complains about it. Link: https://patch.msgid.link/r/f01a1e50cd7366f217cbf192ad0b2b79e0eb89f0.1752126748.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe (cherry picked from commit 3a35f7d4a4673edf6f02422bb2d78b17c667e167 linux-next) Signed-off-by: Nirmoy Das --- tools/testing/selftests/iommu/iommufd.c | 32 +++++++++++++----- .../selftests/iommu/iommufd_fail_nth.c | 4 +-- tools/testing/selftests/iommu/iommufd_utils.h | 33 +++++++++++-------- 3 files changed, 46 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 0b21c095ca5e0..d59d48022a24a 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -764,19 +764,34 @@ TEST_F(iommufd_ioas, get_hw_info) uint8_t max_pasid = 0; /* Provide a zero-size user_buffer */ - test_cmd_get_hw_info(self->device_id, NULL, 0); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, NULL, 0); /* Provide a user_buffer with exact size */ - test_cmd_get_hw_info(self->device_id, &buffer_exact, sizeof(buffer_exact)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_exact, + sizeof(buffer_exact)); + + /* Request a wrong data_type, then a correct one */ + test_err_get_hw_info(EOPNOTSUPP, self->device_id, + IOMMU_HW_INFO_TYPE_SELFTEST + 1, + &buffer_exact, sizeof(buffer_exact)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_SELFTEST, &buffer_exact, + sizeof(buffer_exact)); /* * Provide a user_buffer with size larger than the exact size to check if * the kernel zeroes the trailing bytes. */ - test_cmd_get_hw_info(self->device_id, &buffer_larger, sizeof(buffer_larger)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_larger, + sizeof(buffer_larger)); /* * Provide a user_buffer with size smaller than the exact size to check if * the fields within the size range still get updated.
*/ - test_cmd_get_hw_info(self->device_id, &buffer_smaller, sizeof(buffer_smaller)); + test_cmd_get_hw_info(self->device_id, + IOMMU_HW_INFO_TYPE_DEFAULT, + &buffer_smaller, sizeof(buffer_smaller)); test_cmd_get_hw_info_pasid(self->device_id, &max_pasid); ASSERT_EQ(0, max_pasid); if (variant->pasid_capable) { @@ -786,9 +801,11 @@ TEST_F(iommufd_ioas, get_hw_info) } } else { test_err_get_hw_info(ENOENT, self->device_id, - &buffer_exact, sizeof(buffer_exact)); + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_exact, + sizeof(buffer_exact)); test_err_get_hw_info(ENOENT, self->device_id, - &buffer_larger, sizeof(buffer_larger)); + IOMMU_HW_INFO_TYPE_DEFAULT, &buffer_larger, + sizeof(buffer_larger)); } } @@ -2175,8 +2192,7 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, 0, &hwpt_id); test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); - test_cmd_get_hw_capabilities(self->idev_id, caps, - IOMMU_HW_CAP_DIRTY_TRACKING); + test_cmd_get_hw_capabilities(self->idev_id, caps); ASSERT_EQ(IOMMU_HW_CAP_DIRTY_TRACKING, caps & IOMMU_HW_CAP_DIRTY_TRACKING); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index 41c685bbd2522..651fc9f13c081 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -667,8 +667,8 @@ TEST_FAIL_NTH(basic_fail_nth, device) &self->stdev_id, NULL, &idev_id)) return -1; - if (_test_cmd_get_hw_info(self->fd, idev_id, &info, - sizeof(info), NULL, NULL)) + if (_test_cmd_get_hw_info(self->fd, idev_id, IOMMU_HW_INFO_TYPE_DEFAULT, + &info, sizeof(info), NULL, NULL)) return -1; if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 4a1b2bade0188..5384852ce038c 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -761,20 +761,24 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata) #endif /* @data can be NULL */ -static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, - size_t data_len, uint32_t *capabilities, - uint8_t *max_pasid) +static int _test_cmd_get_hw_info(int fd, __u32 device_id, __u32 data_type, + void *data, size_t data_len, + uint32_t *capabilities, uint8_t *max_pasid) { struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data; struct iommu_hw_info cmd = { .size = sizeof(cmd), .dev_id = device_id, .data_len = data_len, + .in_data_type = data_type, .data_uptr = (uint64_t)data, .out_capabilities = 0, }; int ret; + if (data_type != IOMMU_HW_INFO_TYPE_DEFAULT) + cmd.flags |= IOMMU_HW_INFO_FLAG_INPUT_TYPE; + ret = ioctl(fd, IOMMU_GET_HW_INFO, &cmd); if (ret) return ret; @@ -817,20 +821,23 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, return 0; } -#define test_cmd_get_hw_info(device_id, data, data_len) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \ - data_len, NULL, NULL)) +#define test_cmd_get_hw_info(device_id, data_type, data, data_len) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data_type, \ + data, data_len, NULL, NULL)) -#define test_err_get_hw_info(_errno, device_id, data, data_len) \ - EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \ - data_len, NULL, NULL)) +#define test_err_get_hw_info(_errno, device_id, data_type, data, data_len) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_get_hw_info(self->fd, 
device_id, data_type, \ + data, data_len, NULL, NULL)) -#define test_cmd_get_hw_capabilities(device_id, caps, mask) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, \ +#define test_cmd_get_hw_capabilities(device_id, caps) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \ + IOMMU_HW_INFO_TYPE_DEFAULT, NULL, \ 0, &caps, NULL)) -#define test_cmd_get_hw_info_pasid(device_id, max_pasid) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, \ +#define test_cmd_get_hw_info_pasid(device_id, max_pasid) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \ + IOMMU_HW_INFO_TYPE_DEFAULT, NULL, \ 0, NULL, max_pasid)) static int _test_ioctl_fault_alloc(int fd, __u32 *fault_id, __u32 *fault_fd) From d67b386b806a1ef0c6d772fdcf26db7a54dda55a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:15 -0700 Subject: [PATCH 107/147] iommu/arm-smmu-v3-iommufd: Add vsmmu_size/type and vsmmu_init impl ops An impl driver might want to allocate its own type of vIOMMU object or the standard IOMMU_VIOMMU_TYPE_ARM_SMMUV3 by setting up its own SW/HW bits, as the tegra241-cmdqv driver will add IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV. Add vsmmu_size/type and vsmmu_init to struct arm_smmu_impl_ops. Prioritize them in arm_smmu_get_viommu_size() and arm_vsmmu_init(). Link: https://patch.msgid.link/r/375ac2b056764534bb7c10ecc4f34a0bae82b108.1752126748.git.nicolinc@nvidia.com Reviewed-by: Pranjal Shrivastava Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 61dd912ee02e4d1d412e1090c6e5d7f8cd0779df linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 8 ++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index eb9fe1f6311a0..2ab1c6cf4aac3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -416,6 +416,10 @@ size_t arm_smmu_get_viommu_size(struct device *dev, !(smmu->features & ARM_SMMU_FEAT_S2FWB)) return 0; + if (smmu->impl_ops && smmu->impl_ops->vsmmu_size && + viommu_type == smmu->impl_ops->vsmmu_type) + return smmu->impl_ops->vsmmu_size; + if (viommu_type != IOMMU_VIOMMU_TYPE_ARM_SMMUV3) return 0; @@ -439,6 +443,10 @@ int arm_vsmmu_init(struct iommufd_viommu *viommu, /* FIXME Move VMID allocation from the S2 domain allocation to here */ vsmmu->vmid = s2_parent->s2_cfg.vmid; + if (smmu->impl_ops && smmu->impl_ops->vsmmu_init && + viommu->type == smmu->impl_ops->vsmmu_type) + return smmu->impl_ops->vsmmu_init(vsmmu, user_data); + viommu->ops = &arm_vsmmu_ops; return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index f0b0aae9b2fb9..e73b0e99ad848 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -16,6 +16,7 @@ #include struct arm_smmu_device; +struct arm_vsmmu; /* MMIO registers */ #define ARM_SMMU_IDR0 0x0 @@ -720,6 +721,10 @@ struct arm_smmu_impl_ops { int (*init_structures)(struct arm_smmu_device *smmu); struct arm_smmu_cmdq *(*get_secondary_cmdq)( struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent); + const size_t vsmmu_size; + const enum iommu_viommu_type vsmmu_type; + int (*vsmmu_init)(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data); }; /* An SMMUv3 
instance */ From e3b8c8b3b3951e2942b21fcecf2285587b37fd04 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:16 -0700 Subject: [PATCH 108/147] iommu/arm-smmu-v3-iommufd: Add hw_info to impl_ops This will be used by the Tegra241 CMDQV implementation to report non-default HW info data. Link: https://patch.msgid.link/r/8a3bf5709358eb21aed2e8434534c30ecf83917c.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 9eb6a666df7f7a50e0a99c4c101864b8bb0dd685 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 8 ++++++-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 2ab1c6cf4aac3..1cf9646e776f8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -11,13 +11,17 @@ void *arm_smmu_hw_info(struct device *dev, u32 *length, enum iommu_hw_info_type *type) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); + const struct arm_smmu_impl_ops *impl_ops = master->smmu->impl_ops; struct iommu_hw_info_arm_smmuv3 *info; u32 __iomem *base_idr; unsigned int i; if (*type != IOMMU_HW_INFO_TYPE_DEFAULT && - *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) - return ERR_PTR(-EOPNOTSUPP); + *type != IOMMU_HW_INFO_TYPE_ARM_SMMUV3) { + if (!impl_ops || !impl_ops->hw_info) + return ERR_PTR(-EOPNOTSUPP); + return impl_ops->hw_info(master->smmu, length, type); + } info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index e73b0e99ad848..d882a1f9db6f8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -721,6 +721,13 @@ struct arm_smmu_impl_ops { int (*init_structures)(struct arm_smmu_device *smmu); struct arm_smmu_cmdq *(*get_secondary_cmdq)( struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent); + /* + * An implementation should define its own type other than the default + * IOMMU_HW_INFO_TYPE_ARM_SMMUV3, and it must validate the input @type + * to return its own structure. + */ + void *(*hw_info)(struct arm_smmu_device *smmu, u32 *length, + enum iommu_hw_info_type *type); const size_t vsmmu_size; const enum iommu_viommu_type vsmmu_type; int (*vsmmu_init)(struct arm_vsmmu *vsmmu, From 1c2d30d198a441ce5ffe5ea62972b6ea6ff2aead Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:17 -0700 Subject: [PATCH 109/147] iommu/tegra241-cmdqv: Use request_threaded_irq A vEVENT can be reported only from a threaded IRQ context. Change to using request_threaded_irq to support that.
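For reference, the resulting pattern is the one in the hunk below, annotated here (the semantics in the comments are standard genirq behavior, not anything CMDQV specific):

	/*
	 * With a NULL primary handler the genirq core installs a default
	 * hardirq handler that only wakes the IRQ thread; IRQF_ONESHOT is
	 * then mandatory and keeps the line masked until the thread
	 * function returns. tegra241_cmdqv_isr() thus runs in a sleepable
	 * thread context, from which a vEVENT can be reported.
	 */
	ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr,
				   IRQF_ONESHOT, "tegra241-cmdqv", cmdqv);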
Link: https://patch.msgid.link/r/f160193980e3b273afbd1d9cfc3e360084c05ba6.1752126748.git.nicolinc@nvidia.com Acked-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 1eb468744ccaafeaee145505d0aa5fd6227bd72f linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index dd7d030d2e890..ba029f7d24ce6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -824,8 +824,9 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->dev = smmu->impl_dev; if (cmdqv->irq > 0) { - ret = request_irq(irq, tegra241_cmdqv_isr, 0, "tegra241-cmdqv", - cmdqv); + ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr, + IRQF_ONESHOT, "tegra241-cmdqv", + cmdqv); if (ret) { dev_err(cmdqv->dev, "failed to request irq (%d): %d\n", cmdqv->irq, ret); From 6d796419a3062bfc3d2d03b9795d6d732f2286ed Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:18 -0700 Subject: [PATCH 110/147] iommu/tegra241-cmdqv: Simplify deinit flow in tegra241_cmdqv_remove_vintf() The current flow of tegra241_cmdqv_remove_vintf() is: 1. For each LVCMDQ, tegra241_vintf_remove_lvcmdq(): a. Disable the LVCMDQ HW b. Release the LVCMDQ SW resource 2. For the current VINTF, tegra241_vintf_hw_deinit(): c. Disable all LVCMDQ HWs d. Disable VINTF HW Obviously, steps 1.a and 2.c are redundant. Since tegra241_vintf_hw_deinit() disables all of its LVCMDQ HWs, the flow in tegra241_cmdqv_remove_vintf() can be simplified by calling it first: 1. For the current VINTF, tegra241_vintf_hw_deinit(): a. Disable all LVCMDQ HWs b. Disable VINTF HW 2. Release all LVCMDQ SW resources Drop tegra241_vintf_remove_lvcmdq(), and move tegra241_vintf_free_lvcmdq() as the new step 2.
Link: https://patch.msgid.link/r/86c97c8c4ee9ca192e7e7fa3007c10399d792ce6.1752126748.git.nicolinc@nvidia.com Acked-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 589899ee299e5314fae847d2ad0f86c2ffa94739 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index ba029f7d24ce6..8d418c131b1b0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -628,24 +628,17 @@ static int tegra241_cmdqv_init_vintf(struct tegra241_cmdqv *cmdqv, u16 max_idx, /* Remove Helpers */ -static void tegra241_vintf_remove_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) -{ - tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]); - tegra241_vintf_free_lvcmdq(vintf, lidx); -} - static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx) { struct tegra241_vintf *vintf = cmdqv->vintfs[idx]; u16 lidx; + tegra241_vintf_hw_deinit(vintf); + /* Remove LVCMDQ resources */ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) if (vintf->lvcmdqs[lidx]) - tegra241_vintf_remove_lvcmdq(vintf, lidx); - - /* Remove VINTF resources */ - tegra241_vintf_hw_deinit(vintf); + tegra241_vintf_free_lvcmdq(vintf, lidx); dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx); tegra241_cmdqv_deinit_vintf(cmdqv, idx); From d5aee1a24b7ec6340cb4a7d2d17cd84fe7f933db Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:19 -0700 Subject: [PATCH 111/147] iommu/tegra241-cmdqv: Do not statically map LVCMDQs To simplify the mappings from global VCMDQs to VINTFs' LVCMDQs, the design chose to do static allocations and mappings in the global reset function. However, with the user-owned VINTF support, it exposes a security concern: if a user space VM only wants one LVCMDQ for a VINTF, statically mapping two or more LVCMDQs creates a hidden VCMDQ that user space could use for a DoS attack, writing random data to overwhelm the kernel with unhandleable IRQs. Thus, to support the user-owned VINTF feature, an LVCMDQ mapping has to be done dynamically. HW allows pre-assigning global VCMDQs in the CMDQ_ALLOC registers, without finalizing the mappings by keeping CMDQV_CMDQ_ALLOCATED=0. So, add a pair of map/unmap helpers that simply set/clear that bit. For kernel-owned VINTF0, move LVCMDQ mappings to tegra241_vintf_hw_init(), and the unmappings to tegra241_vintf_hw_deinit(). For user-owned VINTFs that will be added, the mappings/unmappings will be done on demand upon an LVCMDQ allocation from user space. However, the dynamic LVCMDQ mapping/unmapping can complicate the timing of calling tegra241_vcmdq_hw_init/deinit(), which write to the LVCMDQ address space, i.e. require the LVCMDQ to be mapped. Highlight that with a note at the top of each of them.
Link: https://patch.msgid.link/r/be115a8f75537632daf5995b3e583d8a76553fba.1752126748.git.nicolinc@nvidia.com Acked-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 81f81db6328b2bab1b64cb1ebaa4c560bf91db9f linux-next) Signed-off-by: Nirmoy Das --- .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 8d418c131b1b0..869c90b660c11 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -351,6 +351,7 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, /* HW Reset Functions */ +/* This function is for LVCMDQ, so @vcmdq must not be unmapped yet */ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) { char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); @@ -379,6 +380,7 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h); } +/* This function is for LVCMDQ, so @vcmdq must be mapped prior */ static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) { char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); @@ -404,16 +406,42 @@ static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) return 0; } +/* Unmap a global VCMDQ from the pre-assigned LVCMDQ */ +static void tegra241_vcmdq_unmap_lvcmdq(struct tegra241_vcmdq *vcmdq) +{ + u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); + + writel(regval & ~CMDQV_CMDQ_ALLOCATED, + REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + dev_dbg(vcmdq->cmdqv->dev, "%sunmapped\n", h); +} + static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf) { - u16 lidx; + u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf; - for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) - if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) + /* HW requires to unmap LVCMDQs in descending order */ + while (lidx--) { + if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) { tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]); + tegra241_vcmdq_unmap_lvcmdq(vintf->lvcmdqs[lidx]); + } + } vintf_write_config(vintf, 0); } +/* Map a global VCMDQ to the pre-assigned LVCMDQ */ +static void tegra241_vcmdq_map_lvcmdq(struct tegra241_vcmdq *vcmdq) +{ + u32 regval = readl(REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); + + writel(regval | CMDQV_CMDQ_ALLOCATED, + REG_CMDQV(vcmdq->cmdqv, CMDQ_ALLOC(vcmdq->idx))); + dev_dbg(vcmdq->cmdqv->dev, "%smapped\n", h); +} + static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) { u32 regval; @@ -441,8 +469,10 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) */ vintf->hyp_own = !!(VINTF_HYP_OWN & readl(REG_VINTF(vintf, CONFIG))); + /* HW requires to map LVCMDQs in ascending order */ for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) { if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) { + tegra241_vcmdq_map_lvcmdq(vintf->lvcmdqs[lidx]); ret = tegra241_vcmdq_hw_init(vintf->lvcmdqs[lidx]); if (ret) { tegra241_vintf_hw_deinit(vintf); @@ -476,7 +506,6 @@ static int tegra241_cmdqv_hw_reset(struct arm_smmu_device *smmu) for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) { regval = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, idx); 
regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, lidx); - regval |= CMDQV_CMDQ_ALLOCATED; writel_relaxed(regval, REG_CMDQV(cmdqv, CMDQ_ALLOC(qidx++))); } From 3c7b2d3807d301223be32e05fe62b7e9b728975b Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:20 -0700 Subject: [PATCH 112/147] iommu/tegra241-cmdqv: Add user-space use support The CMDQV HW supports user-space use for virtualization cases. It allows the VM to issue guest-level TLBI or ATC_INV commands directly to the queue and have them executed without a VMEXIT, as HW will replace the VMID field in a TLBI command and the SID field in an ATC_INV command with the preset VMID and SID. This is built upon the vIOMMU infrastructure by allowing the VMM to allocate a VINTF (as a vIOMMU object) and assign VCMDQs (HW QUEUE objs) to the VINTF. So first, replace the standard vSMMU model with the VINTF implementation but reuse the standard cache_invalidate op (for unsupported commands) and the standard alloc_domain_nested op (for standard nested STE). Each VINTF has two 64KB MMIO pages (128B per logical VCMDQ): - Page0 (directly accessed by guest) has all the control and status bits. - Page1 (trapped by VMM) has guest-owned queue memory location/size info. The VMM should trap the guest VM's emulated VINTF0 page1 for the guest-level VCMDQ location/size info and forward that to the kernel, which translates it to a physical memory location and programs the VCMDQ HW during an allocation call. Then, it should mmap the assigned VINTF's page0 to the VINTF0 page0 of the guest VM. This allows the guest OS to read and write the guest-owned VINTF's page0 for direct control of the VCMDQ HW. For ATC invalidation commands that hold an SID, HW requires all devices to register their virtual SIDs to the SID_MATCH registers and their physical SIDs to the pairing SID_REPLACE registers, so that HW can use those as a lookup table to replace those virtual SIDs with the correct physical SIDs. Thus, implement the driver-allocated vDEVICE op with a tegra241_vintf_sid structure to allocate SID_REPLACE and to program the SIDs accordingly. This enables the HW-accelerated feature on the NVIDIA Grace CPU. Compared to the standard SMMUv3 operating in the nested translation mode trapping CMDQ for TLBI and ATC_INV commands, this gives a huge performance improvement: 70% to 90% reductions in invalidation time were measured by various DMA unmap tests running in a guest OS.
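A sketch of the SID replacement programming described above (a hypothetical helper for illustration only; the real vDEVICE op added by this patch also allocates the mapping index from the per-VINTF IDA and carries proper locking and error handling):

	/*
	 * Program one virtual->physical Stream ID mapping slot. Writing
	 * SID_REPLACE before SID_MATCH avoids a window where HW could
	 * match a virtual SID that has no replacement in place yet.
	 */
	static void vintf_program_vsid(struct tegra241_vintf *vintf, u8 idx,
				       u32 virt_sid, u32 phys_sid)
	{
		writel(phys_sid, vintf->base + TEGRA241_VINTF_SID_REPLACE(idx));
		writel(virt_sid, vintf->base + TEGRA241_VINTF_SID_MATCH(idx));
	}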
Link: https://patch.msgid.link/r/fb0eab83f529440b6aa181798912a6f0afa21eb0.1752126748.git.nicolinc@nvidia.com Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 4dc0d12474f9d4833c3dd96b73d61e406d3f5dc7 linux-next) Signed-off-by: Nirmoy Das --- .../arm/arm-smmu-v3/arm-smmu-v3-iommufd.c | 6 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 + .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 400 +++++++++++++++++- include/uapi/linux/iommufd.h | 59 +++ 4 files changed, 466 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c index 1cf9646e776f8..d9bea8f1f636d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c @@ -225,7 +225,7 @@ static int arm_smmu_validate_vste(struct iommu_hwpt_arm_smmuv3 *arg, return 0; } -static struct iommu_domain * +struct iommu_domain * arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, const struct iommu_user_data *user_data) { @@ -336,8 +336,8 @@ static int arm_vsmmu_convert_user_cmd(struct arm_vsmmu *vsmmu, return 0; } -static int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, - struct iommu_user_data_array *array) +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) { struct arm_vsmmu *vsmmu = container_of(viommu, struct arm_vsmmu, core); struct arm_smmu_device *smmu = vsmmu->smmu; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index d882a1f9db6f8..b5841e4f54385 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -1082,10 +1082,17 @@ int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, void arm_smmu_attach_commit_vmaster(struct arm_smmu_attach_state *state); void arm_smmu_master_clear_vmaster(struct arm_smmu_master *master); int arm_vmaster_report_event(struct arm_smmu_vmaster *vmaster, u64 *evt); +struct iommu_domain * +arm_vsmmu_alloc_domain_nested(struct iommufd_viommu *viommu, u32 flags, + const struct iommu_user_data *user_data); +int arm_vsmmu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array); #else #define arm_smmu_get_viommu_size NULL #define arm_smmu_hw_info NULL #define arm_vsmmu_init NULL +#define arm_vsmmu_alloc_domain_nested NULL +#define arm_vsmmu_cache_invalidate NULL static inline int arm_smmu_attach_prepare_vmaster(struct arm_smmu_attach_state *state, diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 869c90b660c11..3eeb8444fadf1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include @@ -26,8 +28,10 @@ #define CMDQV_EN BIT(0) #define TEGRA241_CMDQV_PARAM 0x0004 +#define CMDQV_NUM_SID_PER_VM_LOG2 GENMASK(15, 12) #define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8) #define CMDQV_NUM_VCMDQ_LOG2 GENMASK(7, 4) +#define CMDQV_VER GENMASK(3, 0) #define TEGRA241_CMDQV_STATUS 0x0008 #define CMDQV_ENABLED BIT(0) @@ -53,6 +57,9 @@ #define VINTF_STATUS GENMASK(3, 1) #define VINTF_ENABLED BIT(0) +#define TEGRA241_VINTF_SID_MATCH(s) (0x0040 + 0x4*(s)) +#define TEGRA241_VINTF_SID_REPLACE(s) (0x0080 + 0x4*(s)) + #define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \ (0x00C0 + 
0x8*(m)) #define LVCMDQ_ERR_MAP_NUM_64 2 @@ -114,16 +121,20 @@ MODULE_PARM_DESC(bypass_vcmdq, /** * struct tegra241_vcmdq - Virtual Command Queue + * @core: Embedded iommufd_hw_queue structure * @idx: Global index in the CMDQV * @lidx: Local index in the VINTF * @enabled: Enable status * @cmdqv: Parent CMDQV pointer * @vintf: Parent VINTF pointer + * @prev: Previous LVCMDQ to depend on * @cmdq: Command Queue struct * @page0: MMIO Page0 base address * @page1: MMIO Page1 base address */ struct tegra241_vcmdq { + struct iommufd_hw_queue core; + u16 idx; u16 lidx; @@ -131,22 +142,30 @@ struct tegra241_vcmdq { struct tegra241_cmdqv *cmdqv; struct tegra241_vintf *vintf; + struct tegra241_vcmdq *prev; struct arm_smmu_cmdq cmdq; void __iomem *page0; void __iomem *page1; }; +#define hw_queue_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core) /** * struct tegra241_vintf - Virtual Interface + * @vsmmu: Embedded arm_vsmmu structure * @idx: Global index in the CMDQV * @enabled: Enable status * @hyp_own: Owned by hypervisor (in-kernel) * @cmdqv: Parent CMDQV pointer * @lvcmdqs: List of logical VCMDQ pointers + * @lvcmdq_mutex: Lock to serialize user-allocated lvcmdqs * @base: MMIO base address + * @mmap_offset: Offset argument for mmap() syscall + * @sids: Stream ID mapping resources */ struct tegra241_vintf { + struct arm_vsmmu vsmmu; + u16 idx; bool enabled; @@ -154,19 +173,41 @@ struct tegra241_vintf { struct tegra241_cmdqv *cmdqv; struct tegra241_vcmdq **lvcmdqs; + struct mutex lvcmdq_mutex; /* user space race */ void __iomem *base; + unsigned long mmap_offset; + + struct ida sids; +}; +#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, vsmmu.core) + +/** + * struct tegra241_vintf_sid - Virtual Interface Stream ID Mapping + * @core: Embedded iommufd_vdevice structure, holding virtual Stream ID + * @vintf: Parent VINTF pointer + * @sid: Physical Stream ID + * @idx: Mapping index in the VINTF + */ +struct tegra241_vintf_sid { + struct iommufd_vdevice core; + struct tegra241_vintf *vintf; + u32 sid; + u8 idx; }; +#define vdev_to_vsid(v) container_of(v, struct tegra241_vintf_sid, core) /** * struct tegra241_cmdqv - CMDQ-V for SMMUv3 * @smmu: SMMUv3 device * @dev: CMDQV device * @base: MMIO base address + * @base_phys: MMIO physical base address, for mmap * @irq: IRQ number * @num_vintfs: Total number of VINTFs * @num_vcmdqs: Total number of VCMDQs * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF + * @num_sids_per_vintf: Total number of SID mappings per VINTF * @vintf_ids: VINTF id allocator * @vintfs: List of VINTFs */ @@ -175,12 +216,14 @@ struct tegra241_cmdqv { struct device *dev; void __iomem *base; + phys_addr_t base_phys; int irq; /* CMDQV Hardware Params */ u16 num_vintfs; u16 num_vcmdqs; u16 num_lvcmdqs_per_vintf; + u16 num_sids_per_vintf; struct ida vintf_ids; @@ -351,6 +394,29 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, /* HW Reset Functions */ +/* + * When a guest-owned VCMDQ is disabled, if the guest did not enqueue a CMD_SYNC + * following an ATC_INV command at the end of the guest queue while this ATC_INV + * is timed out, the TIMEOUT will not be reported until this VCMDQ gets assigned + * to the next VM, which will be a false alarm potentially causing some unwanted + * behavior in the new VM. Thus, a guest-owned VCMDQ must flush the TIMEOUT when + * it gets disabled. This can be done by just issuing a CMD_SYNC to SMMU CMDQ. 
+ */ +static void tegra241_vcmdq_hw_flush_timeout(struct tegra241_vcmdq *vcmdq) +{ + struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu; + u64 cmd_sync[CMDQ_ENT_DWORDS] = {}; + + cmd_sync[0] = FIELD_PREP(CMDQ_0_OP, CMDQ_OP_CMD_SYNC) | + FIELD_PREP(CMDQ_SYNC_0_CS, CMDQ_SYNC_0_CS_NONE); + + /* + * It does not hurt to insert another CMD_SYNC, taking advantage of the + * arm_smmu_cmdq_issue_cmdlist() that waits for the CMD_SYNC completion. + */ + arm_smmu_cmdq_issue_cmdlist(smmu, &smmu->cmdq, cmd_sync, 1, true); +} + /* This function is for LVCMDQ, so @vcmdq must not be unmapped yet */ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) { @@ -364,6 +430,8 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)), readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS))); } + tegra241_vcmdq_hw_flush_timeout(vcmdq); + writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, PROD)); writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, CONS)); writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, BASE)); @@ -420,6 +488,7 @@ static void tegra241_vcmdq_unmap_lvcmdq(struct tegra241_vcmdq *vcmdq) static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf) { u16 lidx = vintf->cmdqv->num_lvcmdqs_per_vintf; + int sidx; /* HW requires to unmap LVCMDQs in descending order */ while (lidx--) { @@ -429,6 +498,10 @@ static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf) } } vintf_write_config(vintf, 0); + for (sidx = 0; sidx < vintf->cmdqv->num_sids_per_vintf; sidx++) { + writel(0, REG_VINTF(vintf, SID_MATCH(sidx))); + writel(0, REG_VINTF(vintf, SID_REPLACE(sidx))); + } } /* Map a global VCMDQ to the pre-assigned LVCMDQ */ @@ -457,7 +530,8 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a * restricted set of supported commands. 
*/ - regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own); + regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) | + FIELD_PREP(VINTF_VMID, vintf->vsmmu.vmid); writel(regval, REG_VINTF(vintf, CONFIG)); ret = vintf_write_config(vintf, regval | VINTF_EN); @@ -584,7 +658,9 @@ static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) dev_dbg(vintf->cmdqv->dev, "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64)); - kfree(vcmdq); + /* Guest-owned VCMDQ is free-ed with hw_queue by iommufd core */ + if (vcmdq->vintf->hyp_own) + kfree(vcmdq); } static struct tegra241_vcmdq * @@ -671,7 +747,13 @@ static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx) dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx); tegra241_cmdqv_deinit_vintf(cmdqv, idx); - kfree(vintf); + if (!vintf->hyp_own) { + mutex_destroy(&vintf->lvcmdq_mutex); + ida_destroy(&vintf->sids); + /* Guest-owned VINTF is free-ed with viommu by iommufd core */ + } else { + kfree(vintf); + } } static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) @@ -699,10 +781,45 @@ static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) put_device(cmdqv->dev); /* smmu->impl_dev */ } +static int +tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data); + +static void *tegra241_cmdqv_hw_info(struct arm_smmu_device *smmu, u32 *length, + enum iommu_hw_info_type *type) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct iommu_hw_info_tegra241_cmdqv *info; + u32 regval; + + if (*type != IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + return ERR_PTR(-EOPNOTSUPP); + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM)); + info->log2vcmdqs = ilog2(cmdqv->num_lvcmdqs_per_vintf); + info->log2vsids = ilog2(cmdqv->num_sids_per_vintf); + info->version = FIELD_GET(CMDQV_VER, regval); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV; + return info; +} + static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = { + /* For in-kernel use */ .get_secondary_cmdq = tegra241_cmdqv_get_cmdq, .device_reset = tegra241_cmdqv_hw_reset, .device_remove = tegra241_cmdqv_remove, + /* For user-space use */ + .hw_info = tegra241_cmdqv_hw_info, + .vsmmu_size = VIOMMU_STRUCT_SIZE(struct tegra241_vintf, vsmmu.core), + .vsmmu_type = IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + .vsmmu_init = tegra241_cmdqv_init_vintf_user, }; /* Probe Functions */ @@ -844,6 +961,7 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->irq = irq; cmdqv->base = base; cmdqv->dev = smmu->impl_dev; + cmdqv->base_phys = res->start; if (cmdqv->irq > 0) { ret = request_threaded_irq(irq, NULL, tegra241_cmdqv_isr, @@ -860,6 +978,8 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval); cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval); cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs; + cmdqv->num_sids_per_vintf = + 1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval); cmdqv->vintfs = kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL); @@ -913,3 +1033,277 @@ struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu) put_device(smmu->impl_dev); return ERR_PTR(-ENODEV); } + +/* User space VINTF and VCMDQ Functions */ + +static size_t tegra241_vintf_get_vcmdq_size(struct iommufd_viommu *viommu, + enum iommu_hw_queue_type 
queue_type) +{ + if (queue_type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV) + return 0; + return HW_QUEUE_STRUCT_SIZE(struct tegra241_vcmdq, core); +} + +static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq) +{ + char header[64]; + + /* Configure the vcmdq only; User space does the enabling */ + writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE)); + + dev_dbg(vcmdq->cmdqv->dev, "%sinited at host PA 0x%llx size 0x%lx\n", + lvcmdq_error_header(vcmdq, header, 64), + vcmdq->cmdq.q.q_base & VCMDQ_ADDR, + 1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE)); + return 0; +} + +static void +tegra241_vintf_destroy_lvcmdq_user(struct iommufd_hw_queue *hw_queue) +{ + struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue); + + mutex_lock(&vcmdq->vintf->lvcmdq_mutex); + tegra241_vcmdq_hw_deinit(vcmdq); + tegra241_vcmdq_unmap_lvcmdq(vcmdq); + tegra241_vintf_free_lvcmdq(vcmdq->vintf, vcmdq->lidx); + if (vcmdq->prev) + iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core); + mutex_unlock(&vcmdq->vintf->lvcmdq_mutex); +} + +static int tegra241_vintf_alloc_lvcmdq_user(struct iommufd_hw_queue *hw_queue, + u32 lidx, phys_addr_t base_addr_pa) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(hw_queue->viommu); + struct tegra241_vcmdq *vcmdq = hw_queue_to_vcmdq(hw_queue); + struct tegra241_cmdqv *cmdqv = vintf->cmdqv; + struct arm_smmu_device *smmu = &cmdqv->smmu; + struct tegra241_vcmdq *prev = NULL; + u32 log2size, max_n_shift; + char header[64]; + int ret; + + if (hw_queue->type != IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV) + return -EOPNOTSUPP; + if (lidx >= cmdqv->num_lvcmdqs_per_vintf) + return -EINVAL; + + mutex_lock(&vintf->lvcmdq_mutex); + + if (vintf->lvcmdqs[lidx]) { + ret = -EEXIST; + goto unlock; + } + + /* + * HW requires to map LVCMDQs in ascending order, so reject if the + * previous lvcmdqs is not allocated yet. + */ + if (lidx) { + prev = vintf->lvcmdqs[lidx - 1]; + if (!prev) { + ret = -EIO; + goto unlock; + } + } + + /* + * hw_queue->length must be a power of 2, in range of + * [ 32, 2 ^ (idr[1].CMDQS + CMDQ_ENT_SZ_SHIFT) ] + */ + max_n_shift = FIELD_GET(IDR1_CMDQS, + readl_relaxed(smmu->base + ARM_SMMU_IDR1)); + if (!is_power_of_2(hw_queue->length) || hw_queue->length < 32 || + hw_queue->length > (1 << (max_n_shift + CMDQ_ENT_SZ_SHIFT))) { + ret = -EINVAL; + goto unlock; + } + log2size = ilog2(hw_queue->length) - CMDQ_ENT_SZ_SHIFT; + + /* base_addr_pa must be aligned to hw_queue->length */ + if (base_addr_pa & ~VCMDQ_ADDR || + base_addr_pa & (hw_queue->length - 1)) { + ret = -EINVAL; + goto unlock; + } + + /* + * HW requires to unmap LVCMDQs in descending order, so destroy() must + * follow this rule. Set a dependency on its previous LVCMDQ so iommufd + * core will help enforce it. 
+ */ + if (prev) { + ret = iommufd_hw_queue_depend(vcmdq, prev, core); + if (ret) + goto unlock; + } + vcmdq->prev = prev; + + ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq); + if (ret) + goto undepend_vcmdq; + + dev_dbg(cmdqv->dev, "%sallocated\n", + lvcmdq_error_header(vcmdq, header, 64)); + + tegra241_vcmdq_map_lvcmdq(vcmdq); + + vcmdq->cmdq.q.q_base = base_addr_pa & VCMDQ_ADDR; + vcmdq->cmdq.q.q_base |= log2size; + + ret = tegra241_vcmdq_hw_init_user(vcmdq); + if (ret) + goto unmap_lvcmdq; + + hw_queue->destroy = &tegra241_vintf_destroy_lvcmdq_user; + mutex_unlock(&vintf->lvcmdq_mutex); + return 0; + +unmap_lvcmdq: + tegra241_vcmdq_unmap_lvcmdq(vcmdq); + tegra241_vintf_deinit_lvcmdq(vintf, lidx); +undepend_vcmdq: + if (vcmdq->prev) + iommufd_hw_queue_undepend(vcmdq, vcmdq->prev, core); +unlock: + mutex_unlock(&vintf->lvcmdq_mutex); + return ret; +} + +static void tegra241_cmdqv_destroy_vintf_user(struct iommufd_viommu *viommu) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(viommu); + + if (vintf->mmap_offset) + iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, + vintf->mmap_offset); + tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx); +} + +static void tegra241_vintf_destroy_vsid(struct iommufd_vdevice *vdev) +{ + struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev); + struct tegra241_vintf *vintf = vsid->vintf; + + writel(0, REG_VINTF(vintf, SID_MATCH(vsid->idx))); + writel(0, REG_VINTF(vintf, SID_REPLACE(vsid->idx))); + ida_free(&vintf->sids, vsid->idx); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: deallocated SID_REPLACE%d for pSID=%x\n", vintf->idx, + vsid->idx, vsid->sid); +} + +static int tegra241_vintf_init_vsid(struct iommufd_vdevice *vdev) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(vdev->dev); + struct tegra241_vintf *vintf = viommu_to_vintf(vdev->viommu); + struct tegra241_vintf_sid *vsid = vdev_to_vsid(vdev); + struct arm_smmu_stream *stream = &master->streams[0]; + u64 virt_sid = vdev->virt_id; + int sidx; + + if (virt_sid > UINT_MAX) + return -EINVAL; + + WARN_ON_ONCE(master->num_streams != 1); + + /* Find an empty pair of SID_REPLACE and SID_MATCH */ + sidx = ida_alloc_max(&vintf->sids, vintf->cmdqv->num_sids_per_vintf - 1, + GFP_KERNEL); + if (sidx < 0) + return sidx; + + writel(stream->id, REG_VINTF(vintf, SID_REPLACE(sidx))); + writel(virt_sid << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(sidx))); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: allocated SID_REPLACE%d for pSID=%x, vSID=%x\n", + vintf->idx, sidx, stream->id, (u32)virt_sid); + + vsid->idx = sidx; + vsid->vintf = vintf; + vsid->sid = stream->id; + + vdev->destroy = &tegra241_vintf_destroy_vsid; + return 0; +} + +static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = { + .destroy = tegra241_cmdqv_destroy_vintf_user, + .alloc_domain_nested = arm_vsmmu_alloc_domain_nested, + /* Non-accelerated commands will be still handled by the kernel */ + .cache_invalidate = arm_vsmmu_cache_invalidate, + .vdevice_size = VDEVICE_STRUCT_SIZE(struct tegra241_vintf_sid, core), + .vdevice_init = tegra241_vintf_init_vsid, + .get_hw_queue_size = tegra241_vintf_get_vcmdq_size, + .hw_queue_init_phys = tegra241_vintf_alloc_lvcmdq_user, +}; + +static int +tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, + const struct iommu_user_data *user_data) +{ + struct tegra241_cmdqv *cmdqv = + container_of(vsmmu->smmu, struct tegra241_cmdqv, smmu); + struct tegra241_vintf *vintf = viommu_to_vintf(&vsmmu->core); + struct iommu_viommu_tegra241_cmdqv data; + phys_addr_t page0_base; + int ret; + + if (!user_data) + return 
-EINVAL; + + ret = iommu_copy_struct_from_user(&data, user_data, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + out_vintf_mmap_length); + if (ret) + return ret; + + ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf); + if (ret < 0) { + dev_err(cmdqv->dev, "no more available vintf\n"); + return ret; + } + + /* + * Initialize the user-owned VINTF without a LVCMDQ, as it cannot pre- + * allocate a LVCMDQ until user space wants one, for security reasons. + * It is different than the kernel-owned VINTF0, which had pre-assigned + * and pre-allocated global VCMDQs that would be mapped to the LVCMDQs + * by the tegra241_vintf_hw_init() call. + */ + ret = tegra241_vintf_hw_init(vintf, false); + if (ret) + goto deinit_vintf; + + page0_base = cmdqv->base_phys + TEGRA241_VINTFi_PAGE0(vintf->idx); + ret = iommufd_viommu_alloc_mmap(&vintf->vsmmu.core, page0_base, SZ_64K, + &vintf->mmap_offset); + if (ret) + goto hw_deinit_vintf; + + data.out_vintf_mmap_length = SZ_64K; + data.out_vintf_mmap_offset = vintf->mmap_offset; + ret = iommu_copy_struct_to_user(user_data, &data, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV, + out_vintf_mmap_length); + if (ret) + goto free_mmap; + + ida_init(&vintf->sids); + mutex_init(&vintf->lvcmdq_mutex); + + dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", vintf->idx, + vintf->vsmmu.vmid); + + vsmmu->core.ops = &tegra241_cmdqv_viommu_ops; + return 0; + +free_mmap: + iommufd_viommu_destroy_mmap(&vintf->vsmmu.core, vintf->mmap_offset); +hw_deinit_vintf: + tegra241_vintf_hw_deinit(vintf); +deinit_vintf: + tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); + return ret; +} diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 2279af83451f2..1b123c96a7bd3 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -591,6 +591,28 @@ struct iommu_hw_info_arm_smmuv3 { __u32 aidr; }; +/** + * struct iommu_hw_info_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Hardware + * Information (IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV) + * + * @flags: Must be 0 + * @version: Version number for the CMDQ-V HW for PARAM bits[03:00] + * @log2vcmdqs: Log2 of the total number of VCMDQs for PARAM bits[07:04] + * @log2vsids: Log2 of the total number of SID replacements for PARAM bits[15:12] + * @__reserved: Must be 0 + * + * VMM can use these fields directly in its emulated global PARAM register. Note + * that only one Virtual Interface (VINTF) should be exposed to a VM, i.e. PARAM + * bits[11:08] should be set to 0 for log2 of the total number of VINTFs. 
+ */ +struct iommu_hw_info_tegra241_cmdqv { + __u32 flags; + __u8 version; + __u8 log2vcmdqs; + __u8 log2vsids; + __u8 __reserved; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Output by the drivers that do not report hardware @@ -598,12 +620,15 @@ struct iommu_hw_info_arm_smmuv3 { * @IOMMU_HW_INFO_TYPE_DEFAULT: Input to request for a default type * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type + * @IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, IOMMU_HW_INFO_TYPE_DEFAULT = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, + IOMMU_HW_INFO_TYPE_TEGRA241_CMDQV = 3, }; /** @@ -972,10 +997,29 @@ struct iommu_fault_alloc { * enum iommu_viommu_type - Virtual IOMMU Type * @IOMMU_VIOMMU_TYPE_DEFAULT: Reserved for future use * @IOMMU_VIOMMU_TYPE_ARM_SMMUV3: ARM SMMUv3 driver specific type + * @IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) enabled ARM SMMUv3 type */ enum iommu_viommu_type { IOMMU_VIOMMU_TYPE_DEFAULT = 0, IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2, +}; + +/** + * struct iommu_viommu_tegra241_cmdqv - NVIDIA Tegra241 CMDQV Virtual Interface + * (IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV) + * @out_vintf_mmap_offset: mmap offset argument for VINTF's page0 + * @out_vintf_mmap_length: mmap length argument for VINTF's page0 + * + * Both @out_vintf_mmap_offset and @out_vintf_mmap_length are reported by kernel + * for user space to mmap the VINTF page0 from the host physical address space + * to the guest physical address space so that a guest kernel can directly R/W + * access to the VINTF page0 in order to control its virtual command queues. + */ +struct iommu_viommu_tegra241_cmdqv { + __aligned_u64 out_vintf_mmap_offset; + __aligned_u64 out_vintf_mmap_length; }; /** @@ -1165,9 +1209,24 @@ struct iommu_veventq_alloc { /** * enum iommu_hw_queue_type - HW Queue Type * @IOMMU_HW_QUEUE_TYPE_DEFAULT: Reserved for future use + * @IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV (extension for ARM + * SMMUv3) Virtual Command Queue (VCMDQ) */ enum iommu_hw_queue_type { IOMMU_HW_QUEUE_TYPE_DEFAULT = 0, + /* + * TEGRA241_CMDQV requirements (otherwise, allocation will fail) + * - alloc starts from the lowest @index=0 in ascending order + * - destroy starts from the last allocated @index in descending order + * - @base_addr must be aligned to @length in bytes and mapped in IOAS + * - @length must be a power of 2, with a minimum 32 bytes and a maximum + * 2 ^ idr[1].CMDQS * 16 bytes (use GET_HW_INFO call to read idr[1] + * from struct iommu_hw_info_arm_smmuv3) + * - suggest to back the queue memory with contiguous physical pages or + * a single huge page with alignment of the queue size, and limit the + * emulated vSMMU's IDR1.CMDQS to log2(huge page size / 16 bytes) + */ + IOMMU_HW_QUEUE_TYPE_TEGRA241_CMDQV = 1, }; /** From 87f28b4236881b9defd07e1187ad8582d436832c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 9 Jul 2025 22:59:21 -0700 Subject: [PATCH 113/147] iommu/tegra241-cmdqv: Add IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV support Add a new vEVENTQ type for VINTFs that are assigned to the user space. Simply report the two 64-bit LVCMDQ_ERR_MAPs register values. 
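As a usage sketch (not part of this patch), user space draining the vEVENTQ could decode the reported payload as below; the helper name first_failed_lvcmdq() is hypothetical and assumes the struct iommu_vevent_tegra241_cmdqv layout added further down in this patch:

  #include <endian.h>
  #include <stdint.h>
  #include <linux/iommufd.h>

  /* Return the lowest LVCMDQ index flagged in the reported 128-bit
   * error map, or -1 if no error bit is set. */
  static int first_failed_lvcmdq(const struct iommu_vevent_tegra241_cmdqv *evt)
  {
          for (int i = 0; i < 2; i++) {
                  uint64_t map = le64toh(evt->lvcmdq_err_map[i]);

                  if (map)
                          return i * 64 + __builtin_ctzll(map);
          }
          return -1;
  }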
Link: https://patch.msgid.link/r/68161a980da41fa5022841209638aeff258557b5.1752126748.git.nicolinc@nvidia.com Reviewed-by: Alok Tiwari Reviewed-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 32b2d3a57e26804ca96d82a222667ac0fa226cb7 linux-next) Signed-off-by: Nirmoy Das --- .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 22 +++++++++++++++++++ include/uapi/linux/iommufd.h | 15 +++++++++++++ 2 files changed, 37 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 3eeb8444fadf1..d5d43a1c77082 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -295,6 +295,20 @@ static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval) /* ISR Functions */ +static void tegra241_vintf_user_handle_error(struct tegra241_vintf *vintf) +{ + struct iommufd_viommu *viommu = &vintf->vsmmu.core; + struct iommu_vevent_tegra241_cmdqv vevent_data; + int i; + + for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) + vevent_data.lvcmdq_err_map[i] = + readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i))); + + iommufd_viommu_report_event(viommu, IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV, + &vevent_data, sizeof(vevent_data)); +} + static void tegra241_vintf0_handle_error(struct tegra241_vintf *vintf) { int i; @@ -340,6 +354,14 @@ static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid) vintf_map &= ~BIT_ULL(0); } + /* Handle other user VINTFs and their LVCMDQs */ + while (vintf_map) { + unsigned long idx = __ffs64(vintf_map); + + tegra241_vintf_user_handle_error(cmdqv->vintfs[idx]); + vintf_map &= ~BIT_ULL(idx); + } + return IRQ_HANDLED; } diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 1b123c96a7bd3..83204a5d5b95d 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1141,10 +1141,12 @@ struct iommufd_vevent_header { * enum iommu_veventq_type - Virtual Event Queue Type * @IOMMU_VEVENTQ_TYPE_DEFAULT: Reserved for future use * @IOMMU_VEVENTQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event Queue + * @IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension IRQ */ enum iommu_veventq_type { IOMMU_VEVENTQ_TYPE_DEFAULT = 0, IOMMU_VEVENTQ_TYPE_ARM_SMMUV3 = 1, + IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV = 2, }; /** @@ -1168,6 +1170,19 @@ struct iommu_vevent_arm_smmuv3 { __aligned_le64 evt[4]; }; +/** + * struct iommu_vevent_tegra241_cmdqv - Tegra241 CMDQV IRQ + * (IOMMU_VEVENTQ_TYPE_TEGRA241_CMDQV) + * @lvcmdq_err_map: 128-bit logical vcmdq error map, little-endian. + * (Refer to register LVCMDQ_ERR_MAPs per VINTF ) + * + * The 128-bit register value from HW exclusively reflect the error bits for a + * Virtual Interface represented by a vIOMMU object. Read and report directly. + */ +struct iommu_vevent_tegra241_cmdqv { + __aligned_le64 lvcmdq_err_map[2]; +}; + /** * struct iommu_veventq_alloc - ioctl(IOMMU_VEVENTQ_ALLOC) * @size: sizeof(struct iommu_veventq_alloc) From 311f0faae37109dec38bee15f6455c607bf080ae Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 10 Jul 2025 13:23:54 -0700 Subject: [PATCH 114/147] iommufd: Do not allow _iommufd_object_alloc_ucmd if abort op is set An abort op was introduced to allow its caller to invoke it within a lock in the caller's function. On the other hand, _iommufd_object_alloc_ucmd() would invoke the abort op in iommufd_object_abort_and_destroy() that must be outside the caller's lock. 
So, these two cannot work together. Add a validation in the _iommufd_object_alloc_ucmd(). Pick -EOPNOTSUPP to reject the function call, indicating that the object allocator is buggy. Link: https://patch.msgid.link/r/20250710202354.1658511-1-nicolinc@nvidia.com Suggested-by: Xu Yilun Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Reviewed-by: Xu Yilun Signed-off-by: Jason Gunthorpe (cherry picked from commit 5510bd89da24508f0e9ae04396e7eb6929ec0e18 linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/iommufd/main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 0fb81a905cb13..69c2195e77cad 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -71,6 +71,15 @@ struct iommufd_object *_iommufd_object_alloc_ucmd(struct iommufd_ucmd *ucmd, if (WARN_ON(ucmd->new_obj)) return ERR_PTR(-EBUSY); + /* + * An abort op means that its caller needs to invoke it within a lock in + * the caller. So it doesn't work with _iommufd_object_alloc_ucmd() that + * will invoke the abort op in iommufd_object_abort_and_destroy(), which + * must be outside the caller's lock. + */ + if (WARN_ON(iommufd_object_ops[type].abort)) + return ERR_PTR(-EOPNOTSUPP); + new_obj = _iommufd_object_alloc(ucmd->ictx, size, type); if (IS_ERR(new_obj)) return new_obj; From 567a11034ce93666782b8ea47e4f7ed76c7ae4ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 14 Jul 2025 22:57:38 +0200 Subject: [PATCH 115/147] iommu/tegra241-cmdqv: import IOMMUFD module namespace The tegra variant of smmu-v3 now uses the iommufd mmap interface but is missing the corresponding import: ERROR: modpost: module arm_smmu_v3 uses symbol _iommufd_object_depend from namespace IOMMUFD, but does not import it. ERROR: modpost: module arm_smmu_v3 uses symbol iommufd_viommu_report_event from namespace IOMMUFD, but does not import it. ERROR: modpost: module arm_smmu_v3 uses symbol _iommufd_destroy_mmap from namespace IOMMUFD, but does not import it. ERROR: modpost: module arm_smmu_v3 uses symbol _iommufd_object_undepend from namespace IOMMUFD, but does not import it. ERROR: modpost: module arm_smmu_v3 uses symbol _iommufd_alloc_mmap from namespace IOMMUFD, but does not import it. Fixes: b135de24cfc0 ("iommu/tegra241-cmdqv: Add user-space use support") Link: https://patch.msgid.link/r/20250714205747.3475772-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe (cherry picked from commit 601b1d0d9395c711383452bd0d47037afbbb4bcf linux-next) Signed-off-by: Nirmoy Das --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index d5d43a1c77082..eb90af5093d89 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -1329,3 +1329,5 @@ tegra241_cmdqv_init_vintf_user(struct arm_vsmmu *vsmmu, tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); return ret; } + +MODULE_IMPORT_NS("IOMMUFD"); From 00c8161f66160bb3c2b61faf6b39d5957b086c87 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Mon, 14 Jul 2025 06:05:31 -0700 Subject: [PATCH 116/147] NVIDIA: SAUCE: mm: handle poisoning of pfn without struct pages The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. If a memory region is mapped using remap_pfn_range(), but not added to the kernel, MM will not have associated struct pages. 
Add a new mechanism to handle memory failure on such memory. Make the kernel MM expose a function to allow modules managing the device memory to register a failure function and the physical address space associated with the device memory. MM maintains this information in an interval tree. The registered memory failure function is used by MM to notify the kernel module managing the PFN, so that the module may take any required action. The module, for example, may use the information to track the poisoned pages. In this implementation, the kernel MM follows a sequence (mostly) similar to the memory_failure() handler for struct page backed memory: 1. memory_failure() is triggered on reception of a poison error. An absence of struct page is detected and consequently memory_failure_pfn() is executed. 2. memory_failure_pfn() calls the newly introduced failure handler exposed by the module managing the poisoned memory to notify it of the problematic PFN. 3. memory_failure_pfn() unmaps the stage-2 mapping to the PFN. 4. memory_failure_pfn() collects the processes mapped to the PFN. 5. memory_failure_pfn() sends SIGBUS (BUS_MCEERR_AO) to all the processes mapping the faulty PFN using kill_procs(). 6. An access to the faulty PFN by an operation in the VM at a later point is trapped and user_mem_abort() is called. 7. The vma ops fault function gets called due to the absence of a stage-2 mapping. It is expected to return VM_FAULT_HWPOISON on the PFN. 8. __gfn_to_pfn_memslot() then returns KVM_PFN_ERR_HWPOISON, which causes the poison with SIGBUS (BUS_MCEERR_AR) to be sent to the QEMU process through kvm_send_hwpoison_signal(). Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (backported from commit f037dd72786db3c8b8a4f635dd22c6023e2371a4 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) (koba: Add a pgoff parameter to __add_to_kill) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R.
Ochs (backported from commit 4bb248afa819b6e4b31abb41b9f95c19a4c4757a https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next) [Nirmoy: s/folio_shift(page_folio(p))/page_shift(compound_head(p)), add missing arg in page_address_in_vma()] Signed-off-by: Nirmoy Das --- include/linux/memory-failure.h | 22 +++++ include/linux/mm.h | 1 + include/ras/ras_event.h | 1 + mm/Kconfig | 1 + mm/memory-failure.c | 149 ++++++++++++++++++++++++++++----- 5 files changed, 153 insertions(+), 21 deletions(-) create mode 100644 include/linux/memory-failure.h diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 0000000000000..9a579960972aa --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space; + +struct pfn_address_space_ops { + void (*failure)(struct pfn_address_space *pfn_space, unsigned long pfn); +}; + +struct pfn_address_space { + struct interval_tree_node node; + const struct pfn_address_space_ops *ops; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 72dbec7ed901e..318d33b54949c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4033,6 +4033,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index e5f7ee0864e78..bb1127f3e5b7e 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -373,6 +373,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* diff --git a/mm/Kconfig b/mm/Kconfig index 1b501db064172..8b17b8533cc1c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -810,6 +810,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select MEMORY_ISOLATION + select INTERVAL_TREE select RAS help Enables code to recover from some memory failures on systems diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b04fb434b6cf1..9bb055277466e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -60,6 +61,7 @@ #include #include #include +#include #include "swap.h" #include "internal.h" #include "ras/ras_event.h" @@ -154,6 +156,10 @@ static const struct ctl_table memory_failure_table[] = { } }; +static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED; + +static DEFINE_MUTEX(pfn_space_lock); + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -441,13 +447,22 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * not much we can do. We just print a message and ignore otherwise. */ +#define FSDAX_INVALID_PGOFF ULONG_MAX + /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * + * Notice: @pgoff is used when: + * a. @p is a fsdax page and a filesystem with a memory failure handler + * has claimed the memory_failure event. + * b. 
pgoff is not backed by struct page. + * In all other cases, page->index and page->mapping are sufficient + * for mapping the page back to its corresponding user virtual address. */ static void __add_to_kill(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, - unsigned long addr) + unsigned long ksm_addr, pgoff_t pgoff) { struct to_kill *tk; @@ -457,11 +472,20 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p, return; } - tk->addr = addr; - if (is_zone_device_page(p)) - tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); - else - tk->size_shift = folio_shift(page_folio(p)); + /* Check for pgoff not backed by struct page */ + if (!pfn_valid(pgoff) && (vma->vm_flags & VM_PFNMAP)) { + tk->addr = vma_address(vma, pgoff, 1); + tk->size_shift = PAGE_SHIFT; + } else { + tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(page_folio(p), p, vma); + if (is_zone_device_page(p)) { + if (pgoff != FSDAX_INVALID_PGOFF) + tk->addr = vma_address(vma, pgoff, 1); + tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); + } else { + tk->size_shift = folio_shift(page_folio(p)); + } + } /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -474,8 +498,8 @@ static void __add_to_kill(struct task_struct *tsk, const struct page *p, * has a mapping for the page. */ if (tk->addr == -EFAULT) { - pr_info("Unable to find user space address %lx in %s\n", - page_to_pfn(p), tsk->comm); + pr_info("Unable to find address %lx in %s\n", + pfn_valid(pgoff) ? page_to_pfn(p) : pgoff, tsk->comm); } else if (tk->size_shift == 0) { kfree(tk); return; @@ -492,7 +516,7 @@ static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p, { if (addr == -EFAULT) return; - __add_to_kill(tsk, p, vma, to_kill, addr); + __add_to_kill(tsk, p, vma, to_kill, addr, FSDAX_INVALID_PGOFF); } #ifdef CONFIG_KSM @@ -514,7 +538,7 @@ void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, unsigned long addr) { if (!task_in_to_kill_list(to_kill, tsk)) - __add_to_kill(tsk, p, vma, to_kill, addr); + __add_to_kill(tsk, p, vma, to_kill, addr, FSDAX_INVALID_PGOFF); } #endif /* @@ -681,21 +705,21 @@ static void collect_procs_file(const struct folio *folio, i_mmap_unlock_read(mapping); } -#ifdef CONFIG_FS_DAX -static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p, +static void add_to_kill_pgoff(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { unsigned long addr = vma_address(vma, pgoff, 1); - __add_to_kill(tsk, p, vma, to_kill, addr); + __add_to_kill(tsk, p, vma, to_kill, addr, pgoff); } /* - * Collect processes when the error hit a fsdax page. + * Collect processes when the error hits a fsdax page or a PFN not backed by + * struct page.
*/ -static void collect_procs_fsdax(const struct page *page, - struct address_space *mapping, pgoff_t pgoff, - struct list_head *to_kill, bool pre_remove) +static void collect_procs_pgoff(const struct page *page, + struct address_space *mapping, pgoff_t pgoff, + struct list_head *to_kill, bool pre_remove) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -716,13 +740,12 @@ static void collect_procs_fsdax(const struct page *page, continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { if (vma->vm_mm == t->mm) - add_to_kill_fsdax(t, page, vma, to_kill, pgoff); + add_to_kill_pgoff(t, page, vma, to_kill, pgoff); } } rcu_read_unlock(); i_mmap_unlock_read(mapping); } -#endif /* CONFIG_FS_DAX */ /* * Collect the processes who have the corrupted page mapped to kill. @@ -943,6 +966,7 @@ static const char * const action_page_types[] = { [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_ALREADY_POISONED] = "already poisoned", + [MF_MSG_PFN_MAP] = "non struct page pfn", [MF_MSG_UNKNOWN] = "unknown page", }; @@ -1337,7 +1361,8 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, num_poisoned_pages_inc(pfn); - update_per_node_mf_stats(pfn, result); + if (type != MF_MSG_PFN_MAP) + update_per_node_mf_stats(pfn, result); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -1846,7 +1871,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, * The pre_remove case is revoking access, the memory is still * good and could theoretically be put back into service. */ - collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove); + collect_procs_pgoff(page, mapping, index, &to_kill, pre_remove); unmap_and_kill(&to_kill, page_to_pfn(page), mapping, index, mf_flags); unlock: @@ -2198,6 +2223,83 @@ static void kill_procs_now(struct page *p, unsigned long pfn, int flags, kill_procs(&tokill, true, pfn, flags); } +int register_pfn_address_space(struct pfn_address_space *pfn_space) +{ + if (!pfn_space) + return -EINVAL; + + if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT, + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, "")) + return -EBUSY; + + mutex_lock(&pfn_space_lock); + interval_tree_insert(&pfn_space->node, &pfn_space_itree); + mutex_unlock(&pfn_space_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_pfn_address_space); + +void unregister_pfn_address_space(struct pfn_address_space *pfn_space) +{ + if (!pfn_space) + return; + + mutex_lock(&pfn_space_lock); + interval_tree_remove(&pfn_space->node, &pfn_space_itree); + mutex_unlock(&pfn_space_lock); + release_mem_region(pfn_space->node.start << PAGE_SHIFT, + (pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(unregister_pfn_address_space); + +static int memory_failure_pfn(unsigned long pfn, int flags) +{ + struct interval_tree_node *node; + int res = MF_FAILED; + LIST_HEAD(tokill); + + mutex_lock(&pfn_space_lock); + /* + * Modules registers with MM the address space mapping to the device memory they + * manage. Iterate to identify exactly which address space has mapped to this + * failing PFN. + */ + for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node; + node = interval_tree_iter_next(node, pfn, pfn)) { + struct pfn_address_space *pfn_space = + container_of(node, struct pfn_address_space, node); + /* + * Modules managing the device memory need to be conveyed about the + * memory failure so that the poisoned PFN can be tracked. 
+ */ + if (pfn_space->ops) + pfn_space->ops->failure(pfn_space, pfn); + + collect_procs_pgoff(NULL, pfn_space->mapping, pfn, &tokill, false); + + unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT, + PAGE_SIZE, 0); + + res = MF_RECOVERED; + } + mutex_unlock(&pfn_space_lock); + + if (res == MF_FAILED) + return action_result(pfn, MF_MSG_PFN_MAP, res); + + /* + * Unlike System-RAM there is no possibility to swap in a different + * physical page at a given virtual address, so all userspace + * consumption of direct PFN memory necessitates SIGBUS (i.e. + * MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + kill_procs(&tokill, true, pfn, flags); + + return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED); +} + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -2237,6 +2339,11 @@ int memory_failure(unsigned long pfn, int flags) if (!(flags & MF_SW_SIMULATED)) hw_memory_failure = true; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + p = pfn_to_online_page(pfn); if (!p) { res = arch_memory_failure(pfn, flags); From a538e6fcb666fa58b1890423edfe5590d7b452de Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 6 Dec 2023 04:32:33 +0000 Subject: [PATCH 117/147] NVIDIA: SAUCE: mm: Add poison error check in fixup_user_fault() for mapped pfn The fixup_user_fault() currently does not expect a VM_FAULT_HWPOISON and hence does not check for it while calling vm_fault_to_errno(). Since we now have a new code path which can trigger such case, change fixup_user_fault to look for VM_FAULT_HWPOISON. Also make hva_to_pfn_remapped check for -EHWPOISON and communicate the poison fault up to the user_mem_abort(). Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit 3e895d53eee823f047cc1e9a38bca941d2d132dd https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. 
Ochs (backported from commit 2c0d6ccea80b57850a8a2470cae46bdf667f6e47 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next) [Nirmoy: fix few offset shifts, adopt to b176f4b41775] Signed-off-by: Nirmoy Das --- mm/gup.c | 2 +- virt/kvm/kvm_main.c | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 4ededc1133583..c2a6619612d8d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1667,7 +1667,7 @@ int fixup_user_fault(struct mm_struct *mm, } if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, 0); + int err = vm_fault_to_errno(ret, FOLL_HWPOISON); if (err) return err; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ba0327e2d0d33..bef00c86cc86f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2933,8 +2933,11 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp) r = hva_to_pfn_remapped(vma, kfp, &pfn); if (r == -EAGAIN) goto retry; - if (r < 0) + if (r < 0) { pfn = KVM_PFN_ERR_FAULT; + if (r == -EHWPOISON) + pfn = KVM_PFN_ERR_HWPOISON; + } } else { if ((kfp->flags & FOLL_NOWAIT) && vma_is_valid(vma, kfp->flags & FOLL_WRITE)) From 489bb413a025c13663700aa1a1178aa1ad5cd7cd Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 6 Dec 2023 04:33:27 +0000 Subject: [PATCH 118/147] NVIDIA: SAUCE: mm: Change ghes code to allow poison of non-struct pfn The GHES code allows calling of memory_failure() on the PFNs that pass the pfn_valid() check. This contract is broken for the remapped PFNs which fails the check and ghes_do_memory_failure() returns without triggering memory_failure(). Update code to allow memory_failure() call on PFNs failing pfn_valid(). Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit cbcf5ec9c4509cc30bb013ecaf4fa7191ec2b11f https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. Ochs (cherry picked from commit 7e95d6c5a4eb2583c171c1adfc340f229c571821 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next) Signed-off-by: Nirmoy Das --- drivers/acpi/apei/ghes.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index b727724946556..3a0c68fdc05e3 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -486,20 +486,10 @@ static void ghes_kick_task_work(struct callback_head *head) static bool ghes_do_memory_failure(u64 physical_addr, int flags) { - unsigned long pfn; - if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE)) return false; - pfn = PHYS_PFN(physical_addr); - if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) { - pr_warn_ratelimited(FW_WARN GHES_PFX - "Invalid address in generic error data: %#llx\n", - physical_addr); - return false; - } - - memory_failure_queue(pfn, flags); + memory_failure_queue(PHYS_PFN(physical_addr), flags); return true; } From 374b4eb53ee73544eb843558d2f7ce9d507ce67b Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 25 Feb 2024 12:35:06 +0000 Subject: [PATCH 119/147] NVIDIA: SAUCE: vfio/nvgrace-gpu: register device memory for poison handling The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA (Qemu) using remap_pfn_range() without adding the memory to the kernel. The device memory pages are not backed by struct page. 
Patches 1-3 implement the mechanism to handle ECC/poison on a memory page without struct page and expose a registration function. This new mechanism is leveraged here. The module registers its memory region with the kernel MM for ECC handling using the register_pfn_address_space() registration API exposed by the kernel. It also defines a failure callback function pfn_memory_failure() to get the poisoned PFN from the MM. The module tracks poisoned PFNs using a hashtable. The PFN is communicated by the kernel MM to the module through the failure function, which pushes the appropriate memory offset to the hashtable. The module also defines VMA fault ops. It returns VM_FAULT_HWPOISON in case the memory offset is found in the hashtable. [1] https://lore.kernel.org/all/20231114081611.30550-1-ankita@nvidia.com/ Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit 2fae9af2dad88e486e0e237b2929d53680476691 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. Ochs (cherry picked from commit d9c50d246b9f942371dfcfa7da530a95b14d6f78 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next) Signed-off-by: Nirmoy Das --- drivers/vfio/pci/nvgrace-gpu/main.c | 147 +++++++++++++++++++++++++++- drivers/vfio/vfio_main.c | 3 +- 2 files changed, 148 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index e5ac39c4cc6b6..3d4ec6b308062 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,18 @@ #include #include #include +#include + +#ifdef CONFIG_MEMORY_FAILURE +#include +#include +#include +#endif + +struct h_node { + unsigned long mem_offset; + struct hlist_node node; +}; /* * The device memory usable to the workloads running in the VM is cached @@ -47,6 +59,10 @@ struct mem_region { void *memaddr; void __iomem *ioaddr; }; /* Base virtual address of the region */ +#ifdef CONFIG_MEMORY_FAILURE + struct pfn_address_space pfn_address_space; + DECLARE_HASHTABLE(htbl, 8); +#endif }; struct nvgrace_gpu_pci_core_device { @@ -60,6 +76,97 @@ struct nvgrace_gpu_pci_core_device { bool has_mig_hw_bug; }; +#ifdef CONFIG_MEMORY_FAILURE +static void +nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space, + unsigned long pfn) +{ + struct mem_region *region = container_of(pfn_space, + struct mem_region, pfn_address_space); + unsigned long mem_offset = pfn - pfn_space->node.start; + struct h_node *ecc; + + if (mem_offset >= (region->memlength >> PAGE_SHIFT)) + return; + + /* + * MM has called to notify a poisoned page. Track that in the hashtable.
+ */ + ecc = (struct h_node *)(vzalloc(sizeof(struct h_node))); + ecc->mem_offset = mem_offset; + hash_add(region->htbl, &ecc->node, ecc->mem_offset); +} + +struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = { + .failure = nvgrace_gpu_vfio_pci_pfn_memory_failure, +}; + +static int +nvgrace_gpu_vfio_pci_register_pfn_range(struct mem_region *region, + struct vm_area_struct *vma) +{ + unsigned long nr_pages; + int ret = 0; + + nr_pages = region->memlength >> PAGE_SHIFT; + + region->pfn_address_space.node.start = vma->vm_pgoff; + region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1; + region->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops; + region->pfn_address_space.mapping = vma->vm_file->f_mapping; + + ret = register_pfn_address_space(®ion->pfn_address_space); + + return ret; +} + +extern struct vfio_device *vfio_device_from_file(struct file *file); + +static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf) +{ + unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff; + struct vfio_device *core_vdev; + struct nvgrace_gpu_pci_core_device *nvdev; + struct h_node *cur; + + if (!(vmf->vma->vm_file)) + goto error_exit; + + core_vdev = vfio_device_from_file(vmf->vma->vm_file); + + if (!core_vdev) + goto error_exit; + + nvdev = container_of(core_vdev, + struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + /* + * Check if the page is poisoned. + */ + if (mem_offset < (nvdev->resmem.memlength >> PAGE_SHIFT)) { + hash_for_each_possible(nvdev->resmem.htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + + if (mem_offset < (nvdev->usemem.memlength >> PAGE_SHIFT)) { + hash_for_each_possible(nvdev->usemem.htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + +error_exit: + return VM_FAULT_ERROR; +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +}; +#endif + static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) { struct nvgrace_gpu_pci_core_device *nvdev = @@ -127,6 +234,10 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->remap_lock); +#ifdef CONFIG_MEMORY_FAILURE + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); +#endif vfio_pci_core_close_device(core_vdev); } @@ -202,7 +313,16 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_pgoff = start_pfn; - return 0; +#ifdef CONFIG_MEMORY_FAILURE + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + + if (index == VFIO_PCI_BAR2_REGION_INDEX) + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma); + else + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma); +#endif + + return ret; } static long @@ -969,6 +1089,13 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, if (ret) goto out_put_vdev; +#ifdef CONFIG_MEMORY_FAILURE + /* + * Initialize the hashtable tracking the poisoned pages. 
+ */ + hash_init(nvdev->resmem.htbl); + hash_init(nvdev->usemem.htbl); +#endif return ret; out_put_vdev: @@ -980,6 +1107,24 @@ static void nvgrace_gpu_remove(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); +#ifdef CONFIG_MEMORY_FAILURE + struct h_node *cur; + unsigned long bkt; + struct hlist_node *tmp_node; + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + hash_for_each_safe(nvdev->resmem.htbl, bkt, tmp_node, cur, node) { + hash_del(&cur->node); + vfree(cur); + } + + hash_for_each_safe(nvdev->usemem.htbl, bkt, tmp_node, cur, node) { + hash_del(&cur->node); + vfree(cur); + } +#endif + vfio_pci_core_unregister_device(core_device); vfio_put_device(&core_device->vdev); } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 1fd261efc582d..a9081ffd7a8ca 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1365,7 +1365,7 @@ const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -static struct vfio_device *vfio_device_from_file(struct file *file) +struct vfio_device *vfio_device_from_file(struct file *file) { struct vfio_device_file *df = file->private_data; @@ -1373,6 +1373,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file) return NULL; return df->device; } +EXPORT_SYMBOL_GPL(vfio_device_from_file); /** * vfio_file_is_valid - True if the file is valid vfio file From e9da60b76025eec920a5efb5ed32fcf16f889c54 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 4 Dec 2023 22:38:25 +0000 Subject: [PATCH 120/147] NVIDIA: SAUCE: arm64: configs: Build NVGRACE_GPU_VFIO_PCI as LKM Signed-off-by: Nicolin Chen Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit 9433fd4ac5f0d1a63feed968a8b16261fcd7d808 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. Ochs (cherry picked from commit a1bdf88a26695bd4a255bdad7c263fe9d6d2ab58 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next) Signed-off-by: Nirmoy Das --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 1f25423de3833..dacb27235f854 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1762,3 +1762,4 @@ CONFIG_CORESIGHT_STM=m CONFIG_CORESIGHT_CPU_DEBUG=m CONFIG_CORESIGHT_CTI=m CONFIG_MEMTEST=y +CONFIG_NVGRACE_GPU_VFIO_PCI=m From 238c186d8faa13216f729dd59c57c7740a36174d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 7 Nov 2023 04:07:47 -0800 Subject: [PATCH 121/147] NVIDIA: SAUCE: arm64: configs: Enable IOMMUFD and VFIO_DEVICE_CDEV Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit 3eff6df2e892f9ea4a564ac27ae8fc005ad054be https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. 
(cherry picked from commit 6c6e8936e0f502f9225ad38070f90d259266ffb8 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 arch/arm64/configs/defconfig | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index dacb27235f854..a876c2bd77214 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1763,3 +1763,8 @@ CONFIG_CORESIGHT_CPU_DEBUG=m
 CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
 CONFIG_NVGRACE_GPU_VFIO_PCI=m
+CONFIG_VFIO_DEVICE_CDEV=y
+CONFIG_FAULT_INJECTION=y
+CONFIG_IOMMUFD_DRIVER=y
+CONFIG_IOMMUFD=y
+CONFIG_IOMMUFD_TEST=y

From 1f55e3a6a3102f7ee0a5f2a51843393c85432faf Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Mon, 22 Jan 2024 06:42:45 +0000
Subject: [PATCH 122/147] NVIDIA: SAUCE: arm64: configs: Replace
 VFIO_CONTAINER with IOMMUFD_VFIO_CONTAINER

CONFIG_IOMMUFD_VFIO_CONTAINER is the VFIO-compatible mode provided by the
iommufd core, replacing VFIO_IOMMU_TYPE1. Enable it instead. This may be
used by the VFIO mdev feature.

Signed-off-by: Nicolin Chen
Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 8188507 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit b0d6efbd206cd4c24ac1ef8877b5b6aa1d95303a https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 arch/arm64/configs/defconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index a876c2bd77214..faee50aca678f 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1764,7 +1764,9 @@ CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
 CONFIG_NVGRACE_GPU_VFIO_PCI=m
 CONFIG_VFIO_DEVICE_CDEV=y
+# CONFIG_VFIO_CONTAINER is not set
 CONFIG_FAULT_INJECTION=y
 CONFIG_IOMMUFD_DRIVER=y
 CONFIG_IOMMUFD=y
 CONFIG_IOMMUFD_TEST=y
+CONFIG_IOMMUFD_VFIO_CONTAINER=y

From 1bb50a4609de18514621575ff8c888df5261419f Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Thu, 29 Aug 2024 08:15:39 +0000
Subject: [PATCH 123/147] NVIDIA: SAUCE: KVM: arm64: Allow exec fault on
 memory mapped cacheable in VMA

When a Grace Hopper/Blackwell system is set up with EGM mode in
virtualization, the system memory is partitioned into two parts: memory
visible to the host OS, and a second EGM region that is not added to the
host OS. The EGM region is assigned to the VM as its system memory, with
the QEMU VMA mapped through remap_pfn_range.

Currently KVM sets up the stage-2 mapping with device properties for any
memory that is not added to the kernel, and consequently does not support
execution faults on such regions. Since the EGM memory is mapped through
remap_pfn_range and not added to the kernel, it ends up without execution
fault support.

This patch updates the KVM behaviour. It is an extension of the proposal
[1] to make KVM determine whether a region should have NORMAL memory
properties based on the VMA pgprot. KVM is changed to support executable
faults on a region if and only if its VMA is mapped cacheable. The EGM
memory is NORMAL system memory that is not added to the kernel; it is
safe with respect to execution faults and is expected to display all
properties of NORMAL memory. The QEMU VMA pgprot is therefore checked to
determine whether the memory is mapped as Normal cacheable, and the exec
fault is allowed in that case.
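As a rough sketch of the resulting check (names taken from the backported
hunk below, where mt is derived from the stage-1 attribute index):

	mt = FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot));
	...
	/* Reject exec faults only when the VMA is not Normal memory */
	if (exec_fault && s2_force_noncacheable && mt != MT_NORMAL)
		return -ENOEXEC;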
Link: https://lore.kernel.org/lkml/20230907181459.18145-2-ankita@nvidia.com [1]
Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit e38eceb9688b2c39e3034a37b9ae197c9dc36f1e https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(backported from commit b6bd6da17f5a1287951a0e74424b843512cdce2d https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
[Nirmoy: s/device/s2_force_noncacheable, s/mapping_type()/FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(page_prot))]
Signed-off-by: Nirmoy Das
---
 arch/arm64/kvm/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index a582d25eb1c8b..7402419ec0163 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1472,6 +1472,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
 	unsigned long mmu_seq;
 	phys_addr_t ipa = fault_ipa;
+	unsigned long mt;
 	struct kvm *kvm = vcpu->kvm;
 	struct vm_area_struct *vma;
 	short vma_shift;
@@ -1593,6 +1594,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		vma_pagesize = min(vma_pagesize, (long)max_map_size);
 	}
 
+	mt = FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(vma->vm_page_prot));
+
 	/*
 	 * Both the canonical IPA and fault IPA must be hugepage-aligned to
 	 * ensure we find the right PFN and lay down the mapping in the right
@@ -1676,7 +1679,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			writable = false;
 	}
 
-	if (exec_fault && s2_force_noncacheable)
+	if (exec_fault && s2_force_noncacheable && mt != MT_NORMAL)
 		return -ENOEXEC;
 
 	/*

From 34549f0848c92a356cd7c37f9188b52cceeecfc1 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Thu, 29 Aug 2024 08:15:40 +0000
Subject: [PATCH 124/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Introduce module to
 manage EGM

The Extended GPU Memory (EGM) feature enables GPU access to system memory
across sockets and nodes. In this mode, physical memory can be allocated
for GPU usage from anywhere in a multi-node system. The feature is being
extended to virtualization.

When EGM is enabled in the virtualization stack, the host memory is
partitioned into two: one partition for Host OS usage, and a second EGM
region. The EGM region essentially becomes the system memory of the VM.
The following figure shows the memory map in the virtualization
environment.

 |---- Sysmem ----|          |--- GPU mem ---|     VM Memory Map
 |                |          |               |
 |                |          |               |
 |------ EGM -----|--Host Mem----|      |--- GPU mem ---|     Host Memory Map

The EGM region is not available for host usage, as it is not added to the
kernel; its base HPA and length are communicated through the DSDT
entries. A linear mapping between the VM IPA and the system HPA is a
requirement for EGM support. The EGM region is thus assigned to a VM by
mapping the QEMU VMA to a linearly increasing HPA of the EGM region using
remap_pfn_range().

Introduce a new nvgrace-egm helper module to nvgrace-gpu to manage the
EGM region for the VM. The nvgrace-egm module handles the following (a
usage sketch follows the list):

1. Fetch the EGM memory properties (base HPA, length, proximity domain).
2. Create a char device that can be used as a memory-backend-file by QEMU
   for the VM, and implement its file operations. The char device is
   /dev/egmX, where X is the PXM node ID of the EGM as fetched in step 1.
3. Zero the EGM memory on the first device open().
4. Map the QEMU VMA to the EGM region using remap_pfn_range().
5. Clean up state and destroy the chardev on device unbind.
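As a rough usage sketch, a hypothetical userspace consumer (QEMU-style;
the node number and egm_size are illustrative, and error handling is
omitted) would drive the chardev as:

	/* EGM of PXM node 1; the first open() zeroes the carveout */
	int fd = open("/dev/egm1", O_RDWR);

	/* Backed by a linear remap_pfn_range() of the EGM region */
	void *mem = mmap(NULL, egm_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);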
All rights reserved + */ + +#include +#include "egm.h" + +#define MAX_EGM_NODES 256 + +struct egm_region { + struct list_head list; + int egmpxm; + atomic_t open_count; + phys_addr_t egmphys; + size_t egmlength; + struct device device; + struct cdev cdev; +}; + +static dev_t dev; +static struct class *class; +static struct list_head egm_list; + +static int nvgrace_egm_open(struct inode *inode, struct file *file) +{ + void *memaddr; + struct egm_region *region = container_of(inode->i_cdev, + struct egm_region, cdev); + + if (!region) + return -EINVAL; + + if (atomic_inc_return(®ion->open_count) > 1) + return 0; + + memaddr = memremap(region->egmphys, region->egmlength, MEMREMAP_WB); + if (!memaddr) { + atomic_dec(®ion->open_count); + return -EINVAL; + } + + memset((u8 *)memaddr, 0, region->egmlength); + memunmap(memaddr); + file->private_data = region; + + return 0; +} + +static int nvgrace_egm_release(struct inode *inode, struct file *file) +{ + struct egm_region *region = container_of(inode->i_cdev, + struct egm_region, cdev); + + if (!region) + return -EINVAL; + + if (atomic_dec_and_test(®ion->open_count)) + file->private_data = NULL; + + return 0; +} + +static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret = 0; + struct egm_region *region = file->private_data; + + if (!region) + return -EINVAL; + + ret = remap_pfn_range(vma, vma->vm_start, + PHYS_PFN(region->egmphys), + (vma->vm_end - vma->vm_start), + vma->vm_page_prot); + return ret; +} + +static const struct file_operations file_ops = { + .owner = THIS_MODULE, + .open = nvgrace_egm_open, + .release = nvgrace_egm_release, + .mmap = nvgrace_egm_mmap, +}; + +static int setup_egm_chardev(struct egm_region *region) +{ + int ret = 0; + + device_initialize(®ion->device); + + /* + * Use the proximity domain number as the device minor + * number. So the EGM corresponding to node X would be + * /dev/egmX. + */ + region->device.devt = MKDEV(MAJOR(dev), region->egmpxm); + region->device.class = class; + cdev_init(®ion->cdev, &file_ops); + region->cdev.owner = THIS_MODULE; + + ret = dev_set_name(®ion->device, "egm%d", region->egmpxm); + if (ret) + return ret; + + ret = cdev_device_add(®ion->cdev, ®ion->device); + + return ret; +} + +static int +nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys, + u64 *pegmlength, u64 *pegmpxm) +{ + int ret; + + /* + * The memory information is present in the system ACPI tables as DSD + * properties nvidia,egm-base-pa and nvidia,egmm-size. 
+	 */
+	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size",
+				       pegmlength);
+	if (ret)
+		return ret;
+
+	if (*pegmlength > type_max(size_t))
+		return -EOVERFLOW;
+
+	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa",
+				       pegmphys);
+	if (ret)
+		return ret;
+
+	if (*pegmphys > type_max(phys_addr_t))
+		return -EOVERFLOW;
+
+	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-pxm",
+				       pegmpxm);
+
+	if (*pegmpxm > type_max(phys_addr_t))
+		return -EOVERFLOW;
+
+	return ret;
+}
+
+int register_egm_node(struct pci_dev *pdev)
+{
+	struct egm_region *region = NULL;
+	u64 egmphys, egmlength, egmpxm;
+	int ret;
+
+	ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength, &egmpxm);
+	if (ret)
+		return ret;
+
+	list_for_each_entry(region, &egm_list, list) {
+		if (region->egmphys == egmphys)
+			return 0;
+	}
+
+	region = kvzalloc(sizeof(*region), GFP_KERNEL);
+	region->egmphys = egmphys;
+	region->egmlength = egmlength;
+	region->egmpxm = egmpxm;
+
+	atomic_set(&region->open_count, 0);
+
+	list_add_tail(&region->list, &egm_list);
+
+	setup_egm_chardev(region);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_egm_node);
+
+static void destroy_egm_chardev(struct egm_region *region)
+{
+	cdev_device_del(&region->cdev, &region->device);
+}
+
+void unregister_egm_node(int egm_node)
+{
+	struct egm_region *region, *temp_region;
+
+	list_for_each_entry_safe(region, temp_region, &egm_list, list) {
+		if (egm_node == region->egmpxm) {
+			destroy_egm_chardev(region);
+			list_del(&region->list);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(unregister_egm_node);
+
+static char *egm_devnode(const struct device *device, umode_t *mode)
+{
+	if (mode)
+		*mode = 0600;
+
+	return NULL;
+}
+
+static int __init nvgrace_egm_init(void)
+{
+	int ret;
+
+	ret = alloc_chrdev_region(&dev,
+				  0, MAX_EGM_NODES, "egm");
+	if (ret < 0)
+		return ret;
+
+	class = class_create("egm");
+	if (IS_ERR(class)) {
+		unregister_chrdev_region(dev, MAX_EGM_NODES);
+		return PTR_ERR(class);
+	}
+
+	class->devnode = egm_devnode;
+
+	INIT_LIST_HEAD(&egm_list);
+
+	return 0;
+}
+
+static void __exit nvgrace_egm_cleanup(void)
+{
+	class_destroy(class);
+	unregister_chrdev_region(dev, MAX_EGM_NODES);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ankit Agrawal ");
+MODULE_DESCRIPTION("NVGRACE EGM - Helper module of NVGRACE GPU to support Extended GPU Memory");
+
+module_init(nvgrace_egm_init);
+module_exit(nvgrace_egm_cleanup);
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.h b/drivers/vfio/pci/nvgrace-gpu/egm.h
new file mode 100644
index 0000000000000..28cc59e04a0b0
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.h
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef NVGRACE_EGM_H
+#define NVGRACE_EGM_H
+
+int register_egm_node(struct pci_dev *pdev);
+void unregister_egm_node(int egm_node);
+
+#endif /* NVGRACE_EGM_H */
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 3d4ec6b308062..62bdfbac2a75c 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include "egm.h"
 
 #ifdef CONFIG_MEMORY_FAILURE
 #include
@@ -74,8 +75,11 @@ struct nvgrace_gpu_pci_core_device {
 	/* Lock to control device memory kernel mapping */
 	struct mutex remap_lock;
 	bool has_mig_hw_bug;
+	int egm_node;
 };
 
+static bool egm_enabled;
+
 #ifdef CONFIG_MEMORY_FAILURE
 static void
 nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
@@ -877,6 +881,13 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
 	return ret;
 }
 
+static int
+nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm)
+{
+	return device_property_read_u64(&pdev->dev, "nvidia,egm-pxm",
+					pegmpxm);
+}
+
 static int
 nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 			      struct nvgrace_gpu_pci_core_device *nvdev,
@@ -1055,6 +1066,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops;
 	struct nvgrace_gpu_pci_core_device *nvdev;
 	u64 memphys, memlength;
+	u64 egmpxm;
 	int ret;
 
 	ret = nvgrace_gpu_wait_device_ready(pdev);
@@ -1062,9 +1074,14 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 		return ret;
 
 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
-	if (!ret)
+	if (!ret) {
 		ops = &nvgrace_gpu_pci_ops;
 
+		ret = nvgrace_gpu_has_egm_property(pdev, &egmpxm);
+		if (!ret)
+			egm_enabled = true;
+	}
+
 	nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev,
 				  &pdev->dev, ops);
 	if (IS_ERR(nvdev))
@@ -1083,6 +1100,12 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 					     memphys, memlength);
 		if (ret)
 			goto out_put_vdev;
+
+		if (egm_enabled) {
+			register_egm_node(pdev);
+			nvdev->egm_node = egmpxm;
+		}
+
 	}
 
 	ret = vfio_pci_core_register_device(&nvdev->core_device);
@@ -1096,6 +1119,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	hash_init(nvdev->resmem.htbl);
 	hash_init(nvdev->usemem.htbl);
 #endif
+
 	return ret;
 
 out_put_vdev:
@@ -1106,14 +1130,14 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 static void nvgrace_gpu_remove(struct pci_dev *pdev)
 {
 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
+	struct nvgrace_gpu_pci_core_device *nvdev =
+		container_of(core_device, struct nvgrace_gpu_pci_core_device,
+			     core_device);
 
 #ifdef CONFIG_MEMORY_FAILURE
 	struct h_node *cur;
 	unsigned long bkt;
 	struct hlist_node *tmp_node;
-	struct nvgrace_gpu_pci_core_device *nvdev =
-		container_of(core_device, struct nvgrace_gpu_pci_core_device,
-			     core_device);
 	hash_for_each_safe(nvdev->resmem.htbl, bkt, tmp_node, cur, node) {
 		hash_del(&cur->node);
 		vfree(cur);
@@ -1125,6 +1149,9 @@ static void nvgrace_gpu_remove(struct pci_dev *pdev)
 	}
 #endif
 
+	if (egm_enabled)
+		unregister_egm_node(nvdev->egm_node);
+
 	vfio_pci_core_unregister_device(core_device);
 	vfio_put_device(&core_device->vdev);
 }

From 76870b267be074df6362eeee8eed27db25fd5b52 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Thu, 29 Aug 2024 08:15:41 +0000
Subject: [PATCH 125/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Handle pages with
 ECC errors on the EGM

It is possible for some system memory pages on the EGM to have
uncorrectable ECC errors.
A list of pages known to have such errors (referred to as retired pages)
is maintained by the Host UEFI. The Host UEFI populates this list in a
reserved region and communicates the SPA of that region through an ACPI
DSDT property.

The nvgrace-egm module is responsible for storing the list of retired
page offsets and making it available to usermode processes. The module:
1. Gets the reserved memory region SPA and maps it to fetch the list of
   bad pages.
2. Calculates the retired page offsets in the EGM and stores them.
3. Exposes an ioctl to allow querying of the offsets.

The ioctl is called by usermode apps such as QEMU to get the retired page
offsets. The usermode apps are expected to take appropriate action to
communicate the list to the VM.
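As a rough sketch of the intended two-call flow from userspace (fd is an
open /dev/egmX descriptor; error handling is omitted):

	struct egm_bad_pages_list hdr = { .argsz = sizeof(hdr) };
	struct egm_bad_pages_list *list;

	/* First call: kernel reports the required argsz via the header */
	ioctl(fd, EGM_BAD_PAGES_LIST, &hdr);

	/* Second call: fetch count and the bad_pages[] entries */
	list = malloc(hdr.argsz);
	list->argsz = hdr.argsz;
	ioctl(fd, EGM_BAD_PAGES_LIST, list);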
Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit be54641b9f3e52a471e9d02aa12723bfb47a7060 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit c4cb1930d93ac2c7bb4f0cfba0a9e3e4ff180879 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 126 +++++++++++++++++++++++++++++
 include/uapi/linux/egm.h           |  26 ++++++
 2 files changed, 152 insertions(+)
 create mode 100644 include/uapi/linux/egm.h

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index f3c22a9dfecb9..8c9ff6313e9f4 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -4,6 +4,8 @@
  */
 
 #include
+#include
+#include
 #include "egm.h"
 
 #define MAX_EGM_NODES 256
@@ -16,6 +18,12 @@ struct egm_region {
 	size_t egmlength;
 	struct device device;
 	struct cdev cdev;
+	DECLARE_HASHTABLE(htbl, 0x10);
+};
+
+struct h_node {
+	unsigned long mem_offset;
+	struct hlist_node node;
 };
 
 static dev_t dev;
@@ -76,11 +84,80 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma)
 	return ret;
 }
 
+static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	unsigned long minsz = offsetofend(struct egm_bad_pages_list, count);
+	struct egm_bad_pages_list info;
+	void __user *uarg = (void __user *)arg;
+	struct egm_region *region = file->private_data;
+
+	if (copy_from_user(&info, uarg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	if (!region)
+		return -EINVAL;
+
+	switch (cmd) {
+	case EGM_BAD_PAGES_LIST:
+		int ret;
+		unsigned long bad_page_struct_size = sizeof(struct egm_bad_pages_info);
+		struct egm_bad_pages_info tmp;
+		struct h_node *cur_page;
+		struct hlist_node *tmp_node;
+		unsigned long bkt;
+		int count = 0, index = 0;
+
+		hash_for_each_safe(region->htbl, bkt, tmp_node, cur_page, node)
+			count++;
+
+		if (info.argsz < (minsz + count * bad_page_struct_size)) {
+			info.argsz = minsz + count * bad_page_struct_size;
+			info.count = 0;
+			goto done;
+		} else {
+			hash_for_each_safe(region->htbl, bkt, tmp_node, cur_page, node) {
+				/*
+				 * This check fails if there was an ECC error
+				 * after the usermode app read the count of
+				 * bad pages through this ioctl.
+				 */
+				if (minsz + index * bad_page_struct_size >= info.argsz) {
+					info.argsz = minsz + index * bad_page_struct_size;
+					info.count = index;
+					goto done;
+				}
+
+				tmp.offset = cur_page->mem_offset;
+				tmp.size = PAGE_SIZE;
+
+				ret = copy_to_user(uarg + minsz +
+						   index * bad_page_struct_size,
+						   &tmp, bad_page_struct_size);
+				if (ret)
+					return ret;
+				index++;
+			}
+
+			info.count = index;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+done:
+	return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0;
+}
+
 static const struct file_operations file_ops = {
 	.owner = THIS_MODULE,
 	.open = nvgrace_egm_open,
 	.release = nvgrace_egm_release,
 	.mmap = nvgrace_egm_mmap,
+	.unlocked_ioctl = nvgrace_egm_ioctl,
 };
 
 static int setup_egm_chardev(struct egm_region *region)
@@ -143,6 +220,45 @@ nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
 	return ret;
 }
 
+static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev,
+					struct egm_region *region)
+{
+	u64 retiredpagesphys, count;
+	void *memaddr;
+	int index;
+
+	if (device_property_read_u64(&pdev->dev,
+				     "nvidia,egm-retired-pages-data-base",
+				     &retiredpagesphys))
+		return;
+
+	memaddr = memremap(retiredpagesphys, PAGE_SIZE, MEMREMAP_WB);
+	if (!memaddr)
+		return;
+
+	count = *(u64 *)memaddr;
+
+	hash_init(region->htbl);
+
+	for (index = 0; index < count; index++) {
+		struct h_node *retired_page;
+
+		/*
+		 * Since the EGM is linearly mapped, the offset in the
+		 * carveout is the same offset in the VM system memory.
+		 *
+		 * Calculate the offset to communicate to the usermode
+		 * apps.
+		 */
+		retired_page = (struct h_node *)(vzalloc(sizeof(struct h_node)));
+		retired_page->mem_offset = *((u64 *)memaddr + index + 1) -
+					   region->egmphys;
+		hash_add(region->htbl, &retired_page->node, retired_page->mem_offset);
+	}
+
+	memunmap(memaddr);
+}
+
 int register_egm_node(struct pci_dev *pdev)
 {
 	struct egm_region *region = NULL;
@@ -165,6 +281,8 @@ int register_egm_node(struct pci_dev *pdev)
 
 	atomic_set(&region->open_count, 0);
 
+	nvgrace_egm_fetch_bad_pages(pdev, region);
+
 	list_add_tail(&region->list, &egm_list);
 
 	setup_egm_chardev(region);
@@ -181,9 +299,17 @@ static void destroy_egm_chardev(struct egm_region *region)
 void unregister_egm_node(int egm_node)
 {
 	struct egm_region *region, *temp_region;
+	struct h_node *cur_page;
+	unsigned long bkt;
+	struct hlist_node *temp_node;
 
 	list_for_each_entry_safe(region, temp_region, &egm_list, list) {
 		if (egm_node == region->egmpxm) {
+			hash_for_each_safe(region->htbl, bkt, temp_node, cur_page, node) {
+				hash_del(&cur_page->node);
+				vfree(cur_page);
+			}
+
 			destroy_egm_chardev(region);
 			list_del(&region->list);
 		}
diff --git a/include/uapi/linux/egm.h b/include/uapi/linux/egm.h
new file mode 100644
index 0000000000000..8a808e45c2052
--- /dev/null
+++ b/include/uapi/linux/egm.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+
+#ifndef _UAPIEGM_H
+#define _UAPIEGM_H
+
+#define EGM_TYPE ('E')
+
+struct egm_bad_pages_info {
+	__aligned_u64 offset;
+	__aligned_u64 size;
+};
+
+struct egm_bad_pages_list {
+	__u32 argsz;
+	/* out */
+	__u32 count;
+	/* out */
+	struct egm_bad_pages_info bad_pages[];
+};
+
+#define EGM_BAD_PAGES_LIST _IO(EGM_TYPE, 100)
+
+#endif /* _UAPIEGM_H */

From 374a46ac24207c453779d7f7804ccfe7b387ff88 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Thu, 29 Aug 2024 08:15:42 +0000
Subject: [PATCH 126/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Register EGM for
 runtime ECC poison errors handling

The Extended GPU Memory (EGM) is mapped through remap_pfn_range() and is
not backed by struct pages. Currently, memory_failure() on such a region
is unsupported in kernel MM. There is a proposal to handle such memory
regions [1]. The implementation exports APIs to register a memory region
and a corresponding callback function with the kernel MM. On the
occurrence of a memory failure on the registered region, kernel MM calls
the callback to communicate the faulting PFN.

This patch registers the EGM memory and the callback function
nvgrace_egm_pfn_memory_failure with the kernel MM. On memory failure,
nvgrace_egm_pfn_memory_failure is triggered and the nvgrace-egm module
adds the faulting PFN to the hashtable tracking retired ECC error pages.

It also implements a fault handler in the vm_ops to check whether the
access is being made to a page known to have ECC errors, and returns
VM_FAULT_HWPOISON in that case.

Link: https://lore.kernel.org/all/20231123003513.24292-1-ankita@nvidia.com/ [1]

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(backported from commit 215f34549aa64ba668f35dce70b0bb1a1200d738 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
(koba: vmalloc.h exists)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 4eba6e15aa1b2b71aa78f78c6b796479fd7e97cc https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 89 +++++++++++++++++++++++++++++-
 1 file changed, 88 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 8c9ff6313e9f4..2db00de9f04f3 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -6,8 +6,14 @@
 #include
 #include
 #include
+#include
 #include "egm.h"
 
+#ifdef CONFIG_MEMORY_FAILURE
+#include
+#include
+#endif
+
 #define MAX_EGM_NODES 256
 
 struct egm_region {
@@ -19,6 +25,9 @@ struct egm_region {
 	struct device device;
 	struct cdev cdev;
 	DECLARE_HASHTABLE(htbl, 0x10);
+#ifdef CONFIG_MEMORY_FAILURE
+	struct pfn_address_space pfn_address_space;
+#endif
 };
 
 struct h_node {
@@ -30,6 +39,70 @@ static dev_t dev;
 static struct class *class;
 static struct list_head egm_list;
 
+#ifdef CONFIG_MEMORY_FAILURE
+static void
+nvgrace_egm_pfn_memory_failure(struct pfn_address_space *pfn_space,
+			       unsigned long pfn)
+{
+	struct egm_region *region =
+		container_of(pfn_space, struct egm_region, pfn_address_space);
+	unsigned long mem_offset = PFN_PHYS(pfn - pfn_space->node.start);
+	struct h_node *ecc;
+
+	if (mem_offset >= region->egmlength)
+		return;
+
+	/*
+	 * MM has called to notify a poisoned page. Track that in the
+	 * hashtable.
+	 */
+	ecc = (struct h_node *)(vzalloc(sizeof(struct h_node)));
+	ecc->mem_offset = mem_offset;
+	hash_add(region->htbl, &ecc->node, ecc->mem_offset);
+}
+
+struct pfn_address_space_ops nvgrace_egm_pas_ops = {
+	.failure = nvgrace_egm_pfn_memory_failure,
+};
+
+static int
+nvgrace_egm_register_pfn_range(struct egm_region *region,
+			       struct vm_area_struct *vma)
+{
+	unsigned long nr_pages = region->egmlength >> PAGE_SHIFT;
+
+	region->pfn_address_space.node.start = vma->vm_pgoff;
+	region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
+	region->pfn_address_space.ops = &nvgrace_egm_pas_ops;
+	region->pfn_address_space.mapping = vma->vm_file->f_mapping;
+
+	return register_pfn_address_space(&region->pfn_address_space);
+}
+
+static vm_fault_t nvgrace_egm_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = PFN_PHYS(vmf->pgoff - vmf->vma->vm_pgoff);
+	struct egm_region *region = vmf->vma->vm_file->private_data;
+	struct h_node *cur;
+
+	/*
+	 * Check if the page is poisoned.
+	 */
+	if (mem_offset < region->egmlength) {
+		hash_for_each_possible(region->htbl, cur, node, mem_offset) {
+			if (cur->mem_offset == mem_offset)
+				return VM_FAULT_HWPOISON;
+		}
+	}
+
+	return VM_FAULT_ERROR;
+}
+
+static const struct vm_operations_struct nvgrace_egm_mmap_ops = {
+	.fault = nvgrace_egm_fault,
+};
+
+#endif
+
 static int nvgrace_egm_open(struct inode *inode, struct file *file)
 {
 	void *memaddr;
@@ -63,8 +136,12 @@ static int nvgrace_egm_release(struct inode *inode, struct file *file)
 	if (!region)
 		return -EINVAL;
 
-	if (atomic_dec_and_test(&region->open_count))
+	if (atomic_dec_and_test(&region->open_count)) {
+#ifdef CONFIG_MEMORY_FAILURE
+		unregister_pfn_address_space(&region->pfn_address_space);
+#endif
 		file->private_data = NULL;
+	}
 
 	return 0;
 }
@@ -81,6 +158,16 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma)
 			      PHYS_PFN(region->egmphys),
 			      (vma->vm_end - vma->vm_start),
 			      vma->vm_page_prot);
+	if (ret)
+		return ret;
+
+	vma->vm_pgoff = PHYS_PFN(region->egmphys);
+
+#ifdef CONFIG_MEMORY_FAILURE
+	vma->vm_ops = &nvgrace_egm_mmap_ops;
+
+	ret = nvgrace_egm_register_pfn_range(region, vma);
+#endif
 
 	return ret;
 }

From bd95986615b04ca130b4f15710c8fadf53a67caa Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 29 Aug 2024 18:49:03 -0700
Subject: [PATCH 127/147] NVIDIA: SAUCE: arm64: configs: Build
 CONFIG_NVGRACE_EGM as LKM

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 5bb23c179220ec77ac9fb2ed610618ce1a902bd4 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 7d2ea5531c96fb9acd5704b6bec20aa29ca1fd39 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 arch/arm64/configs/defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index faee50aca678f..5461f7d9806a9 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1763,6 +1763,7 @@ CONFIG_CORESIGHT_CPU_DEBUG=m
 CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
 CONFIG_NVGRACE_GPU_VFIO_PCI=m
+CONFIG_NVGRACE_EGM=m
 CONFIG_VFIO_DEVICE_CDEV=y
 # CONFIG_VFIO_CONTAINER is not set
 CONFIG_FAULT_INJECTION=y

From f14ee034cf497828aaf8ea1465fb26ec65937984 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Sun, 13 Oct 2024 04:53:38 +0000
Subject: [PATCH 128/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Move the egm header
 file to include

nvgrace-egm exposes the APIs register_egm_node and unregister_egm_node to
manage the EGM (Extended GPU Memory) present on the system. To allow
out-of-tree drivers such as nvidia-vgpu-vfio to make use of them, move
the declarations to a new nvgrace-egm.h in include.

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit bed340f2023f22192893e9121834ee3ce252edd1 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit a9616639ce81799f5b3133c47c25f8d875728f4f https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c                       | 2 +-
 drivers/vfio/pci/nvgrace-gpu/main.c                      | 2 +-
 .../pci/nvgrace-gpu/egm.h => include/linux/nvgrace-egm.h | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename drivers/vfio/pci/nvgrace-gpu/egm.h => include/linux/nvgrace-egm.h (55%)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 2db00de9f04f3..d035f1ee5f337 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -6,8 +6,8 @@
 #include
 #include
 #include
+#include
 #include
-#include "egm.h"
 
 #ifdef CONFIG_MEMORY_FAILURE
 #include
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 62bdfbac2a75c..82d96beb2a37a 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -8,7 +8,7 @@
 #include
 #include
 #include
-#include "egm.h"
+#include
 
 #ifdef CONFIG_MEMORY_FAILURE
 #include
diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.h b/include/linux/nvgrace-egm.h
similarity index 55%
rename from drivers/vfio/pci/nvgrace-gpu/egm.h
rename to include/linux/nvgrace-egm.h
index 28cc59e04a0b0..48add892aa5bf 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.h
+++ b/include/linux/nvgrace-egm.h
@@ -1,12 +1,12 @@
-// SPDX-License-Identifier: GPL-2.0-only
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
  */
 
-#ifndef NVGRACE_EGM_H
-#define NVGRACE_EGM_H
+#ifndef _NVGRACE_EGM_H
+#define _NVGRACE_EGM_H
 
 int register_egm_node(struct pci_dev *pdev);
 void unregister_egm_node(int egm_node);
 
-#endif /* NVGRACE_EGM_H */
+#endif /* _NVGRACE_EGM_H */

From e7c565b284914b82a1fd3dd308af18fa60f3cca7 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Ochs" Date: Thu, 7 Nov 2024 15:06:57 -0800 Subject: [PATCH 129/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Free region memory during unregistration Free the kmalloc'd region when the EGM is unregistered. Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Carol L. Soto Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit fc592b9b4f8b455205abd2b2395671a831bb942e https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. Ochs (cherry picked from commit f24760ccecb8c5517fca6791082ab89cf94b9f9f https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next) Signed-off-by: Nirmoy Das --- drivers/vfio/pci/nvgrace-gpu/egm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c index d035f1ee5f337..1f8f0f7b22490 100644 --- a/drivers/vfio/pci/nvgrace-gpu/egm.c +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c @@ -399,6 +399,7 @@ void unregister_egm_node(int egm_node) destroy_egm_chardev(region); list_del(®ion->list); + kfree(region); } } } From 2fa8d2392805d9284f156c7e3eac7804756422a7 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 7 Nov 2024 15:38:11 -0800 Subject: [PATCH 130/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Move region hash initialization Move region hash initiaization alongside the other region initialization statements to avoid situations where the hash table was not properly initialized. Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Carol L. Soto Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit 8021c1d2b1c73015102bc69eda0029114989dd1f https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. Ochs (cherry picked from commit e1264a62e8841fd5332f7f02a921242ff1b51dfa https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next) Signed-off-by: Nirmoy Das --- drivers/vfio/pci/nvgrace-gpu/egm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c index 1f8f0f7b22490..463d1dd3e9c00 100644 --- a/drivers/vfio/pci/nvgrace-gpu/egm.c +++ b/drivers/vfio/pci/nvgrace-gpu/egm.c @@ -325,8 +325,6 @@ static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev, count = *(u64 *)memaddr; - hash_init(region->htbl); - for (index = 0; index < count; index++) { struct h_node *retired_page; @@ -366,6 +364,7 @@ int register_egm_node(struct pci_dev *pdev) region->egmlength = egmlength; region->egmpxm = egmpxm; + hash_init(region->htbl); atomic_set(®ion->open_count, 0); nvgrace_egm_fetch_bad_pages(pdev, region); From c42d504bc504eaea9273a34e9ebe7bbae24ac023 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Thu, 7 Nov 2024 15:48:47 -0800 Subject: [PATCH 131/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Handle and convey EGM registration errors Update error handling within EGM regiration routine to catch and return errors to the caller. Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Carol L. Soto Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs (cherry picked from commit a57210c88c1c3693a24684c967c0858d75cabd32 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next) Signed-off-by: Koba Ko Acked-by: Matthew R. Ochs Acked-by: Carol L. Soto Signed-off-by: Matthew R. 
(cherry picked from commit a706ff8c445abed002e0b9493dfc9c664b1ffd57 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 463d1dd3e9c00..d6d4d8d5d7f13 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -360,6 +360,9 @@ int register_egm_node(struct pci_dev *pdev)
 	}
 
 	region = kvzalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return -ENOMEM;
+
 	region->egmphys = egmphys;
 	region->egmlength = egmlength;
 	region->egmpxm = egmpxm;
@@ -369,11 +372,16 @@ int register_egm_node(struct pci_dev *pdev)
 
 	nvgrace_egm_fetch_bad_pages(pdev, region);
 
-	list_add_tail(&region->list, &egm_list);
+	ret = setup_egm_chardev(region);
+	if (ret)
+		goto err;
 
-	setup_egm_chardev(region);
+	list_add_tail(&region->list, &egm_list);
 
 	return 0;
+err:
+	kfree(region);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(register_egm_node);

From 672584fb21a5c2182e97135344d841383a1ccec8 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 7 Nov 2024 15:55:58 -0800
Subject: [PATCH 132/147] NVIDIA: SAUCE: vfio/nvgrace-gpu: Handle EGM
 registration failure

Detect and handle a failure from the EGM registration service.

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Carol L. Soto
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit f18eee3bbdea77a9b525c0665d7ebe1992bb00b2 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 8371b68c33cc03a7ea6dfd7bdfc0fe9d47ec64fb https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 82d96beb2a37a..0187883203867 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -1102,7 +1102,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 			goto out_put_vdev;
 
 		if (egm_enabled) {
-			register_egm_node(pdev);
+			ret = register_egm_node(pdev);
+			if (ret)
+				goto out_put_vdev;
+
 			nvdev->egm_node = egmpxm;
 		}
 
@@ -1110,7 +1113,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 
 	ret = vfio_pci_core_register_device(&nvdev->core_device);
 	if (ret)
-		goto out_put_vdev;
+		goto out_egm_unreg;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	/*
@@ -1122,6 +1125,9 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 
 	return ret;
 
+out_egm_unreg:
+	if (egm_enabled)
+		unregister_egm_node(nvdev->egm_node);
 out_put_vdev:
 	vfio_put_device(&nvdev->core_device.vdev);
 	return ret;

From 57b69bde81d559cbf721307e3da10df7e5184ed4 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 7 Nov 2024 15:58:54 -0800
Subject: [PATCH 133/147] NVIDIA: SAUCE: vfio/nvgrace-gpu: Address checkpatch
 warnings

Fix the source to resolve checkpatch warnings.

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Carol L. Soto
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit c7b47b76aefd7a8fd06a699937df6ec1e6a06a9b https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit dfa0e06b59c08dcc69b4fcc973e53e95e9e8e827 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 0187883203867..786ed41d6d017 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -1144,6 +1144,7 @@ static void nvgrace_gpu_remove(struct pci_dev *pdev)
 	struct h_node *cur;
 	unsigned long bkt;
 	struct hlist_node *tmp_node;
+
 	hash_for_each_safe(nvdev->resmem.htbl, bkt, tmp_node, cur, node) {
 		hash_del(&cur->node);
 		vfree(cur);

From d400cc5ca074de59fa335278987478eb82a9b4b5 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 7 Nov 2024 16:07:26 -0800
Subject: [PATCH 134/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Address sparse
 errors

Fix minor syntax errors reported by sparse.

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Carol L. Soto
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit bbb64e63a0b5e8c8eeec52b1e901745ba64b96d3 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit fe7819421a04be2b2405376da3550beb03986b6c https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index d6d4d8d5d7f13..ae76f958bf784 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -189,6 +189,7 @@ static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long
 
 	switch (cmd) {
 	case EGM_BAD_PAGES_LIST:
+	{
 		int ret;
 		unsigned long bad_page_struct_size = sizeof(struct egm_bad_pages_info);
 		struct egm_bad_pages_info tmp;
@@ -231,6 +232,7 @@ static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long
 			info.count = index;
 		}
 		break;
+	}
 	default:
 		return -EINVAL;
 	}

From 88cb28758ad19431f91ed9b4756b79f508b40524 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 7 Nov 2024 20:03:50 -0800
Subject: [PATCH 135/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Address smatch
 errors

Return the intended errno upon a copyout fault, remove unnecessary checks
following container_of pointer derivation, and use the correct macro and
types for overflow checking.

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Carol L. Soto
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 429910b6fba450a9831f590d9622d16b79006311 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit bda63f340176a3a610a64512176f0e133b9efd9f https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index ae76f958bf784..858c1400e0cc8 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -109,9 +109,6 @@ static int nvgrace_egm_open(struct inode *inode, struct file *file)
 	struct egm_region *region = container_of(inode->i_cdev,
 						 struct egm_region, cdev);
 
-	if (!region)
-		return -EINVAL;
-
 	if (atomic_inc_return(&region->open_count) > 1)
 		return 0;
 
@@ -133,9 +130,6 @@ static int nvgrace_egm_release(struct inode *inode, struct file *file)
 	struct egm_region *region = container_of(inode->i_cdev,
 						 struct egm_region, cdev);
 
-	if (!region)
-		return -EINVAL;
-
 	if (atomic_dec_and_test(&region->open_count)) {
 #ifdef CONFIG_MEMORY_FAILURE
 		unregister_pfn_address_space(&region->pfn_address_space);
@@ -225,7 +219,7 @@ static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long
 						   index * bad_page_struct_size,
 						   &tmp, bad_page_struct_size);
 				if (ret)
-					return ret;
+					return -EFAULT;
 				index++;
 			}
 
@@ -289,7 +283,7 @@ nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
 	if (ret)
 		return ret;
 
-	if (*pegmlength > type_max(size_t))
+	if (overflows_type(*pegmlength, size_t))
 		return -EOVERFLOW;
 
 	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa",
@@ -297,13 +291,13 @@ nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
 	if (ret)
 		return ret;
 
-	if (*pegmphys > type_max(phys_addr_t))
+	if (overflows_type(*pegmphys, phys_addr_t))
 		return -EOVERFLOW;
 
 	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-pxm",
 				       pegmpxm);
 
-	if (*pegmpxm > type_max(phys_addr_t))
+	if (overflows_type(*pegmpxm, int))
 		return -EOVERFLOW;
 
 	return ret;

From e0e3db67b44d30cd33a73636c8bad917a880d7bb Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 7 Nov 2024 20:09:38 -0800
Subject: [PATCH 136/147] NVIDIA: SAUCE: vfio/nvgrace-gpu: Address smatch
 errors

Use the correct macro and types for overflow checking.

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Carol L. Soto
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit afa8f63898cf65cf0d9cf3209ef486daa11a42c7 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit d110330e4b93894a6234b6c4036f8422883fab90 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 786ed41d6d017..507c7bd918be1 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -860,7 +860,7 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
 	if (ret)
 		return ret;
 
-	if (*pmemphys > type_max(phys_addr_t))
+	if (overflows_type(*pmemphys, phys_addr_t))
 		return -EOVERFLOW;
 
 	ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size",
@@ -868,7 +868,7 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev,
 	if (ret)
 		return ret;
 
-	if (*pmemlength > type_max(size_t))
+	if (overflows_type(*pmemlength, size_t))
 		return -EOVERFLOW;
 
 	/*

From b5d70ae440d4d018d56c6c7785cb0ecb809af30f Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Fri, 22 Nov 2024 15:48:10 -0800
Subject: [PATCH 137/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Ensure ACPI value
 reads are successful

Ensure ACPI table reads are successful prior to using the value.

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Carol L. Soto
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit b2947b075de6c887660cc8bc23ab5f0b6e7bfd17 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 92583550c3b22d1d00bfc6f59f3fc943cbd3a29e https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 858c1400e0cc8..a4986c33682ae 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -296,11 +296,13 @@ nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys,
 
 	ret = device_property_read_u64(&pdev->dev, "nvidia,egm-pxm",
 				       pegmpxm);
+	if (ret)
+		return ret;
 
 	if (overflows_type(*pegmpxm, int))
 		return -EOVERFLOW;
 
-	return ret;
+	return 0;

From 2d6a975a1a199acdb58385223c68eece6f7f4860 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 14 Nov 2024 08:12:22 -0800
Subject: [PATCH 138/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Avoid invalid
 retired pages base
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some environments may provide a "nvidia,egm-retired-pages-data-base"
property but fail to populate it with a base address, leaving it NULL.
Mapping this invalid value results in a synchronous exception when the
region is first touched. Detect a NULL value, generate a warning to draw
attention to the firmware bug, and return without mapping.
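A minimal sketch of the resulting guard (matching the hunk below, where
retiredpagesphys holds the value read from the DSDT property):

	if (device_property_read_u64(&pdev->dev,
				     "nvidia,egm-retired-pages-data-base",
				     &retiredpagesphys))
		return;

	/* Catch firmware bug and avoid a crash */
	if (WARN_ON_ONCE(retiredpagesphys == 0))
		return;

Without the check, the first touch of the bogus mapping aborts: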
INFO: th500_ras_intr_handler: External Abort reason=1 syndrome=0x92000410 flags=0x1
[   82.104493] Internal error: synchronous external abort: 0000000096000410 [#1] SMP
[   82.114898] Modules linked in: nvgrace_gpu_vfio_pci(E) nvgrace_egm(E)
[   82.257218] CPU: 0 PID: 10 Comm: kworker/0:1 Tainted: G OE 6.8.12+ #5
[   82.265135] Hardware name: NVIDIA GH200 P5042, BIOS 24103110 20241031
[   82.271720] Workqueue: events work_for_cpu_fn
[   82.276180] pstate: 03400009 (nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
[   82.283298] pc : register_egm_node+0x2cc/0x440 [nvgrace_egm]
[   82.289087] lr : register_egm_node+0x2c4/0x440 [nvgrace_egm]
[   82.294872] sp : ffff8000802ebc30
[   82.298254] x29: ffff8000802ebc60 x28: 00000000000000ff x27: 0000000000000000
[   82.305550] x26: ffff000087a320c8 x25: ffff0000a5700000 x24: ffff000087a32000
[   82.312846] x23: ffffa77cd758e368 x22: 0000000000000000 x21: ffffa77cd758c640
[   82.320141] x20: ffffa77cd758e170 x19: ffff800081e7d000 x18: ffff800080293038
[   82.327437] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
[   82.334732] x14: 0000000000000000 x13: 65203a65646f6e5f x12: 0000000000000000
[   82.342027] x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000
[   82.349322] x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000000000
[   82.356618] x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000000
[   82.363913] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff800081e7d000
[   82.371210] Call trace:
[   82.373705]  register_egm_node+0x2cc/0x440 [nvgrace_egm]
[   82.379135]  nvgrace_gpu_probe+0x2ac/0x528 [nvgrace_gpu_vfio_pci]
[   82.385366]  local_pci_probe+0x4c/0xe0
[   82.389198]  work_for_cpu_fn+0x28/0x58
[   82.393026]  process_one_work+0x168/0x3f0
[   82.397123]  worker_thread+0x360/0x480
[   82.400952]  kthread+0x11c/0x128
[   82.404248]  ret_from_fork+0x10/0x20
[   82.407906] Code: d2820001 940002b3 aa0003f3 b4fffac0 (f9400017)
[   82.414134] ---[ end trace 0000000000000000 ]---

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Carol L. Soto
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 7ba29302925c6f2e1b9825d06f7468acc175ab85 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.8-next)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 349fb1c23faef926f3bdbc479b088c7b6b66853f https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index a4986c33682ae..3bde332803647 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -317,6 +317,10 @@ static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev,
 				     &retiredpagesphys))
 		return;
 
+	/* Catch firmware bug and avoid a crash */
+	if (WARN_ON_ONCE(retiredpagesphys == 0))
+		return;
+
 	memaddr = memremap(retiredpagesphys, PAGE_SIZE, MEMREMAP_WB);
 	if (!memaddr)
 		return;

From 360a6f61bb932438d2fad79a06848a19ef252475 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 23 Jan 2025 12:07:12 -0800
Subject: [PATCH 139/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Update EGM
 unregistration API

In an effort to simplify the programming model, use a symmetrical model
for the EGM registration APIs. This avoids the caller needing to keep a
cookie, or even to know whether EGM is supported. Update the EGM
unregistration API to take the PCI device as its parameter.
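After the change, the pair is symmetric (matching the
include/linux/nvgrace-egm.h hunk below):

	int register_egm_node(struct pci_dev *pdev);
	void unregister_egm_node(struct pci_dev *pdev);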
Signed-off-by: Matthew R. Ochs
(cherry picked from commit d8903ecbf6ae94cbf67b8492996021cd2488033c https://github.com/nvmochs/NV-Kernels/tree/vegm_01232025)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 5839fc506349c858a90a19e713c46fce025b2ec6 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c  | 10 ++++++++--
 drivers/vfio/pci/nvgrace-gpu/main.c |  4 ++--
 include/linux/nvgrace-egm.h         |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 3bde332803647..7f49cb7e147e5 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -392,15 +392,21 @@ static void destroy_egm_chardev(struct egm_region *region)
 	cdev_device_del(&region->cdev, &region->device);
 }
 
-void unregister_egm_node(int egm_node)
+void unregister_egm_node(struct pci_dev *pdev)
 {
 	struct egm_region *region, *temp_region;
 	struct h_node *cur_page;
 	unsigned long bkt;
 	struct hlist_node *temp_node;
+	u64 egmphys, egmlength, egmpxm;
+	int ret;
+
+	ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength, &egmpxm);
+	if (ret)
+		return;
 
 	list_for_each_entry_safe(region, temp_region, &egm_list, list) {
-		if (egm_node == region->egmpxm) {
+		if (egmpxm == region->egmpxm) {
 			hash_for_each_safe(region->htbl, bkt, temp_node, cur_page, node) {
 				hash_del(&cur_page->node);
 				vfree(cur_page);
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 507c7bd918be1..53c07d219a67c 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -1127,7 +1127,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 
 out_egm_unreg:
 	if (egm_enabled)
-		unregister_egm_node(nvdev->egm_node);
+		unregister_egm_node(pdev);
 out_put_vdev:
 	vfio_put_device(&nvdev->core_device.vdev);
 	return ret;
@@ -1157,7 +1157,7 @@ static void nvgrace_gpu_remove(struct pci_dev *pdev)
 #endif
 
 	if (egm_enabled)
-		unregister_egm_node(nvdev->egm_node);
+		unregister_egm_node(pdev);
 
 	vfio_pci_core_unregister_device(core_device);
 	vfio_put_device(&core_device->vdev);
diff --git a/include/linux/nvgrace-egm.h b/include/linux/nvgrace-egm.h
index 48add892aa5bf..4bbd383a02732 100644
--- a/include/linux/nvgrace-egm.h
+++ b/include/linux/nvgrace-egm.h
@@ -7,6 +7,6 @@
 #define _NVGRACE_EGM_H
 
 int register_egm_node(struct pci_dev *pdev);
-void unregister_egm_node(int egm_node);
+void unregister_egm_node(struct pci_dev *pdev);
 
 #endif /* _NVGRACE_EGM_H */

From 1e7a7743b7a9fd4d04e2cceaeec9a95f6142b6e8 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Tue, 6 May 2025 09:38:38 -0500
Subject: [PATCH 140/147] NVIDIA: SAUCE: vfio/nvgrace-egm: track GPUs
 associated with the EGM regions

GB200 systems can have multiple GPUs associated with a single EGM region.
For proper EGM functionality, the host topology in terms of GPU affinity
has to be replicated in the VM, so the EGM region structure must track
the GPU devices belonging to the same socket.

On device probe, the device's pci_dev struct is added to a linked list on
the appropriate EGM region. Similarly, on device remove, the pci_dev
struct for the GPU is removed from the EGM region.
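The region itself is torn down only once its GPU list drains; roughly,
following the unregister_egm_node() hunk below:

	remove_gpu(region, pdev);
	if (!list_empty(&region->gpus))
		break;	/* other GPUs on this socket still use the region */

	/* Last user gone: free the bad-page table, chardev and region. */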
Signed-off-by: Ankit Agrawal
Ref: sj24: /home/nvidia/ankita/kernel_patches/0001_vfio_nvgrace-egm_track_GPUs_associated_with_the_EGM_regions.patch
(koba: Enhance error handling, Remove egm_node from unregister_egm_node
 and move destroy_egm_chardev a little forward)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 0222c35fb26285ee1a6185ef50414093850ea352
 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 68 ++++++++++++++++++++++++++----
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 7f49cb7e147e5..c081f7b611409 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -16,6 +16,11 @@
 
 #define MAX_EGM_NODES 256
 
+struct gpu_node {
+	struct list_head list;
+	struct pci_dev *pdev;
+};
+
 struct egm_region {
 	struct list_head list;
 	int egmpxm;
@@ -24,6 +29,7 @@ struct egm_region {
 	size_t egmlength;
 	struct device device;
 	struct cdev cdev;
+	struct list_head gpus;
 	DECLARE_HASHTABLE(htbl, 0x10);
 #ifdef CONFIG_MEMORY_FAILURE
 	struct pfn_address_space pfn_address_space;
@@ -268,6 +274,11 @@ static int setup_egm_chardev(struct egm_region *region)
 	return ret;
 }
 
+static void destroy_egm_chardev(struct egm_region *region)
+{
+	cdev_device_del(&region->cdev, &region->device);
+}
+
 static int nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev,
 					  u64 *pegmphys, u64 *pegmlength,
 					  u64 *pegmpxm)
@@ -346,6 +357,32 @@ static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev,
 	memunmap(memaddr);
 }
 
+static int add_gpu(struct egm_region *region, struct pci_dev *pdev)
+{
+	struct gpu_node *node;
+
+	node = kvzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	node->pdev = pdev;
+
+	list_add_tail(&node->list, &region->gpus);
+	return 0;
+}
+
+static void remove_gpu(struct egm_region *region, struct pci_dev *pdev)
+{
+	struct gpu_node *node, *tmp;
+
+	list_for_each_entry_safe(node, tmp, &region->gpus, list) {
+		if (node->pdev == pdev) {
+			list_del(&node->list);
+			kvfree(node);
+		}
+	}
+}
+
 int register_egm_node(struct pci_dev *pdev)
 {
 	struct egm_region *region = NULL;
@@ -356,11 +393,15 @@ int register_egm_node(struct pci_dev *pdev)
 	if (ret)
 		return ret;
 
+	/* Check if region already exists */
 	list_for_each_entry(region, &egm_list, list) {
-		if (region->egmphys == egmphys)
-			return 0;
+		if (region->egmphys == egmphys) {
+			/* Add GPU to existing region */
+			return add_gpu(region, pdev);
+		}
 	}
 
+	/* Create new region */
 	region = kvzalloc(sizeof(*region), GFP_KERNEL);
 	if (!region)
 		return -ENOMEM;
@@ -370,28 +411,33 @@ int register_egm_node(struct pci_dev *pdev)
 	region->egmpxm = egmpxm;
 
 	hash_init(region->htbl);
+	INIT_LIST_HEAD(&region->gpus);
+
 	atomic_set(&region->open_count, 0);
 
 	nvgrace_egm_fetch_bad_pages(pdev, region);
 
 	ret = setup_egm_chardev(region);
 	if (ret)
-		goto err;
+		goto err_free_region;
 
 	list_add_tail(&region->list, &egm_list);
 
+	ret = add_gpu(region, pdev);
+	if (ret)
+		goto err_remove_from_list;
+
 	return 0;
-err:
+
+err_remove_from_list:
+	list_del(&region->list);
+	destroy_egm_chardev(region);
+err_free_region:
 	kfree(region);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_egm_node);
 
-static void destroy_egm_chardev(struct egm_region *region)
-{
-	cdev_device_del(&region->cdev, &region->device);
-}
-
 void unregister_egm_node(struct pci_dev *pdev)
 {
 	struct egm_region *region, *temp_region;
@@ -407,6 +453,10 @@ void unregister_egm_node(struct pci_dev *pdev)
 
 	list_for_each_entry_safe(region, temp_region, &egm_list, list) {
 		if (egmpxm == region->egmpxm) {
+			remove_gpu(region, pdev);
+			if (!list_empty(&region->gpus))
+				break;
+
 			hash_for_each_safe(region->htbl, bkt, temp_node, cur_page, node) {
 				hash_del(&cur_page->node);
 				vfree(cur_page);

From 4b6b81c273ec34e4bbe95733d659e87126103604 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Tue, 6 May 2025 09:39:33 -0500
Subject: [PATCH 141/147] NVIDIA: SAUCE: vfio/nvgrace-egm: list gpus through
 sysfs

To replicate the host EGM topology in the VM in terms of GPU affinity,
userspace needs to be aware of which GPUs belong to the same socket as
the EGM region.

Expose the list of GPUs associated with an EGM region through sysfs.
The list can be queried from the location
/sys/devices/virtual/egm/egmX/gpu_devices.

Signed-off-by: Ankit Agrawal
Ref: sj24: /home/nvidia/ankita/kernel_patches/0002_vfio_nvgrace-egm_list_gpus_through_sysfs.patch
(koba: Enhance error handling for sysfs_create_group)
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit fec2356d20f7054c0c89b1d32e7862bba34bda54
 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 41 +++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index c081f7b611409..78dabd437559e 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -357,6 +357,38 @@ static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev,
 	memunmap(memaddr);
 }
 
+static ssize_t gpu_devices_show(struct device *dev, struct device_attribute *attr,
+				char *buf)
+{
+	struct egm_region *region =
+		container_of(dev, struct egm_region, device);
+	struct gpu_node *node, *temp_node;
+	int len = 0;
+
+	list_for_each_entry_safe(node, temp_node, &region->gpus, list) {
+		struct pci_dev *pdev = node->pdev;
+
+		len += sysfs_emit_at(buf, len, "%04x:%02x:%02x.%x\n",
+				     pci_domain_nr(pdev->bus),
+				     pdev->bus->number,
+				     PCI_SLOT(pdev->devfn),
+				     PCI_FUNC(pdev->devfn));
+	}
+
+	return len;
+}
+
+static DEVICE_ATTR_RO(gpu_devices);
+
+static struct attribute *attrs[] = {
+	&dev_attr_gpu_devices.attr,
+	NULL,
+};
+
+static struct attribute_group attr_group = {
+	.attrs = attrs,
+};
+
 static int add_gpu(struct egm_region *region, struct pci_dev *pdev)
 {
 	struct gpu_node *node;
@@ -423,12 +455,18 @@ int register_egm_node(struct pci_dev *pdev)
 
 	list_add_tail(&region->list, &egm_list);
 
-	ret = add_gpu(region, pdev);
+	ret = sysfs_create_group(&region->device.kobj, &attr_group);
 	if (ret)
 		goto err_remove_from_list;
 
+	ret = add_gpu(region, pdev);
+	if (ret)
+		goto err_remove_sysfs;
+
 	return 0;
 
+err_remove_sysfs:
+	sysfs_remove_group(&region->device.kobj, &attr_group);
 err_remove_from_list:
 	list_del(&region->list);
 	destroy_egm_chardev(region);
@@ -462,6 +500,7 @@ void unregister_egm_node(struct pci_dev *pdev)
 				vfree(cur_page);
 			}
 
+			sysfs_remove_group(&region->device.kobj, &attr_group);
 			destroy_egm_chardev(region);
 			list_del(&region->list);
 			kfree(region);

From 2e86858ad5a95e53a59d3d8420c34bdd8258498b Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Tue, 6 May 2025 09:40:16 -0500
Subject: [PATCH 142/147] NVIDIA: SAUCE: vfio/nvgrace-egm: expose the egm size
 through sysfs

To allocate the EGM, userspace needs to know its size. Currently, there
is no easy way for userspace to determine that.
Make nvgrace-egm expose the size through sysfs so that it can be
queried by userspace from /sys/devices/virtual/egm/egmX/egm_size.

Signed-off-by: Ankit Agrawal
Ref: sj24: /home/nvidia/ankita/kernel_patches/0003_vfio_nvgrace-egm_expose_the_egm_size_through_sysfs.patch
Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit dcdcef245e8d648d38ef75f1023c7437b5639ddf
 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 78dabd437559e..b1a0ab9e35939 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -380,8 +380,19 @@ static ssize_t gpu_devices_show(struct device *dev, struct device_attribute *att
 
 static DEVICE_ATTR_RO(gpu_devices);
 
+static ssize_t egm_size_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	struct egm_region *region =
+		container_of(dev, struct egm_region, device);
+	return sysfs_emit(buf, "0x%lx\n", region->egmlength);
+}
+
+static DEVICE_ATTR_RO(egm_size);
+
 static struct attribute *attrs[] = {
 	&dev_attr_gpu_devices.attr,
+	&dev_attr_egm_size.attr,
 	NULL,
 };

From 204a92344aea3bb27c1874129b9bcb80ceea52c4 Mon Sep 17 00:00:00 2001
From: kobakonvidia
Date: Mon, 26 May 2025 16:48:35 +0000
Subject: [PATCH 143/147] NVIDIA: SAUCE: vfio/nvgrace-egm: Add null pointer
 checks after memory allocations

Add missing null pointer checks after vzalloc() calls in the NVIDIA
Grace GPU driver's EGM (External GPU Memory) handling code. This
prevents potential null pointer dereferences in the memory failure
handling and bad page fetching functions, providing proper error
handling for allocation failures.

Signed-off-by: Koba Ko
Acked-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Signed-off-by: Matthew R. Ochs
(cherry picked from commit 63127e2996a244841309ee86b4535f41e2b0de1f
 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index b1a0ab9e35939..6cdcec03d03f6 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -62,6 +62,8 @@ nvgrace_egm_pfn_memory_failure(struct pfn_address_space *pfn_space,
 	 * MM has called to notify a poisoned page. Track that in the hastable.
 	 */
 	ecc = (struct h_node *)(vzalloc(sizeof(struct h_node)));
+	if (!ecc)
+		return; /* Silently fail on allocation error */
 	ecc->mem_offset = mem_offset;
 	hash_add(region->htbl, &ecc->node, ecc->mem_offset);
 }
@@ -349,6 +351,8 @@ static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev,
 	 * apps.
 	 */
 	retired_page = (struct h_node *)(vzalloc(sizeof(struct h_node)));
+	if (!retired_page)
+		continue; /* Skip this entry on allocation failure */
 	retired_page->mem_offset = *((u64 *)memaddr + index + 1) -
 				   region->egmphys;
 	hash_add(region->htbl, &retired_page->node, retired_page->mem_offset);

From ee4d180594065a67fe13b65263c18357e18b8f66 Mon Sep 17 00:00:00 2001
From: Nirmoy Das
Date: Mon, 14 Jul 2025 06:56:53 -0700
Subject: [PATCH 144/147] NVIDIA: SAUCE: arm64: configs: enable NVGRACE_EGM as
 module

Add CONFIG_NVGRACE_EGM with policy 'm' for the arm64 architecture.
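With the module enabled and the gpu_devices and egm_size attributes
from the two sysfs patches above in place, a minimal userspace consumer
might look like the following sketch (hypothetical egm0 instance,
minimal error handling):

  #include <stdio.h>

  int main(void)
  {
  	char line[64];
  	FILE *f;

  	/* EGM region size, printed by the driver as "0x..." */
  	f = fopen("/sys/devices/virtual/egm/egm0/egm_size", "r");
  	if (f) {
  		if (fgets(line, sizeof(line), f))
  			printf("egm0 size: %s", line);
  		fclose(f);
  	}

  	/* one PCI BDF per line for each GPU on this socket */
  	f = fopen("/sys/devices/virtual/egm/egm0/gpu_devices", "r");
  	if (f) {
  		while (fgets(line, sizeof(line), f))
  			printf("egm0 gpu: %s", line);
  		fclose(f);
  	}
  	return 0;
  }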
Signed-off-by: Nirmoy Das
---
 debian.nvidia-6.14/config/annotations | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/debian.nvidia-6.14/config/annotations b/debian.nvidia-6.14/config/annotations
index 739111f9002e4..ac567b474d4b1 100644
--- a/debian.nvidia-6.14/config/annotations
+++ b/debian.nvidia-6.14/config/annotations
@@ -191,12 +191,13 @@ CONFIG_CC_HAS_KASAN_SW_TAGS                     policy<{'amd64': '-', 'arm64': '
 CONFIG_CC_HAS_MIN_FUNCTION_ALIGNMENT            policy<{'amd64': '-', 'arm64': '-'}>
 CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT           policy<{'amd64': '-', 'arm64': '-'}>
 CONFIG_CC_VERSION_TEXT                          policy<{'amd64': '"x86_64-linux-gnu-gcc-13 (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"', 'arm64': '"aarch64-linux-gnu-gcc-13 (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"'}>
-CONFIG_CPUFREQ_ARCH_CUR_FREQ                    policy<{'amd64': 'y'}>
 CONFIG_DRM_PANIC_SCREEN_QR_CODE                 policy<{'amd64': '-', 'arm64': '-'}>
+CONFIG_CPUFREQ_ARCH_CUR_FREQ                    policy<{'amd64': 'y'}>
 CONFIG_GCC_VERSION                              policy<{'amd64': '130300', 'arm64': '130300'}>
 CONFIG_HAVE_RUST                                policy<{'amd64': 'y', 'arm64': '-'}>
 CONFIG_IOMMUFD_VFIO_CONTAINER                   policy<{'arm64': 'y'}>
 CONFIG_LD_VERSION                               policy<{'amd64': '24200', 'arm64': '24200'}>
+CONFIG_NVGRACE_EGM                              policy<{'arm64': 'm'}>
 CONFIG_MTD_NAND_CORE                            policy<{'amd64': 'm', 'arm64': 'y'}>
 CONFIG_NVIDIA_FFA_EC                            policy<{'arm64': 'y'}>
 CONFIG_PAHOLE_VERSION                           policy<{'amd64': '125', 'arm64': '125'}>

From e7fe10596130e90dda88a6cc7e55453baa60a2e0 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Mon, 30 Jun 2025 13:14:04 -0700
Subject: [PATCH 145/147] NVIDIA: SAUCE: vfio/nvgrace-gpu: Avoid resmem pfn
 unregistration

On platforms without the mig HW bug (e.g. Grace-Blackwell), there is no
requirement to create the resmem region. Accordingly, this region is
not configured on these platforms, which leads to the following print
when the device is closed:

resource: Trying to free nonexistent resource <0x0000000000000000-0x000000000000ffff>

Avoid calling unregister_pfn_address_space for resmem when the region
is not being used.

Fixes: 2d21b7bd040a ("vfio/nvgrace-gpu: register device memory for poison handling")
Signed-off-by: Matthew R. Ochs
Acked-by: Carol L. Soto
Acked-by: Nirmoy Das
Signed-off-by: Matthew R. Ochs
(cherry picked from commit bd0187d9823f0bd46095e9406d2feb8fc5d07a75
 https://github.com/NVIDIA/NV-Kernels/tree/24.04_linux-nvidia-adv-6.11-next)
Signed-off-by: Nirmoy Das
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 53c07d219a67c..749f12b51f025 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -239,7 +239,8 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
 	mutex_destroy(&nvdev->remap_lock);
 
 #ifdef CONFIG_MEMORY_FAILURE
-	unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
+	if (nvdev->resmem.memlength)
+		unregister_pfn_address_space(&nvdev->resmem.pfn_address_space);
 	unregister_pfn_address_space(&nvdev->usemem.pfn_address_space);
 #endif
 	vfio_pci_core_close_device(core_vdev);
@@ -320,9 +321,10 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
 
 #ifdef CONFIG_MEMORY_FAILURE
 	vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
-	if (index == VFIO_PCI_BAR2_REGION_INDEX)
+	if (index == VFIO_PCI_BAR2_REGION_INDEX) {
+		WARN_ON_ONCE(!nvdev->has_mig_hw_bug);
 		ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma);
-	else
+	} else
 		ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma);
 #endif

From adec7f33bd3e86d7774baf16c3b9f77e7e2e1a65 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Fri, 25 Jul 2025 07:30:06 -0700
Subject: [PATCH 146/147] NVIDIA: [Config] Fix CONFIG_IRQ_MSI_IOMMU annotation
 for amd64

Commit 222675c87d3d ("irqchip: Have CONFIG_IRQ_MSI_IOMMU be selected by
irqchips that need it") changed the behavior of CONFIG_IRQ_MSI_IOMMU to
a dynamic selection, so it might not always be needed by amd64 builds.

Signed-off-by: Matthew R. Ochs
---
 debian.nvidia-6.14/config/annotations | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/debian.nvidia-6.14/config/annotations b/debian.nvidia-6.14/config/annotations
index ac567b474d4b1..b7df95c86bb93 100644
--- a/debian.nvidia-6.14/config/annotations
+++ b/debian.nvidia-6.14/config/annotations
@@ -191,14 +191,15 @@ CONFIG_CC_HAS_KASAN_SW_TAGS                     policy<{'amd64': '-', 'arm64': '
 CONFIG_CC_HAS_MIN_FUNCTION_ALIGNMENT            policy<{'amd64': '-', 'arm64': '-'}>
 CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT           policy<{'amd64': '-', 'arm64': '-'}>
 CONFIG_CC_VERSION_TEXT                          policy<{'amd64': '"x86_64-linux-gnu-gcc-13 (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"', 'arm64': '"aarch64-linux-gnu-gcc-13 (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"'}>
-CONFIG_DRM_PANIC_SCREEN_QR_CODE                 policy<{'amd64': '-', 'arm64': '-'}>
 CONFIG_CPUFREQ_ARCH_CUR_FREQ                    policy<{'amd64': 'y'}>
+CONFIG_DRM_PANIC_SCREEN_QR_CODE                 policy<{'amd64': '-', 'arm64': '-'}>
 CONFIG_GCC_VERSION                              policy<{'amd64': '130300', 'arm64': '130300'}>
 CONFIG_HAVE_RUST                                policy<{'amd64': 'y', 'arm64': '-'}>
 CONFIG_IOMMUFD_VFIO_CONTAINER                   policy<{'arm64': 'y'}>
+CONFIG_IRQ_MSI_IOMMU                            policy<{'amd64': '-', 'arm64': 'y'}>
 CONFIG_LD_VERSION                               policy<{'amd64': '24200', 'arm64': '24200'}>
-CONFIG_NVGRACE_EGM                              policy<{'arm64': 'm'}>
 CONFIG_MTD_NAND_CORE                            policy<{'amd64': 'm', 'arm64': 'y'}>
+CONFIG_NVGRACE_EGM                              policy<{'arm64': 'm'}>
 CONFIG_NVIDIA_FFA_EC                            policy<{'arm64': 'y'}>
 CONFIG_PAHOLE_VERSION                           policy<{'amd64': '125', 'arm64': '125'}>
 CONFIG_PINCTRL_MT8901                           policy<{'arm64': 'y'}>

From 043885e2bf690beab5113b098c73c2868f4e42a8 Mon Sep 17 00:00:00 2001
From: Lizhi Xu
Date: Mon, 14 Apr 2025 15:11:23 +0800
Subject: [PATCH 147/147] software node: Prevent link creation failure from
 causing kobj reference count imbalance

syzbot reported a uaf in software_node_notify_remove. [1]

When either of the two sysfs_create_link() calls in
software_node_notify() fails, the swnode->kobj reference count is not
incremented as usual, so the resulting kobj reference count imbalance
causes swnode to be released prematurely when
software_node_notify_remove() executes.

Increase the reference count of kobj before creating the link to avoid
the uaf.
[1]
BUG: KASAN: slab-use-after-free in software_node_notify_remove+0x1bc/0x1c0 drivers/base/swnode.c:1108
Read of size 1 at addr ffff888033c08908 by task syz-executor105/5844

Freed by task 5844:
 software_node_notify_remove+0x159/0x1c0 drivers/base/swnode.c:1106
 device_platform_notify_remove drivers/base/core.c:2387 [inline]

Fixes: 9eb59204d519 ("iommufd/selftest: Add set_dev_pasid in mock iommu")
Reported-by: syzbot+2ff22910687ee0dfd48e@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=2ff22910687ee0dfd48e
Tested-by: syzbot+2ff22910687ee0dfd48e@syzkaller.appspotmail.com
Signed-off-by: Lizhi Xu
Reviewed-by: Sakari Ailus
Reviewed-by: Andy Shevchenko
Link: https://lore.kernel.org/r/20250414071123.1228331-1-lizhi.xu@windriver.com
Signed-off-by: Greg Kroah-Hartman
(cherry picked from commit bc2c46426f2d95e58c82f394531afdd034c8706c)
Signed-off-by: Nirmoy Das
---
 drivers/base/swnode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c
index b1726a3515f6f..5c78fa6ae7725 100644
--- a/drivers/base/swnode.c
+++ b/drivers/base/swnode.c
@@ -1080,6 +1080,7 @@ void software_node_notify(struct device *dev)
 	if (!swnode)
 		return;
 
+	kobject_get(&swnode->kobj);
 	ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node");
 	if (ret)
 		return;
@@ -1089,8 +1090,6 @@ void software_node_notify(struct device *dev)
 		sysfs_remove_link(&dev->kobj, "software_node");
 		return;
 	}
-
-	kobject_get(&swnode->kobj);
 }
 
 void software_node_notify_remove(struct device *dev)
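For reference, the invariant the final patch restores can be stated as
a short sketch (names taken from the patch above; illustrative, not the
verbatim driver-core source):

	/*
	 * software_node_notify_remove() drops the swnode->kobj reference
	 * unconditionally, so the notify path must take that reference
	 * before any step that can fail:
	 */
	kobject_get(&swnode->kobj);
	ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node");
	if (ret)
		return;	/* the later put is still balanced by the get above */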