
Commit fb4c91e

PerkzZheng authored and saltyminty committed
fix: pass skip_softmax_threshold_scale_factor to prefill wrapper in test
The wrapper consistency check in _test_trtllm_batch_prefill was calling wrapper_trtllm_gen.run() without skip_softmax_threshold_scale_factor, causing it to default to None (standard attention kernel) while the raw API used 1e-30 (skip-softmax kernel variant). Different cubin kernels produce bitwise-different results, failing the exact-equality assert. The decode counterpart was already fixed; this mirrors that fix for the prefill test path.
1 parent c9eb3cd

1 file changed: tests/attention/test_trtllm_gen_attention.py (1 addition & 0 deletions)
@@ -849,6 +849,7 @@ def _test_trtllm_batch_prefill(
         v_scale=v_scale / o_scale,
         enable_pdl=enable_pdl,
         sinks=(sink if enable_sink else None),
+        skip_softmax_threshold_scale_factor=skip_softmax_threshold_scale_factor,
     )
     # v_scale, o_scale in wrapper is emulated by multiplying output by v_scale instead of fused into kernel.
     if v_scale == o_scale == 1.0:
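
For context, here is a minimal sketch of the failure mode this commit fixes, using a hypothetical run_kernel stand-in (not FlashInfer's actual API): None selects the standard attention kernel, a tiny positive threshold selects the skip-softmax variant, and the two cubins are not bitwise identical, so both call paths in a consistency check must receive the same flag.

import torch

# Hypothetical stand-in for kernel dispatch (illustrative only): the flag
# selects a different cubin, which perturbs the output at a small scale.
def run_kernel(q, skip_softmax_threshold_scale_factor=None):
    out = torch.softmax(q, dim=-1)
    if skip_softmax_threshold_scale_factor is not None:
        out = out + 1e-7  # emulate the bitwise-different kernel variant
    return out

q = torch.randn(4, 8)
scale = 1e-30  # the value used by the raw API in the test

raw_out = run_kernel(q, skip_softmax_threshold_scale_factor=scale)

# Buggy check (before this commit): the wrapper path omits the kwarg,
# so it defaults to None and dispatches the other kernel variant.
wrapper_out_buggy = run_kernel(q)
assert not torch.equal(raw_out, wrapper_out_buggy)  # exact-equality assert fails

# Fixed check (this commit): thread the same flag through both paths.
wrapper_out_fixed = run_kernel(q, skip_softmax_threshold_scale_factor=scale)
assert torch.equal(raw_out, wrapper_out_fixed)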
