Skip to content

Commit c24f8a8

Browse files
liuhao2638 authored and claude committed
fix: skip llm_int8_linear on V100, fix sparse_attention columns shape
- test_ai_quantized_linear: Add _is_ampere_or_above() check to skip TestLlmInt8Linear tests on GPUs with compute capability < 8.0 (CI V100 has sm_70, where cublasLtMatmul returns CUBLAS_STATUS_NOT_SUPPORTED)

- test_ai_sparse_attention: Fix columns tensor shape from total_nnz (B*H*S*nnz_per_row) to per_head_nnz (S*nnz_per_row), resolving a cuSPARSE dimension mismatch (nnz > matrix_size) and segfault

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f05eb90 commit c24f8a8

File tree

2 files changed

+26
-10
lines changed

2 files changed

+26
-10
lines changed

test/ai_edited_test/test_ai_quantized_linear.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,18 @@ def test_weight_only_linear_2d_input(self):
349349
self.skipTest(f"Unsupported arch or CUDA error: {e}")
350350

351351

352+
def _is_ampere_or_above():
353+
"""Check if GPU compute capability >= 8.0 (Ampere+).
354+
llm_int8_linear requires Ampere or newer architecture."""
355+
if not paddle.is_compiled_with_cuda():
356+
return False
357+
try:
358+
arch = _get_arch_info()
359+
return arch >= 80
360+
except (ValueError, RuntimeError):
361+
return False
362+
363+
352364
class TestLlmInt8Linear(unittest.TestCase):
353365
"""Test llm_int8_linear function.
354366
测试 llm_int8_linear 函数。"""
@@ -357,7 +369,8 @@ def setUp(self):
357369
paddle.disable_static()
358370

359371
@unittest.skipIf(
360-
not paddle.is_compiled_with_cuda(), "CUDA required for llm_int8_linear"
372+
not _is_ampere_or_above(),
373+
"llm_int8_linear requires Ampere+ (sm_80), skipped on CI V100 (sm_70)",
361374
)
362375
def test_llm_int8_linear_basic(self):
363376
"""Test basic llm_int8_linear without bias.
@@ -373,7 +386,8 @@ def test_llm_int8_linear_basic(self):
373386
self.skipTest(f"CUDA error: {e}")
374387

375388
@unittest.skipIf(
376-
not paddle.is_compiled_with_cuda(), "CUDA required for llm_int8_linear"
389+
not _is_ampere_or_above(),
390+
"llm_int8_linear requires Ampere+ (sm_80), skipped on CI V100 (sm_70)",
377391
)
378392
def test_llm_int8_linear_with_bias(self):
379393
"""Test llm_int8_linear with bias.
@@ -391,7 +405,8 @@ def test_llm_int8_linear_with_bias(self):
391405
self.skipTest(f"CUDA error: {e}")
392406

393407
@unittest.skipIf(
394-
not paddle.is_compiled_with_cuda(), "CUDA required for llm_int8_linear"
408+
not _is_ampere_or_above(),
409+
"llm_int8_linear requires Ampere+ (sm_80), skipped on CI V100 (sm_70)",
395410
)
396411
def test_llm_int8_linear_different_threshold(self):
397412
"""Test llm_int8_linear with different threshold.
@@ -406,7 +421,8 @@ def test_llm_int8_linear_different_threshold(self):
406421
self.skipTest(f"CUDA error: {e}")
407422

408423
@unittest.skipIf(
409-
not paddle.is_compiled_with_cuda(), "CUDA required for llm_int8_linear"
424+
not _is_ampere_or_above(),
425+
"llm_int8_linear requires Ampere+ (sm_80), skipped on CI V100 (sm_70)",
410426
)
411427
def test_llm_int8_linear_high_threshold(self):
412428
"""Test llm_int8_linear with high threshold (fewer outliers).

test/ai_edited_test/test_ai_sparse_attention.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def test_sparse_attention_output_shape(self):
8282
)
8383
# Each position attends to 2 positions
8484
nnz_per_row = 2
85-
total_nnz = batch_size * num_heads * seq_len * nnz_per_row
85+
nnz_per_head = seq_len * nnz_per_row
8686
offset = paddle.zeros(
8787
[batch_size, num_heads, seq_len + 1], dtype="int32"
8888
)
@@ -91,7 +91,7 @@ def test_sparse_attention_output_shape(self):
9191
for s in range(seq_len):
9292
offset[b, h, s + 1] = offset[b, h, s] + nnz_per_row
9393
columns = paddle.zeros(
94-
[batch_size, num_heads, total_nnz], dtype="int32"
94+
[batch_size, num_heads, nnz_per_head], dtype="int32"
9595
)
9696
# Each position attends to itself and the next position
9797
for b in range(batch_size):
@@ -263,11 +263,11 @@ def test_sparse_attention_multi_head(self):
263263
value = paddle.randn([batch, heads, seq, dim], dtype="float32")
264264
# Dense pattern: each row attends to all 4 positions
265265
nnz_per_row = 4
266+
nnz_per_head = seq * nnz_per_row
266267
offset = paddle.zeros([batch, heads, seq + 1], dtype="int32")
267268
for s in range(seq):
268269
offset[0, :, s + 1] = offset[0, :, s] + nnz_per_row
269-
total_nnz = batch * heads * seq * nnz_per_row
270-
columns = paddle.zeros([batch, heads, total_nnz], dtype="int32")
270+
columns = paddle.zeros([batch, heads, nnz_per_head], dtype="int32")
271271
for h in range(heads):
272272
for s in range(seq):
273273
base = offset[0, h, s].item()
@@ -323,11 +323,11 @@ def test_sparse_attention_different_head_dim(self):
323323
key = paddle.randn([1, 2, 4, head_dim], dtype="float32")
324324
value = paddle.randn([1, 2, 4, head_dim], dtype="float32")
325325
nnz_per_row = 4
326+
nnz_per_head = 4 * nnz_per_row
326327
offset = paddle.zeros([1, 2, 5], dtype="int32")
327328
for s in range(4):
328329
offset[0, :, s + 1] = offset[0, :, s] + nnz_per_row
329-
total_nnz = 1 * 2 * 4 * nnz_per_row
330-
columns = paddle.zeros([1, 2, total_nnz], dtype="int32")
330+
columns = paddle.zeros([1, 2, nnz_per_head], dtype="int32")
331331
for h in range(2):
332332
for s in range(4):
333333
base = offset[0, h, s].item()

0 commit comments

Comments (0)