 import jax
 import jax.numpy as jnp
 from flax import nnx
-from jax.sharding import NamedSharding
 from jax.sharding import PartitionSpec as P
 from transformers import UMT5Config

@@ -261,48 +260,98 @@ def __call__(

     def _native_attention(self, q, k, v, forward_batch: ForwardBatch):
         """Native attention for encoder/cross-attention with T5 position bias."""
-        num_tokens, hidden = q.shape[0], q.shape[-1]
-        head_dim = hidden // self.n_heads
-
-        # Reshape to [heads, tokens, head_dim]
-        def to_heads(x):
-            n_tok = x.shape[0]
-            return jnp.transpose(x.reshape(n_tok, self.n_heads, head_dim), (1, 0, 2))
-
-        q_h, k_h, v_h = to_heads(q), to_heads(k), to_heads(v)
-
-        # Compute scores in float32
-        scores = jnp.einsum("hqd,hkd->hqk", q_h.astype(jnp.float32), k_h.astype(jnp.float32))
+        hidden = q.shape[-1]
+        head_dim = self.d_kv  # T5 uses d_kv as head dimension, not hidden // n_heads
+        n_heads = self.n_heads  # Capture as local variable for closure
+        is_cross_attn = self.is_cross_attention  # Capture as local variable
+        has_rel_bias = hasattr(self, "rel_bias")  # Capture as local variable
+
+        # Debug: print dimensions
+        jax.debug.print(
+            "UMT5 _native_attention: q.shape={q_shape}, hidden={hidden}, d_kv={d_kv}, n_heads={n_heads}, inner_dim={inner_dim}",
+            q_shape=q.shape,
+            hidden=hidden,
+            d_kv=head_dim,
+            n_heads=n_heads,
+            inner_dim=self.inner_dim,
+        )

         # Get sequence lengths
         q_lens = getattr(forward_batch, "extend_seq_lens", forward_batch.seq_lens)
         # Fallback if seq_lens is None: assume single sequence
         if q_lens is None:
             q_lens = jnp.array([q.shape[0]], dtype=jnp.int32)

-        # Add position bias for self-attention (T5-specific)
-        if not self.is_cross_attention and hasattr(self, "rel_bias"):
-            pos_bias = self._compute_position_bias(q_lens, q.shape[0], k.shape[0])
-            scores = scores + pos_bias.astype(jnp.float32)
+        rel_bias_weight = self.rel_bias.embedding.value if hasattr(self, "rel_bias") else None

-        # Apply masking
         kv_lens = (
             getattr(forward_batch, "encoder_seq_lens", q_lens)
             if self.is_cross_attention
             else q_lens
         )
         is_causal = self.is_decoder and not self.is_cross_attention

-        # Apply block_diagonal_mask
-        scores = _apply_block_diagonal_mask(scores, q_lens, kv_lens, is_causal=is_causal)
+        # Wrap computation in shard_map for data parallelism
+        in_specs = (
+            P("data", "tensor"),  # q
+            P("data", "tensor"),  # k
+            P("data", "tensor"),  # v
+            P("data"),  # q_lens
+            P("data"),  # kv_lens
+            P(None, "tensor"),  # rel_bias_weight
+        )
+        out_specs = P("data", "tensor")
+
+        def _compute_attention(q_local, k_local, v_local, q_lens_local, kv_lens_local, rel_weight):
+            # Debug: print local shapes inside shard_map
+            jax.debug.print(
+                "Inside shard_map: q_local.shape={q_shape}, n_heads={n_heads}, head_dim={head_dim}",
+                q_shape=q_local.shape,
+                n_heads=n_heads,
+                head_dim=head_dim,
+            )
+            local_n_heads = q_local.shape[-1] // head_dim
+            local_hidden = q_local.shape[-1]
+
+            # Reshape to [heads, tokens, head_dim]
+            def to_heads(x):
+                n_tok = x.shape[0]
+                return jnp.transpose(x.reshape(n_tok, local_n_heads, head_dim), (1, 0, 2))
+
+            q_h, k_h, v_h = to_heads(q_local), to_heads(k_local), to_heads(v_local)

-        # Softmax and weighted sum
-        weights = jax.nn.softmax(scores, axis=-1)
-        out = jnp.einsum("hqk,hkd->hqd", weights, v_h.astype(jnp.float32))
+            # Compute scores in float32
+            scores = jnp.einsum("hqd,hkd->hqk", q_h.astype(jnp.float32), k_h.astype(jnp.float32))

-        return jnp.transpose(out, (1, 0, 2)).reshape(num_tokens, hidden)
+            # Add position bias for self-attention (T5-specific)
+            if not is_cross_attn and has_rel_bias:
+                pos_bias = self._compute_position_bias(
+                    q_lens_local, q_local.shape[0], k_local.shape[0], rel_weight
+                )
+                scores = scores + pos_bias.astype(jnp.float32)

-    def _compute_position_bias(self, seq_lens, q_len, k_len):
+            # Apply block_diagonal_mask
+            scores = _apply_block_diagonal_mask(
+                scores, q_lens_local, kv_lens_local, is_causal=is_causal
+            )
+
+            # Softmax and weighted sum
+            weights = jax.nn.softmax(scores, axis=-1)
+            out = jnp.einsum("hqk,hkd->hqd", weights, v_h.astype(jnp.float32))
+
+            return jnp.transpose(out, (1, 0, 2)).reshape(q_local.shape[0], local_hidden)
+
+        result = jax.shard_map(
+            _compute_attention,
+            mesh=self.mesh,
+            in_specs=in_specs,
+            out_specs=out_specs,
+            check_vma=False,
+        )(q, k, v, q_lens, kv_lens, rel_bias_weight)
+
+        return result
+
+    def _compute_position_bias(self, seq_lens, q_len, k_len, rel_weight):
         """Compute T5 position bias [heads, q_len, k_len]."""
         starts = jnp.cumsum(seq_lens) - seq_lens
         indicators = jnp.zeros(q_len, dtype=jnp.int32).at[starts].set(1)
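
This hunk moves the whole attention body into a per-device function and invokes it through `jax.shard_map`, so each device only ever sees its own slice of tokens (the `"data"` axis) and heads (the `"tensor"` axis). Not part of the commit, but as a reference for the pattern, here is a minimal standalone sketch with a hypothetical mesh and function (`scale_local`); it assumes `jax.shard_map` with the same keyword signature used above and a data axis spanning all local devices:

```python
import jax
import jax.numpy as jnp
from jax.sharding import PartitionSpec as P

# Hypothetical stand-in for self.mesh: a ("data", "tensor") mesh over all local devices.
mesh = jax.make_mesh((jax.device_count(), 1), ("data", "tensor"))

def scale_local(x_local):
    # Runs once per device; x_local is only this shard's [local_tokens, local_hidden] block.
    return x_local * 2.0

x = jnp.ones((4 * mesh.shape["data"], 16), dtype=jnp.float32)
y = jax.shard_map(
    scale_local,
    mesh=mesh,
    in_specs=P("data", "tensor"),   # how the inputs are split across the mesh
    out_specs=P("data", "tensor"),  # how the per-device outputs are reassembled
)(x)
print(y.shape)  # global (8 * data_axis_size // 2, 16) shape, stitched back together
```

The same reasoning explains why `_compute_attention` derives `local_n_heads` from `q_local.shape[-1] // head_dim` inside the mapped function: with the hidden dimension split over `"tensor"`, each shard holds only a fraction of the heads.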
@@ -318,7 +367,8 @@ def _compute_position_bias(self, seq_lens, q_len, k_len):
318367 num_buckets = self .num_buckets ,
319368 max_distance = self .max_distance ,
320369 )
321- return jnp .transpose (self .rel_bias (buckets ), (2 , 0 , 1 ))
370+ bias = rel_weight [buckets ]
371+ return jnp .transpose (bias , (2 , 0 , 1 ))
322372
323373
324374# =============================================================================
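
For reference, the `buckets` array that `rel_weight[buckets]` indexes follows the standard T5 relative-position bucketing (exact buckets for nearby offsets, log-spaced buckets out to `max_distance`). The sketch below is not from this file; `relative_position_bucket` is an illustrative name for whatever helper the module uses with the `num_buckets`/`max_distance` arguments shown above:

```python
import jax.numpy as jnp

def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
    # Standard T5 bucketing: exact buckets for small offsets, log-spaced buckets for large ones.
    relative_buckets = jnp.zeros_like(relative_position)
    if bidirectional:
        num_buckets //= 2
        relative_buckets += (relative_position > 0).astype(jnp.int32) * num_buckets
        relative_position = jnp.abs(relative_position)
    else:
        relative_position = -jnp.minimum(relative_position, 0)
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact
    # Log-spaced bucket for large distances (clamped to avoid log(0); only used where is_small is False).
    scaled = jnp.log(jnp.maximum(relative_position, 1).astype(jnp.float32) / max_exact)
    scaled = scaled / jnp.log(max_distance / max_exact) * (num_buckets - max_exact)
    large_bucket = jnp.minimum(max_exact + scaled.astype(jnp.int32), num_buckets - 1)
    return relative_buckets + jnp.where(is_small, relative_position, large_bucket)

# Bucket ids for a [q_len, k_len] grid; rel_weight[buckets] then has shape
# [q_len, k_len, n_heads] and is transposed to [heads, q_len, k_len] as above.
q_len, k_len = 5, 5
buckets = relative_position_bucket(jnp.arange(k_len)[None, :] - jnp.arange(q_len)[:, None])
```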
@@ -467,8 +517,9 @@ def __call__(self, forward_batch: ForwardBatch, token_to_kv_pool=None, logits_me

         # Dummy logits for interface compatibility
         bs = forward_batch.seq_lens.shape[0]
-        dummy = jnp.zeros((bs, self.config.vocab_size), dtype=self.dtype)
-        dummy = jax.sharding.reshard(dummy, NamedSharding(self.mesh, P(None, "tensor")))
+        dummy = jnp.zeros(
+            (bs, self.config.vocab_size), dtype=self.dtype, out_sharding=P("data", "tensor")
+        )
         return LogitsProcessorOutput(next_token_logits=dummy, hidden_states=hidden), [], [], None

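
This last hunk swaps the allocate-then-reshard pattern for a direct sharded allocation via `out_sharding`. A hedged sketch of that idea, assuming a JAX version whose array-creation functions accept `out_sharding` and an already-active explicit mesh with `"data"`/`"tensor"` axes; `make_dummy_logits` is a hypothetical helper name, not something from the commit:

```python
import jax.numpy as jnp
from jax.sharding import PartitionSpec as P

def make_dummy_logits(bs, vocab_size, dtype=jnp.bfloat16):
    # Allocate directly in the target layout instead of building a replicated
    # array and resharding it afterwards (the pattern the hunk removes).
    return jnp.zeros((bs, vocab_size), dtype=dtype, out_sharding=P("data", "tensor"))
```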