[RL] Support Qwen 2.5 in RL weight transfer and model registry (#2456)

AlienKevin · claude · Gemini 3 Flash · web-flow · commit d96690a99c9c · 2026-01-27T01:19:10.000Z
This PR adds support for Qwen 2.5 models in the RL pipeline.

- Updates weight transfer logic and model mappings (handles bias keys and MHA/GQA differences).
- Registers `Qwen2ForCausalLM` in the `tpu_inference` model registry to fix missing architecture errors.

Fixes #2446

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Co-authored-by: Gemini 3 Flash <noreply@google.com>
diff --git a/lib/marin/src/marin/rl/environments/inference_ctx/vllm.py b/lib/marin/src/marin/rl/environments/inference_ctx/vllm.py
@@ -124,8 +124,25 @@ def _render_messages_to_tokens(self, messages: list[Message]) -> list[int]:
         """
         return self.renderer.build_generation_prompt(messages)
 
+    @staticmethod
+    def _patch_tpu_inference_registry():
+        """Register Qwen2ForCausalLM in tpu_inference if not present."""
+        try:
+            from tpu_inference.models.common import model_loader
+
+            if "Qwen2ForCausalLM" not in model_loader._MODEL_REGISTRY:
+                logger.info("Patching tpu_inference to support Qwen2ForCausalLM")
+                from tpu_inference.models.jax.qwen2 import Qwen2ForCausalLM
+
+                model_loader.register_model("Qwen2ForCausalLM", Qwen2ForCausalLM)
+        except ImportError:
+            logger.exception("Failed to patch tpu_inference registry")
+            raise
+
     @staticmethod
     def _get_llm_engine(inference_config: vLLMInferenceContextConfig):
+        vLLMInferenceContext._patch_tpu_inference_registry()
+
         if inference_config.mode == InferenceMode.SYNC:
             if LLM is None:
                 raise ImportError("vLLM is not installed. Please install it with: pip install vllm")
diff --git a/lib/marin/src/marin/rl/environments/inference_ctx/vllm_utils.py b/lib/marin/src/marin/rl/environments/inference_ctx/vllm_utils.py
@@ -66,6 +66,18 @@ def levanter_qwen_to_vllm_mapping():
         {
             "model.layers.*.self_attn.q_norm": ("model.layers.*.self_attn.q_norm.scale", (None,)),
             "model.layers.*.self_attn.k_norm": ("model.layers.*.self_attn.k_norm.scale", (None,)),
+            "model.layers.*.self_attn.q_proj_bias": (
+                "model.layers.*.self_attn.q_proj.bias",
+                ("model", None),
+            ),
+            "model.layers.*.self_attn.k_proj_bias": (
+                "model.layers.*.self_attn.k_proj.bias",
+                ("model", None),
+            ),
+            "model.layers.*.self_attn.v_proj_bias": (
+                "model.layers.*.self_attn.v_proj.bias",
+                ("model", None),
+            ),
         }
     )
     return mapping
@@ -80,9 +92,12 @@ def levanter_qwen_to_vllm_mapping():
     "k_proj": (2, 0, 1),
     "v_proj": (2, 0, 1),
     "o_proj": (1, 2, 0),
+    "q_proj_bias": (0, 1),
+    "k_proj_bias": (0, 1),
+    "v_proj_bias": (0, 1),
 }
 
-MODEL_MAPPINGS = {
+_MODEL_MAPPINGS = {
     "meta-llama/Llama-3.2-1B-Instruct": levanter_llama_to_vllm_mapping(),
     "meta-llama/Llama-3.2-3B-Instruct": levanter_llama_to_vllm_mapping(),
     "Qwen/Qwen3-0.6B": levanter_qwen_to_vllm_mapping(),
@@ -92,7 +107,7 @@ def levanter_qwen_to_vllm_mapping():
     "marin-community/marin-8b-instruct": levanter_llama_to_vllm_mapping(),
 }
 
-MODEL_TRANSPOSE_KEYS = {
+_MODEL_TRANSPOSE_KEYS = {
     "meta-llama/Llama-3.2-1B-Instruct": llama_transpose_keys,
     "meta-llama/Llama-3.2-3B-Instruct": llama_transpose_keys,
     "Qwen/Qwen3-0.6B": llama_transpose_keys,
@@ -101,3 +116,42 @@ def levanter_qwen_to_vllm_mapping():
     "Qwen/Qwen3-8B": llama_transpose_keys,
     "marin-community/marin-8b-instruct": llama_transpose_keys,
 }
+
+
+def _infer_mapping(model_name: str) -> dict:
+    """Infer the vLLM mapping for a model name, falling back to substring matching."""
+    if model_name in _MODEL_MAPPINGS:
+        return _MODEL_MAPPINGS[model_name]
+    if "Qwen2.5" in model_name:
+        return levanter_qwen_to_vllm_mapping()
+    raise KeyError(f"No MODEL_MAPPING registered for model: {model_name}")
+
+
+def _infer_transpose_keys(model_name: str) -> dict:
+    """Infer the transpose keys for a model name, falling back to substring matching."""
+    if model_name in _MODEL_TRANSPOSE_KEYS:
+        return _MODEL_TRANSPOSE_KEYS[model_name]
+    if "Qwen2.5" in model_name:
+        return llama_transpose_keys
+    raise KeyError(f"No MODEL_TRANSPOSE_KEYS registered for model: {model_name}")
+
+
+class _FallbackDict:
+    """Dict-like object that supports fallback lookup by substring matching."""
+
+    def __init__(self, fallback):
+        self._fallback = fallback
+
+    def __getitem__(self, key):
+        return self._fallback(key)
+
+    def __contains__(self, key):
+        try:
+            self._fallback(key)
+            return True
+        except KeyError:
+            return False
+
+
+MODEL_MAPPINGS = _FallbackDict(_infer_mapping)
+MODEL_TRANSPOSE_KEYS = _FallbackDict(_infer_transpose_keys)
diff --git a/lib/marin/src/marin/rl/weight_utils.py b/lib/marin/src/marin/rl/weight_utils.py
@@ -18,6 +18,18 @@
 from levanter.models.lm_model import LmHeadModel
 
 
+def _get_nnx_key_name(split_key: list[str]) -> str:
+    """
+    Determine the NNX key name from the split Levanter key.
+    If the key ends in 'bias', append '_bias' to the parameter name.
+    Otherwise (e.g. 'weight'), use the parameter name directly.
+    """
+    key_name = split_key[-2]
+    if split_key[-1] == "bias":
+        key_name = f"{key_name}_bias"
+    return key_name
+
+
 def levanter_to_nnx_state(levanter_model: LmHeadModel) -> dict:
     # The format of this state dict is flat like:
     # model.layers.0.self_attn.q_proj.weight -> jax array
@@ -46,7 +58,7 @@ def levanter_to_nnx_state(levanter_model: LmHeadModel) -> dict:
         # vLLM expects the weights to be padded to the next multiple of 128. I assume this is
         # because they want to use Pallas kernels which have this requirement.
         if "self_attn" in split_key_without_weight:
-            if "q_proj" in split_key_without_weight:
+            if "q_proj" in split_key_without_weight and len(value.shape) == 4:
                 kv_heads, q_heads_per_group, head_size, embed = value.shape
                 value = value.reshape(kv_heads * q_heads_per_group, head_size, embed)
 
@@ -67,7 +79,7 @@ def levanter_to_nnx_state(levanter_model: LmHeadModel) -> dict:
                     # pad 3rd dimension to 128 (e.g., (8, 2048, 64) -> (8, 2048, 128))
                     value = jnp.pad(value, ((0, 0), (0, 0), (0, next_multiple_of_128 - head_size)))
 
-        current[split_key_without_weight[-1]] = nnx.Param(value)
+        current[_get_nnx_key_name(split_key)] = nnx.Param(value)
     return nnx.State(nested_state_dict)
 
 
@@ -89,31 +101,46 @@ def levanter_state_dict_to_nnx_state_on_cpu(state_dict: dict) -> dict:
                     current[part] = {}
                 current = current[part]
 
-            # for q, k, v projections, we need to pad the 2nd dimension to next multiple of 128
-            # vLLM expects the weights to be padded to the next multiple of 128. I assume this is
-            # because they want to use Pallas kernels which have this requirement.
+            # vLLM expects weights/biases padded up to the next multiple of 128 (presumably for Pallas kernels).
             if "self_attn" in split_key_without_weight:
+                is_bias = split_key[-1] == "bias"
+
+                # Flatten grouped query heads -> (Total Heads, Head Dim, [Embed]) for vLLM
                 if "q_proj" in split_key_without_weight:
-                    kv_heads, q_heads_per_group, head_size, embed = value.shape
-                    value = value.reshape(kv_heads * q_heads_per_group, head_size, embed)
+                    if len(value.shape) == 4:
+                        # Weight: (KV, Group, HeadSize, Embed) -> (Heads, HeadSize, Embed)
+                        kv_heads, q_heads_per_group, head_size, embed = value.shape
+                        value = value.reshape(kv_heads * q_heads_per_group, head_size, embed)
+                    elif len(value.shape) == 3 and is_bias:
+                        # Bias: (KV, Group, HeadSize) -> (Heads, HeadSize)
+                        kv_heads, q_heads_per_group, head_size = value.shape
+                        value = value.reshape(kv_heads * q_heads_per_group, head_size)
 
+                # Pad the head dimension (dim 1) for Q/K/V projections
                 if (
                     "q_proj" in split_key_without_weight
                     or "k_proj" in split_key_without_weight
                     or "v_proj" in split_key_without_weight
                 ):
-                    _heads, head_size, embed = value.shape
-                    next_multiple_of_128 = ((head_size + 127) // 128) * 128
-                    if head_size < next_multiple_of_128:
-                        # pad 2nd dimension to 128 (e.g., (8, 64, 2048) -> (8, 128, 2048))
-                        value = jnp.pad(value, ((0, 0), (0, next_multiple_of_128 - head_size), (0, 0)))
+                    pad_axis = 1
+                    if len(value.shape) >= 2:
+                        head_size = value.shape[pad_axis]
+                        next_multiple_of_128 = ((head_size + 127) // 128) * 128
+
+                        if head_size < next_multiple_of_128:
+                            padding = [(0, 0)] * len(value.shape)
+                            padding[pad_axis] = (0, next_multiple_of_128 - head_size)
+                            value = jnp.pad(value, padding)
+
+                # Pad o_proj weights along the head dimension (dim 2)
                 elif "o_proj" in split_key_without_weight:
-                    embed, _heads, head_size = value.shape
-                    next_multiple_of_128 = ((head_size + 127) // 128) * 128
-                    if head_size < next_multiple_of_128:
-                        # pad 3rd dimension to 128 (e.g., (8, 2048, 64) -> (8, 2048, 128))
-                        value = jnp.pad(value, ((0, 0), (0, 0), (0, next_multiple_of_128 - head_size)))
+                    # Weight: (Embed, Heads, HeadSize). Biases and any non-3D value are left unpadded.
+                    if not is_bias and len(value.shape) == 3:
+                        embed, _heads, head_size = value.shape
+                        next_multiple_of_128 = ((head_size + 127) // 128) * 128
+                        if head_size < next_multiple_of_128:
+                            value = jnp.pad(value, ((0, 0), (0, 0), (0, next_multiple_of_128 - head_size)))
 
-            current[split_key_without_weight[-1]] = nnx.Param(value)
+            current[_get_nnx_key_name(split_key)] = nnx.Param(value)
 
         return nnx.State(nested_state_dict)