
Commit d46bb45

quick fix conversion
1 parent bc1f5a6 commit d46bb45

4 files changed: +42 −33 lines changed


examples/llama/convert_weights.py

Lines changed: 1 addition & 0 deletions
@@ -84,6 +84,7 @@ def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
         "use_cache": "use_cache",
         "vocab_size": "vocab_size",
         "attention_bias": "attention_bias",
+        "rope_interleaved": "rope_interleaved",
     }
     if nt_to_hf:
         return {nt: hf for hf, nt in hf_to_nt_map.items()}
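
The mapping is declared HF -> Nanotron and inverted on demand. Below is a minimal, self-contained sketch of that inversion using a trimmed-down map; the two keys shown are illustrative, not the full dict in convert_weights.py.

# Sketch of the direction flip in get_config_mapping (trimmed map, illustrative only).
hf_to_nt_map = {
    "attention_bias": "attention_bias",
    "rope_interleaved": "rope_interleaved",  # key added by this commit
}

def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]:
    # Nanotron -> HF is just the inverse of the HF -> Nanotron mapping.
    if nt_to_hf:
        return {nt: hf for hf, nt in hf_to_nt_map.items()}
    return hf_to_nt_map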

examples/llama/tests/test_conversion.py

Lines changed: 15 additions & 10 deletions
@@ -1,9 +1,14 @@
 # ruff: noqa: E402
 import dataclasses
 import json
+import sys
 from pathlib import Path
 from typing import Optional
 
+_this_file = Path(__file__).resolve()
+_nanotron_root = _this_file.parent.parent.parent.parent  # tests -> llama -> examples -> nanotron
+sys.path.insert(0, str(_nanotron_root))
+
 import pytest
 import torch
 from transformers import AutoModelForCausalLM, LlamaForCausalLM
@@ -12,6 +17,7 @@
 set_system_path()
 
 import nanotron
+
 from nanotron import distributed as dist
 from nanotron.config import LlamaConfig as NanotronLlamaConfig
 from nanotron.config import NanotronConfigs
@@ -84,7 +90,7 @@
     "tie_word_embeddings": False,
     "use_cache": True,
     "vocab_size": 4096,
-    "_attn_implementation": "sdpa",
+    "_attn_implementation": "flash_attention_2",
     "attention_bias": False,
     "rope_interleaved": False,
 }
@@ -117,8 +123,7 @@ def create_huggingface_model(model_name: Optional[str] = None) -> LlamaForCausal
         with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
            model_hf = LlamaForCausalLM._from_config(get_hf_config(CONFIG))
     else:
-        with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16):
-            model_hf = AutoModelForCausalLM.from_pretrained(model_name)
+        model_hf = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).cuda()
     return model_hf
 
 
@@ -336,13 +341,13 @@ def test_tensor_parallel_conversion(input_ids: torch.Tensor):
     # run all tests
     # test_nt_to_hf(input_ids=torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda"))
     # test_hf_to_nt(input_ids=torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda"))
-    test_tensor_parallel_conversion(
-        input_ids=torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda")
-    )
+    # test_tensor_parallel_conversion(
+    #     input_ids=torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda")
+    # )
 
     # Warning: Converting from HF to Nanotron is a better test because we don't initialize weights in standard way. (e.g. Layernorms)
     # Test SmolLM2-135M
-    # test_hf_to_nt(
-    #     input_ids=torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda"),
-    #     model_name="HuggingFaceTB/SmolLM2-135M",
-    # )
+    test_hf_to_nt(
+        input_ids=torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda"),
+        model_name="HuggingFaceTB/SmolLM2-135M",
+    )
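
As a standalone illustration of the new pretrained-loading branch in create_huggingface_model: a sketch only, with the model name mirroring the SmolLM2-135M test above and the helper name invented here for clarity.

# Sketch: load an HF checkpoint directly in bfloat16 and move it to the GPU,
# mirroring the updated else-branch of create_huggingface_model.
import torch
from transformers import AutoModelForCausalLM

def load_hf_model_bf16(model_name: str = "HuggingFaceTB/SmolLM2-135M"):
    # from_pretrained materializes the real checkpoint weights; passing torch_dtype
    # loads them in bfloat16 so no separate cast is needed afterwards.
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
    return model.cuda()  # place the already-casted weights on the GPU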

src/nanotron/models/qwen.py

Lines changed: 23 additions & 23 deletions
@@ -208,7 +208,7 @@ def __init__(
             max_seq_len=config.max_position_embeddings,
             base=config.rope_theta,
             interleaved=config.rope_interleaved,
-            seq_len_scaling_factor=config.rope_seq_len_scaling_factor,
+            seq_len_scaling_factor=config.rope_seq_len_interpolation_factor,
             fused=config._fused_rotary_emb,
         )
         self.attention = CoreAttention(config, tp_pg, cp_pg, layer_idx)
@@ -238,28 +238,28 @@ def forward(
 
         if self._use_qkv_packed:
             attn_output = self._forward_packed(qkv, seq_length, position_ids, cu_seqlens)
-        # else:
-        #     q, k, v = qkv.split(
-        #         [self.local_q_size, self.local_kv_size, self.local_kv_size], dim=-1
-        #     )  # [batch_size*seq_length, q_size], [batch_size*seq_length, kv_size]
-        #     q = q.view(-1, self.local_num_heads, self.head_dim)  # [b*s, num_heads, head_dim]
-        #     k = k.view(-1, self.local_num_kv_heads, self.head_dim)  # [b*s, num_kv_heads, head_dim]
-        #     v = v.view(-1, self.local_num_kv_heads, self.head_dim)  # [b*s, num_kv_heads, head_dim]
-        #     if self.config.no_rope_layer is None or (self.layer_idx + 1) % self.config.no_rope_layer != 0:
-        #         rotary_pos_emb = self.rotary_emb(
-        #             position_ids=position_ids if not self.simple_causal_mask else None, seq_length=seq_length
-        #         )  # [b*s, dim] or [seq_length, dim]
-        #         q = self.rotary_emb.apply_rotary_pos_emb(
-        #             q, rotary_pos_emb, seq_length=seq_length
-        #         )  # [b*s, num_heads, head_dim]
-        #         k = self.rotary_emb.apply_rotary_pos_emb(
-        #             k, rotary_pos_emb, seq_length=seq_length
-        #         )  # [b*s, num_kv_heads, head_dim]
-        #     else:
-        #         log_rank(f"skipping rotary for layer {self.layer_idx + 1}", logger=logger, level=logging.DEBUG, rank=0)
-        #     attn_output = self.attention(
-        #         q, k, v, position_ids=position_ids, seq_length=seq_length, cu_seqlens=cu_seqlens
-        #     )
+        else:
+            q, k, v = qkv.split(
+                [self.local_q_size, self.local_kv_size, self.local_kv_size], dim=-1
+            )  # [batch_size*seq_length, q_size], [batch_size*seq_length, kv_size]
+            q = q.view(-1, self.local_num_heads, self.head_dim)  # [b*s, num_heads, head_dim]
+            k = k.view(-1, self.local_num_kv_heads, self.head_dim)  # [b*s, num_kv_heads, head_dim]
+            v = v.view(-1, self.local_num_kv_heads, self.head_dim)  # [b*s, num_kv_heads, head_dim]
+            if self.config.no_rope_layer is None or (self.layer_idx + 1) % self.config.no_rope_layer != 0:
+                rotary_pos_emb = self.rotary_emb(
+                    position_ids=position_ids if not self.simple_causal_mask else None, seq_length=seq_length
+                )  # [b*s, dim] or [seq_length, dim]
+                q = self.rotary_emb.apply_rotary_pos_emb(
+                    q, rotary_pos_emb, seq_length=seq_length
+                )  # [b*s, num_heads, head_dim]
+                k = self.rotary_emb.apply_rotary_pos_emb(
+                    k, rotary_pos_emb, seq_length=seq_length
+                )  # [b*s, num_kv_heads, head_dim]
+            else:
+                log_rank(f"skipping rotary for layer {self.layer_idx + 1}", logger=logger, level=logging.DEBUG, rank=0)
+            attn_output = self.attention(
+                q, k, v, position_ids=position_ids, seq_length=seq_length, cu_seqlens=cu_seqlens
+            )
         output = self.o_proj(attn_output)
         # Return original position_ids shape
         return {"hidden_states": output, "position_ids": position_ids.view(-1, seq_length)}

src/nanotron/nn/rotary.py

Lines changed: 3 additions & 0 deletions
@@ -141,6 +141,9 @@ def apply_rotary_pos_emb(self, tensor, freqs, multi_latent_attention=False, msca
             self.rotate_half(rotary_part) * self.sin_values.unsqueeze(1)
         )
 
+        # Reshape back to [b*s, nheads, dim]
+        rotated_tensor = rotated_tensor.view(-1, rotated_tensor.shape[2], rotated_tensor.shape[3])
+
         # Concatenate with the pass-through part (if any)
         if pass_through_part is not None and pass_through_part.shape[-1] > 0:
             return torch.cat((rotated_tensor, pass_through_part), dim=-1)
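
A tiny illustration of what the added reshape does; the shapes here are arbitrary, whereas in the real code the 4-D tensor comes out of the cos/sin rotation just above.

# Illustration of the added reshape: collapse [b, s, nheads, dim] -> [b*s, nheads, dim].
import torch

b, s, nheads, dim = 2, 8, 4, 16
rotated_tensor = torch.randn(b, s, nheads, dim)

# shape[2] is nheads and shape[3] is the head dim; -1 folds batch and sequence together.
rotated_tensor = rotated_tensor.view(-1, rotated_tensor.shape[2], rotated_tensor.shape[3])
print(rotated_tensor.shape)  # torch.Size([16, 4, 16])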
