@@ -76,33 +76,37 @@ def gemma(
         TransformerDecoder: Instantiation of gemma model.
     """
     rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base)
-    self_att = MultiHeadAttention(
-        embed_dim=embed_dim,
-        num_heads=num_heads,
-        num_kv_heads=num_kv_heads,
-        head_dim=head_dim,
-        q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False),
-        k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
-        v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
-        output_proj=nn.Linear(num_heads * head_dim, embed_dim, bias=False),
-        pos_embeddings=rope,
-        kv_cache=None,
-        max_seq_len=max_seq_len,
-        attn_dropout=attn_dropout,
-    )
-    mlp = gemma_mlp(dim=embed_dim, hidden_dim=intermediate_dim)
-    layer = TransformerSelfAttentionLayer(
-        attn=self_att,
-        mlp=mlp,
-        sa_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
-        mlp_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
-    )
+
+    layers = nn.ModuleList()
+    for _ in range(num_layers):
+        self_att = MultiHeadAttention(
+            embed_dim=embed_dim,
+            num_heads=num_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False),
+            k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
+            v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
+            output_proj=nn.Linear(num_heads * head_dim, embed_dim, bias=False),
+            pos_embeddings=rope,
+            kv_cache=None,
+            max_seq_len=max_seq_len,
+            attn_dropout=attn_dropout,
+        )
+        mlp = gemma_mlp(dim=embed_dim, hidden_dim=intermediate_dim)
+        layer = TransformerSelfAttentionLayer(
+            attn=self_att,
+            mlp=mlp,
+            sa_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
+            mlp_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
+        )
+        layers.append(layer)
+
     tok_embeddings = GemmaNormEmbeddings(vocab_size, embed_dim)
     output_proj = TiedLinear(tok_embeddings)
     model = TransformerDecoder(
         tok_embeddings=tok_embeddings,
-        layers=layer,
-        num_layers=num_layers,
+        layers=layers,
         max_seq_len=max_seq_len,
         num_heads=num_heads,
         output=output_proj,
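A note on the pattern in this hunk: instantiating each block inside the `for` loop is what gives every transformer layer its own parameters. The snippet below is a minimal, generic PyTorch sketch (plain `nn.Linear` blocks standing in for `TransformerSelfAttentionLayer`; none of it is torchtune code) contrasting a fresh module per iteration with a single reused instance:

```python
# Hypothetical illustration: fresh module per iteration vs. one shared instance.
from torch import nn

embed_dim, num_layers = 16, 4

# One instance referenced num_layers times -> a single (deduplicated) parameter set.
shared_block = nn.Linear(embed_dim, embed_dim)
shared = nn.ModuleList([shared_block for _ in range(num_layers)])

# New instance per iteration -> independently initialized and trained layers.
independent = nn.ModuleList([nn.Linear(embed_dim, embed_dim) for _ in range(num_layers)])

print(sum(p.numel() for p in shared.parameters()))       # 272  (one weight + one bias)
print(sum(p.numel() for p in independent.parameters()))  # 1088 (four weights + four biases)
```

Because the builder now materializes the full `nn.ModuleList` itself, `TransformerDecoder` receives it directly through `layers=layers`, and the separate `num_layers=num_layers` argument is no longer passed.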
@@ -186,47 +190,50 @@ def lora_gemma(
         TransformerDecoder: Instantiation of Gemma model with LoRA applied to
         a subset of the attention projections in each layer.
     """
-    self_attn = lora_gemma_self_attention(
-        lora_modules=lora_attn_modules,
-        embed_dim=embed_dim,
-        head_dim=head_dim,
-        num_heads=num_heads,
-        num_kv_heads=num_kv_heads,
-        max_seq_len=max_seq_len,
-        attn_dropout=attn_dropout,
-        rope_base=rope_base,
-        lora_rank=lora_rank,
-        lora_alpha=lora_alpha,
-        lora_dropout=lora_dropout,
-        use_dora=use_dora,
-        quantize_base=quantize_base,
-    )
-
-    if apply_lora_to_mlp:
-        mlp = lora_gemma_mlp(
-            dim=embed_dim,
-            hidden_dim=intermediate_dim,
+    layers = nn.ModuleList()
+    for _ in range(num_layers):
+        self_attn = lora_gemma_self_attention(
+            lora_modules=lora_attn_modules,
+            embed_dim=embed_dim,
+            head_dim=head_dim,
+            num_heads=num_heads,
+            num_kv_heads=num_kv_heads,
+            max_seq_len=max_seq_len,
+            attn_dropout=attn_dropout,
+            rope_base=rope_base,
             lora_rank=lora_rank,
             lora_alpha=lora_alpha,
             lora_dropout=lora_dropout,
             use_dora=use_dora,
             quantize_base=quantize_base,
         )
-    else:
-        mlp = gemma_mlp(dim=embed_dim, hidden_dim=intermediate_dim, quantize_base=quantize_base)
 
-    layer = TransformerSelfAttentionLayer(
-        attn=self_attn,
-        mlp=mlp,
-        sa_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
-        mlp_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
-    )
+        if apply_lora_to_mlp:
+            mlp = lora_gemma_mlp(
+                dim=embed_dim,
+                hidden_dim=intermediate_dim,
+                lora_rank=lora_rank,
+                lora_alpha=lora_alpha,
+                lora_dropout=lora_dropout,
+                use_dora=use_dora,
+                quantize_base=quantize_base,
+            )
+        else:
+            mlp = gemma_mlp(dim=embed_dim, hidden_dim=intermediate_dim, quantize_base=quantize_base)
+
+        layer = TransformerSelfAttentionLayer(
+            attn=self_attn,
+            mlp=mlp,
+            sa_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
+            mlp_norm=GemmaRMSNorm(embed_dim, eps=norm_eps),
+        )
+        layers.append(layer)
+
     tok_embeddings = GemmaNormEmbeddings(vocab_size, embed_dim)
     output_proj = TiedLinear(tok_embeddings)
     model = TransformerDecoder(
         tok_embeddings=tok_embeddings,
-        layers=layer,
-        num_layers=num_layers,
+        layers=layers,
         max_seq_len=max_seq_len,
         num_heads=num_heads,
         output=output_proj,
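For readers less familiar with the knobs threaded through `lora_gemma_self_attention` and `lora_gemma_mlp`: `lora_rank`, `lora_alpha`, and `lora_dropout` configure the standard low-rank adapter update. The sketch below is a generic illustration of that formulation (it is not torchtune's `LoRALinear`, and the dimensions are made up): the frozen base projection is augmented with a low-rank `B @ A` path scaled by `alpha / rank`, and only the adapter matrices receive gradients.

```python
# Generic LoRA sketch (hypothetical, illustrative dimensions; not torchtune code).
import torch
from torch import nn

class LoRALinearSketch(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, rank: int, alpha: float, dropout: float = 0.0):
        super().__init__()
        self.base = nn.Linear(in_dim, out_dim, bias=False)
        self.base.weight.requires_grad_(False)              # frozen pretrained weight
        self.lora_a = nn.Linear(in_dim, rank, bias=False)    # A keeps its default init
        self.lora_b = nn.Linear(rank, out_dim, bias=False)   # B starts at zero, so training
        nn.init.zeros_(self.lora_b.weight)                   # begins from the base model
        self.dropout = nn.Dropout(dropout)
        self.scaling = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Frozen base output plus the scaled low-rank update.
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(self.dropout(x)))

q_proj = LoRALinearSketch(in_dim=2048, out_dim=2048, rank=8, alpha=16, dropout=0.05)
out = q_proj(torch.randn(2, 4, 2048))  # (batch, seq, embed_dim)
```

In the hunk above, `apply_lora_to_mlp` decides whether each layer's feed-forward block gets these adapters (`lora_gemma_mlp`) or stays a plain, optionally quantized `gemma_mlp`.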