Skip to content

Commit 1ce0173

Browse files
author
Iman Gohari
authored
feat(Gemma3/2): Added FusedRMSNorm (#2281)
1 parent 31a6581 commit 1ce0173

File tree

8 files changed

+57
-23
lines changed

8 files changed

+57
-23
lines changed

optimum/habana/transformers/modeling_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,8 @@
260260
gaudi_falcon_linear_forward,
261261
gaudi_FalconMambaForCausalLM_prepare_inputs_for_generation,
262262
gaudi_FalconMambaModel_forward,
263+
gaudi_gemma2_rmsnorm_forward,
264+
gaudi_gemma3_rmsnorm_forward,
263265
gaudi_generate_speech,
264266
gaudi_get_extended_attention_mask,
265267
gaudi_gpt2_forward,
@@ -615,6 +617,7 @@ def adapt_transformers_to_gaudi():
615617
transformers.models.gemma2.modeling_gemma2.Gemma2DecoderLayer = GaudiGemma2DecoderLayer
616618
transformers.models.gemma2.modeling_gemma2.Gemma2Model = GaudiGemma2Model
617619
transformers.models.gemma2.modeling_gemma2.Gemma2RotaryEmbedding = GaudiGemma2RotaryEmbedding
620+
transformers.models.gemma2.modeling_gemma2.Gemma2RMSNorm.forward = gaudi_gemma2_rmsnorm_forward
618621

619622
# Optimization for gemma3 on Gaudi
620623
transformers.models.gemma3.modeling_gemma3.Gemma3ForCausalLM = GaudiGemma3ForCausalLM
@@ -624,6 +627,7 @@ def adapt_transformers_to_gaudi():
624627
transformers.models.gemma3.modeling_gemma3.Gemma3TextModel = GaudiGemma3TextModel
625628
transformers.models.gemma3.modeling_gemma3.Gemma3Model = GaudiGemma3Model
626629
transformers.models.gemma3.modeling_gemma3.Gemma3ForConditionalGeneration = GaudiGemma3ForConditionalGeneration
630+
transformers.models.gemma3.modeling_gemma3.Gemma3RMSNorm.forward = gaudi_gemma3_rmsnorm_forward
627631

628632
# Optimization for blip Text model on Gaudi
629633
transformers.models.blip.BlipTextModel.forward = gaudi_BlipTextModel_forward

optimum/habana/transformers/models/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@
117117
GaudiGemma2MLP,
118118
GaudiGemma2Model,
119119
GaudiGemma2RotaryEmbedding,
120+
gaudi_gemma2_rmsnorm_forward,
120121
)
121122
from .gemma3 import (
122123
GaudiGemma3Attention,
@@ -126,6 +127,7 @@
126127
GaudiGemma3MLP,
127128
GaudiGemma3Model,
128129
GaudiGemma3TextModel,
130+
gaudi_gemma3_rmsnorm_forward,
129131
)
130132
from .glm4v import (
131133
ChatGLM4Tokenizer,

optimum/habana/transformers/models/gemma2/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@
55
GaudiGemma2MLP,
66
GaudiGemma2Model,
77
GaudiGemma2RotaryEmbedding,
8+
gaudi_gemma2_rmsnorm_forward,
89
)

optimum/habana/transformers/models/gemma2/modeling_gemma2.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,19 @@ def __init__(self, config: Gemma2Config):
7474
super().__init__(config=config)
7575

7676

def gaudi_gemma2_rmsnorm_forward(self, x):
    """RMSNorm forward for Gemma2, optimized for Intel Gaudi (HPU).

    On an HPU device with the fused kernel available, normalization is
    delegated to ``FusedRMSNorm`` using an all-ones weight; otherwise the
    stock eager ``_norm`` path is used. In both cases the Gemma-specific
    ``(1 + weight)`` scaling is applied afterwards and the result is cast
    back to the input dtype.
    """
    if x.device.type == "hpu" and FusedRMSNorm is not None:
        # Run the fused kernel with a unit weight so the Gemma-style
        # (1 + w) scaling can be applied separately below.
        normed = FusedRMSNorm.apply(x.float(), torch.ones_like(self.weight), self.eps)
    else:
        # Llama does x.to(float16) * w whilst Gemma2 is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        normed = self._norm(x.float())
    scaled = normed * (1.0 + self.weight.float())
    return scaled.type_as(x)
7790
def gaudi_gemma2_repeat_kv(
7891
query_states: torch.Tensor,
7992
key_states: torch.Tensor,

optimum/habana/transformers/models/gemma3/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@
66
GaudiGemma3MLP,
77
GaudiGemma3Model,
88
GaudiGemma3TextModel,
9+
gaudi_gemma3_rmsnorm_forward,
910
)

optimum/habana/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,19 @@
7272
logger = logging.get_logger(__name__)
7373

7474

def gaudi_gemma3_rmsnorm_forward(self, x):
    """RMSNorm forward for Gemma3, optimized for Intel Gaudi (HPU).

    On an HPU device with the fused kernel available, normalization is
    delegated to ``FusedRMSNorm`` using an all-ones weight; otherwise the
    stock eager ``_norm`` path is used. In both cases the Gemma-specific
    ``(1 + weight)`` scaling is applied afterwards and the result is cast
    back to the input dtype.
    """
    if x.device.type == "hpu" and FusedRMSNorm is not None:
        # Run the fused kernel with a unit weight so the Gemma-style
        # (1 + w) scaling can be applied separately below.
        normed = FusedRMSNorm.apply(x.float(), torch.ones_like(self.weight), self.eps)
    else:
        # Llama does x.to(float16) * w whilst Gemma3 is (x * w).to(float16)
        # See https://github.com/huggingface/transformers/pull/29402
        normed = self._norm(x.float())
    scaled = normed * (1.0 + self.weight.float())
    return scaled.type_as(x)
7588
def gaudi_gemma3_repeat_kv(
7689
query_states: torch.Tensor,
7790
key_states: torch.Tensor,

tests/baselines/fixture/tests/test_text_generation_example.json

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -244,54 +244,54 @@
244244
"throughput": 357.46365062825083
245245
}
246246
},
247-
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-3-27b-it-1-False-True-False]": {
247+
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-3-27b-it-1-False-False-False]": {
248248
"gaudi2": {
249-
"output": "DeepSpeed is a machine learning framework designed to make distributed training easier and more efficient. It is created by Microsoft and has gained significant traction in the research and industry communities due to its ability to train large models with limited hardware resources.\nHere's a breakdown of the key features and benefits of DeepSpeed:\n\n**Core Features & Technologies:**\n\n* **ZeRO (Zero Redundancy Optimizer):** This is the cornerstone of DeepSpeed. It tackles the memory bottleneck in distributed training by partitioning model states (weights,",
250-
"throughput": 32.452270975799294
249+
"output": "DeepSpeed is a machine learning framework that enables you to train models with hundreds of billions or even trillions of parameters. Here's a breakdown of what it is, its key features, and how it compares to other approaches:\n\n**What is DeepSpeed?**\n\nDeveloped by Microsoft, DeepSpeed is a deep learning optimization library designed to make large-scale model training more efficient, accessible, and cost-effective. It's built on PyTorch and is open-source. It's particularly notable for enabling the training of",
250+
"throughput": 34.082512922376125
251251
},
252252
"gaudi3": {
253-
"output": "DeepSpeed is a machine learning framework that enables you to train models with hundreds of billions or even trillions of parameters. Here's a breakdown of what it is, its key features, and how it compares to other approaches:\n\n**What is DeepSpeed?**\n\nDeveloped by Microsoft, DeepSpeed is a deep learning optimization library designed to make large-scale model training more efficient, accessible, and cost-effective. It's built on PyTorch and is open-source. It's particularly notable for enabling the training of",
254-
"throughput": 38.03334992095207
253+
"output": "DeepSpeed is a machine learning framework that enables training very large models with high efficiency. It is often used with PyTorch, and it can significantly reduce memory usage and increase training throughput. Here's a breakdown of how it works, its key features, and how to use it:\n\n**How DeepSpeed Works**\n\nDeepSpeed achieves its efficiency through a combination of innovative techniques, primarily focusing on these areas:\n\n* **ZeRO (Zero Redundancy Optimizer):** This is the cornerstone of DeepSpeed. ZeRO partitions",
254+
"throughput": 42.50246201556991
255255
}
256256
},
257257
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-3-12b-it-1-False-True-False]": {
258258
"gaudi2": {
259-
"output": "DeepSpeed is a machine learning framework focused on improving the performance and scalability of training deep learning models. It's designed to handle very large models and datasets, often used in areas like Natural Language Processing (NLP) and Computer Vision. Here's a breakdown of what DeepSpeed offers, how it works, and why it's valuable.\n\n**Key Features and Benefits**\n\n* **Scalability:** DeepSpeed's primary goal is to allow you to train models that are too large to fit on a single GPU or even",
260-
"throughput": 58.022545212763546
259+
"output": "DeepSpeed is a machine learning framework focused on improving the performance and scalability of training deep learning models. It's designed to handle very large models and datasets, often using techniques like model parallelism, data parallelism, and optimization techniques like ZeRO.\n\nHere's a breakdown of its core concepts and how it works, along with common use cases and examples:\n\n**Key Concepts & Techniques**\n\n* **ZeRO (Zero Redundancy Optimizer):** The cornerstone of DeepSpeed. ZeRO dramatically reduces memory consumption by partitioning model",
260+
"throughput": 68.30196577764131
261261
},
262262
"gaudi3": {
263-
"output": "DeepSpeed is a machine learning framework focused on improving the performance and scalability of training deep learning models. It's designed to handle very large models and datasets, often using techniques like model parallelism, data parallelism, and optimization techniques like ZeRO (Zero Redundancy Optimizer).\n\nHere's a breakdown of key concepts and functionalities within DeepSpeed:\n\n**1. Key Goals & Benefits**\n\n* **Scalability:** DeepSpeed's primary goal is to enable training extremely large models (billions or even trillions of parameters",
264-
"throughput": 69.15032921221514
263+
"output": "DeepSpeed is a machine learning framework focused on improving the performance and scalability of training deep learning models. It's designed to handle very large models and datasets, often used in areas like Natural Language Processing (NLP) and Computer Vision. Here's a breakdown of its key features and how it works:\n\n**1. Core Technologies & Benefits**\n\n* **ZeRO (Zero Redundancy Optimizer):** This is the *key* innovation in DeepSpeed. ZeRO tackles the memory bottleneck that arises when training massive",
264+
"throughput": 81.15713149929978
265265
}
266266
},
267267
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-3-4b-it-1-False-True-False]": {
268268
"gaudi2": {
269-
"output": "DeepSpeed is a machine learning framework focused on efficient training and inference of large models. It is built on PyTorch and aims to overcome the memory and computational limitations that often arise when training large neural networks.\n\nHere's a breakdown of DeepSpeed's key features, benefits, and how it works:\n\n**1. Core Technologies & Techniques:**\n\n* **ZeRO (Zero Redundancy Optimizer):** This is the cornerstone of DeepSpeed. It dramatically reduces memory consumption by partitioning the optimizer states, gradients, and parameters",
270-
"throughput": 112.10840313346671
269+
"output": "DeepSpeed is a machine learning framework focused on efficient training and inference of large models. It is developed by Microsoft and offers various optimizations like ZeRO, which is a memory partitioning technique to reduce memory footprint, and other features for faster training.\n\nHere's a breakdown of key aspects of DeepSpeed:\n\n**1. Core Technologies:**\n\n* **ZeRO (Zero Redundancy Optimizer):** This is the heart of DeepSpeed. It's a memory optimization technique that breaks down model parameters, gradients, and optimizer states",
270+
"throughput": 141.28462701651054
271271
},
272272
"gaudi3": {
273-
"output": "DeepSpeed is a machine learning framework focused on efficient training and serving of large models. It is built on PyTorch and aims to overcome the memory and computational limitations that often arise when training large neural networks.\n\nHere's a breakdown of DeepSpeed's key features, benefits, and how it works:\n\n**1. Core Technologies & Techniques:**\n\n* **ZeRO (Zero Redundancy Optimizer):** This is the cornerstone of DeepSpeed. It drastically reduces memory consumption by partitioning the optimizer states, gradients, and parameters",
274-
"throughput": 125.14846550177148
273+
"output": "DeepSpeed is a machine learning framework focused on efficient training and inference of large models. It is developed by Microsoft and offers various optimizations like ZeRO, which is a memory partitioning technique to reduce memory footprint, and other features such as data parallelism and pipeline parallelism.\n\nHere's a breakdown of its key features and how it compares to other frameworks like PyTorch and TensorFlow:\n\n**Key Features of DeepSpeed:**\n\n* **ZeRO (Zero Redundancy Optimizer):** This is the core of DeepSpeed. It comes in",
274+
"throughput": 153.30181217767333
275275
}
276276
},
277-
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-2-27b-1-False-True-False]": {
277+
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-2-27b-1-False-False-False]": {
278278
"gaudi2": {
279-
"output": "DeepSpeed is a machine learning framework that enables you to train models with trillions of parameters and beyond, using model parallelism to partition large models over multiple GPUs.\n\nThe following is a brief introduction to the DeepSpeed model parallel training.\n\n<h2>1. Introduction</h2>\n\nThe DeepSpeed model parallel training is a simple and effective way to train large models. It is a framework that enables you to train models with trillions of parameters and beyond.\n\nDeepSpeed is a distributed deep learning optimization toolkit that makes it easy and efficient",
280-
"throughput": 36.578709544111
279+
"output": "DeepSpeed is a machine learning framework that is designed to help you train your models faster and more efficiently. DeepSpeed is a system that allows you to train your models faster and more efficiently. DeepSpeed is a machine learning framework that is designed to help you train your models faster and more efficiently. DeepSpeed is a system that allows you to train your models faster and more efficiently.\n\nI’m going to repeat myself. DeepSpeed is a machine learning framework that is designed to help you train your models faster and more efficiently.\n\n",
280+
"throughput": 38.30642095055842
281281
},
282282
"gaudi3": {
283-
"output": "DeepSpeed is a machine learning framework that enables you to train models with trillions of parameters and tera-scale datasets, while also providing the flexibility to customize the training process.\n\nThe DeepSpeed library is a system and deep learning optimization toolkit that makes it possible to efficiently train deep learning models with hundreds of billions of parameters and beyond. It includes a set of techniques that can be used together or separately to achieve the best possible performance for training deep learning models.\n\nDeepSpeed is a library that enables you to use these techniques in",
284-
"throughput": 46.04685368495098
283+
"output": "DeepSpeed is a machine learning framework that enables you to train deep learning models at any scale. It is a system and deep learning software optimization system that makes distributed deep learning practical. DeepSpeed allows researchers and engineers to train deep learning models with terabyte-scale with hundreds of billions of parameters with acceptable speed and accuracy.\n\nDeepSpeed is a deep learning optimization and parallelism library from Microsoft Research optimized for efficiency, developed to train large models efficiently on multiple GPUs.\n\nDeepSpeed is a system and library effort ongoing for over three years.",
284+
"throughput": 48.842401849049224
285285
}
286286
},
287287
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-2-9b-1-False-True-False]": {
288288
"gaudi2": {
289289
"output": "DeepSpeed is a machine learning framework that enables training of large-scale deep learning models on a single GPU or across multiple GPUs. It is designed to be easy to use and highly scalable, making it a popular choice for training large-scale models such as GPT-3 and BERT.\n\nDeepSpeed is built on top of PyTorch, a popular deep learning framework, and provides a set of tools and libraries that make it easy to train large-scale models. It includes features such as zero-shot learning, which allows models to",
290-
"throughput": 92.302359446567
290+
"throughput": 99.98690579203925
291291
},
292292
"gaudi3": {
293293
"output": "DeepSpeed is a machine learning framework that enables training of large-scale deep learning models on a single GPU or across multiple GPUs. It is designed to be easy to use and to provide high performance.\n\nDeepSpeed is built on top of PyTorch, a popular deep learning framework. It provides a number of features that make it easy to train large-scale models, including:\n\n* Automatic model parallelism: DeepSpeed automatically parallelizes the model across multiple GPUs, so you don’t have to worry about how to do it yourself",
294-
"throughput": 111.60209707224463
294+
"throughput": 117.69951320588835
295295
}
296296
},
297297
"tests/test_text_generation_example.py::test_text_generation_bf16_1x[google/gemma-7b-1-False-False]": {
@@ -837,4 +837,4 @@
837837
"throughput": 0.7583387
838838
}
839839
}
840-
}
840+
}

tests/test_text_generation_example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,10 @@
5050
# ("Qwen/Qwen1.5-7B", 4, False, False, False),
5151
("google/gemma-7b", 1, False, True, False),
5252
("google/gemma-2-9b", 1, False, True, False),
53-
("google/gemma-2-27b", 1, False, True, False),
53+
("google/gemma-2-27b", 1, False, False, False),
5454
("google/gemma-3-4b-it", 1, False, True, False),
5555
("google/gemma-3-12b-it", 1, False, True, False),
56-
("google/gemma-3-27b-it", 1, False, True, False),
56+
("google/gemma-3-27b-it", 1, False, False, False),
5757
pytest.param(
5858
"state-spaces/mamba-130m-hf", 1536, False, False, False, marks=pytest.mark.skip("Deprecated")
5959
),

0 commit comments

Comments
 (0)