
Commit c9d0ac5

[quantizer] support {LayerNorm, RMSNorm} (#390)
* [quantizer] support LayerNorm/RMSNorm
* [quantizer] add doc support
* [quantizer] fix test
1 parent afee089 · commit c9d0ac5

File tree

5 files changed: +144 −3 lines changed

docs/quantization_support.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -27,6 +27,7 @@ Quantized OPs that are natively not supported by PyTorch (and possibly TFLite).
 | `pow` | / |
 | `prelu` | / |
 | `reciprocal` | / |
+| `rsqrt` | / |
 | `silu` | / |
 | `sin` | / |
 | `softmax` | / |
@@ -71,8 +72,10 @@ Quantized OPs that are natively not supported by PyTorch (and possibly TFLite).
 | `sum` | For TFLiteConverter, set `rewrite_quantizable=True` |
 | `torch.nn.GLU` | No action needed |
 | `torch.nn.Hardsigmoid` | No action needed |
+| `torch.nn.LayerNorm` | No action needed |
 | `torch.nn.LogSoftmax` | For QATQuantizer/PostQuantizer, set `config={"set_quantizable_op_stats": True}`<br>For TFLiteConverter, set `rewrite_quantizable=True` |
 | `torch.nn.PReLU` | No action needed |
+| `torch.nn.RMSNorm` | No action needed |
 | `torch.nn.SiLU` | No action needed |
 | `torch.nn.Softmax` | For QATQuantizer/PostQuantizer, set `config={"set_quantizable_op_stats": True}`<br>For TFLiteConverter, set `rewrite_quantizable=True` |
 | `truediv` | For TFLiteConverter, set `rewrite_quantizable=True` |
```
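"No action needed" means the quantizer now rewrites these modules automatically. As a rough usage sketch (the `TinyModel` class and `work_dir` value below are made up; the `QATQuantizer` call follows the pattern used in TinyNeuralNetwork's examples, so treat it as an assumption rather than the canonical invocation):

```python
import torch
import torch.nn as nn

from tinynn.graph.quantization.quantizer import QATQuantizer


class TinyModel(nn.Module):  # hypothetical model, just to put a LayerNorm in the graph
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(32, 32)
        self.norm = nn.LayerNorm(32)  # rewritten to QLayerNorm during quantization

    def forward(self, x):
        return self.norm(self.fc(x))


model = TinyModel()
dummy_input = torch.randn(1, 32)

# No extra config flag is needed for LayerNorm/RMSNorm.
quantizer = QATQuantizer(model, dummy_input, work_dir='out')
qat_model = quantizer.quantize()
```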

tests/qat_module_test.py

Lines changed: 51 additions & 1 deletion
```diff
@@ -1,8 +1,9 @@
 import unittest
+import random
 
 import torch
 import torch.nn as nn
-from tinynn.graph.quantization.modules import QGLU, QPReLU, QSiLU
+from tinynn.graph.quantization.modules import QGLU, QPReLU, QSiLU, QLayerNorm, QRMSNorm
 
 
 class QATModuleTester(unittest.TestCase):
@@ -89,6 +90,55 @@ def test_glu(self):
 
                 self.assertTrue(False)
 
+    def test_layer_norm(self):
+        for i in range(100):
+            normalized_shape = tuple(random.randint(10, 100) for _ in range(random.randint(1, 3)))
+            non_normalized_shape = tuple(random.randint(1, 100) for _ in range(random.randint(1, 2)))
+
+            orig = nn.LayerNorm(normalized_shape)
+            quant = QLayerNorm(orig)
+
+            inp = torch.randn((*non_normalized_shape, *normalized_shape))
+
+            orig_outp = orig(inp)
+            quant_outp = quant(inp)
+
+            if not torch.allclose(orig_outp, quant_outp, atol=1e-6):
+                print(normalized_shape, non_normalized_shape)
+                print('original:')
+                print(orig_outp)
+                print('quantized:')
+                print(quant_outp)
+
+                print('diff (min, max):', torch.min(quant_outp - orig_outp), torch.max(quant_outp - orig_outp))
+
+                self.assertTrue(False)
+
+    @unittest.skipIf(not hasattr(torch.nn, 'RMSNorm'), 'RMSNorm is not supported')
+    def test_rms_norm(self):
+        for i in range(100):
+            normalized_shape = tuple(random.randint(10, 100) for _ in range(random.randint(1, 3)))
+            non_normalized_shape = tuple(random.randint(1, 100) for _ in range(random.randint(1, 2)))
+
+            orig = nn.RMSNorm(normalized_shape)
+            quant = QRMSNorm(orig)
+
+            inp = torch.randn((*non_normalized_shape, *normalized_shape))
+
+            orig_outp = orig(inp)
+            quant_outp = quant(inp)
+
+            if not torch.allclose(orig_outp, quant_outp, atol=1e-6):
+                print(normalized_shape, non_normalized_shape)
+                print('original:')
+                print(orig_outp)
+                print('quantized:')
+                print(quant_outp)
+
+                print('diff (min, max):', torch.min(quant_outp - orig_outp), torch.max(quant_outp - orig_outp))
+
+                self.assertTrue(False)
+
 
 if __name__ == '__main__':
     unittest.main()
```
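
`test_rms_norm` is skipped on PyTorch builds that predate `nn.RMSNorm`. On such builds, a hand-rolled float reference of what the test compares against could look like the sketch below (not part of the commit; `rms_norm_reference` is a made-up helper that just mirrors the RMSNorm formula):

```python
import torch


def rms_norm_reference(x: torch.Tensor, weight: torch.Tensor, mean_dims, eps=None) -> torch.Tensor:
    # RMSNorm(x) = x * rsqrt(mean(x ** 2) + eps) * weight
    ms = (x * x).mean(mean_dims, keepdim=True)
    if eps is not None:
        ms = ms + eps
    return x * torch.rsqrt(ms) * weight


# Example: normalize over the last dimension, like nn.RMSNorm((16,)).
x = torch.randn(4, 16)
weight = torch.ones(16)
out = rms_norm_reference(x, weight, mean_dims=(-1,), eps=1e-6)
```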

tests/quantizer_test.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -380,6 +380,7 @@ def forward(self, x):
                 return self.norm(y[0])
 
         model = Model()
+        torch.nn.init.uniform_(model.norm.bias, -0.1, 0.1)
         inputs = torch.randn(1, 3, 224, 224)
 
         check_quantize_rewrite(model, inputs)
```

tinynn/graph/quantization/modules.py

Lines changed: 83 additions & 0 deletions
```diff
@@ -89,3 +89,86 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
         x2 = self.act_r(x1)
         x3 = self.q(self.dq(x2))
         return self.f_mul.mul_scalar(x3, 1 / 6)
+
+
+class QLayerNorm(nn.Module):
+    def __init__(self, layernorm: nn.LayerNorm) -> None:
+        super().__init__()
+        self.mean_dims = tuple(range(-len(layernorm.normalized_shape), 0))
+        self.weight = torch.nn.Parameter(layernorm.weight.data.detach().clone())
+        self.bias = torch.nn.Parameter(layernorm.bias.data.detach().clone())
+        self.eps = layernorm.eps
+
+        self.q_rsqrt = torch_q.QuantStub()
+        self.q_weight = torch_q.QuantStub()
+        self.q_bias = torch_q.QuantStub()
+        self.dq_rsqrt = torch_q.DeQuantStub()
+
+        self.f_neg = nnq.FloatFunctional()
+        self.f_add_0 = nnq.FloatFunctional()
+        self.f_mul_0 = nnq.FloatFunctional()
+        self.f_add_1 = nnq.FloatFunctional()
+        self.f_mul_1 = nnq.FloatFunctional()
+        self.f_mul_2 = nnq.FloatFunctional()
+        self.f_add_2 = nnq.FloatFunctional()
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # LayerNorm(input) = (input - mean(input)) * rsqrt(mean((input - mean(input)) ** 2) + eps) * alpha + beta
+        # Currently, we completely split LayerNorm and collect the quantization parameters of each
+        # intermediate activation independently, which may lead to a decrease in quantization accuracy.
+
+        mean = input.mean(self.mean_dims, keepdim=True)
+        diff = self.f_add_0.add(input, self.f_neg.mul_scalar(mean, -1.0).expand_as(input))
+        squared_difference = self.f_mul_0.mul(diff, diff)
+        var = squared_difference.mean(self.mean_dims, keepdim=True)
+        var_eps = self.f_add_1.add_scalar(var, self.eps)
+
+        fdq_var_eps = self.dq_rsqrt(var_eps)
+        std_inverse = torch.rsqrt(fdq_var_eps)
+        q_std_inverse = self.q_rsqrt(std_inverse)
+
+        weight_fq = self.q_weight(self.weight)
+        bias_fq = self.q_bias(self.bias)
+        norm = self.f_mul_1.mul(diff, q_std_inverse)
+        weight_fq_expand = weight_fq.expand_as(norm)
+        norm_alpha = self.f_mul_2.mul(norm, weight_fq_expand)
+        bias_fq_expand = bias_fq.expand_as(norm_alpha)
+        return self.f_add_2.add(norm_alpha, bias_fq_expand)
+
+
+class QRMSNorm(nn.Module):
+    def __init__(self, rmsnorm: 'nn.RMSNorm') -> None:
+        super().__init__()
+        self.mean_dims = tuple(range(-len(rmsnorm.normalized_shape), 0))
+        self.weight = torch.nn.Parameter(rmsnorm.weight.data.detach().clone())
+        self.eps = rmsnorm.eps
+
+        self.q_rsqrt = torch_q.QuantStub()
+        self.q_weight = torch_q.QuantStub()
+        self.dq_rsqrt = torch_q.DeQuantStub()
+
+        self.f_add_0 = nnq.FloatFunctional()
+        self.f_mul_0 = nnq.FloatFunctional()
+        self.f_mul_1 = nnq.FloatFunctional()
+        self.f_mul_2 = nnq.FloatFunctional()
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        # RMSNorm(input) = input * rsqrt(mean(input ** 2) + eps) * alpha
+
+        squared_input = self.f_mul_0.mul(input, input)
+        if self.eps is None:
+            rms_pre = squared_input.mean(self.mean_dims, keepdim=True)
+        else:
+            rms_pre = self.f_add_0.add_scalar(
+                squared_input.mean(self.mean_dims, keepdim=True),
+                self.eps,
+            )
+
+        fdq_rms_pre = self.dq_rsqrt(rms_pre)
+        rms_inverse = torch.rsqrt(fdq_rms_pre)
+        q_rms = self.q_rsqrt(rms_inverse)
+
+        weight_fq = self.q_weight(self.weight)
+        norm = self.f_mul_1.mul(input, q_rms)
+        weight_fq_expand = weight_fq.expand_as(norm)
+        return self.f_mul_2.mul(norm, weight_fq_expand)
```
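
To see that the op-by-op split in `QLayerNorm.forward` reproduces `nn.LayerNorm` in float, here is a plain-PyTorch mirror of the same decomposition (a sketch for illustration only; `layer_norm_decomposed` is a made-up helper, not part of the module):

```python
import torch
import torch.nn as nn


def layer_norm_decomposed(x: torch.Tensor, ln: nn.LayerNorm) -> torch.Tensor:
    # Mirrors QLayerNorm: mean -> diff -> squared diff -> var -> rsqrt(var + eps) -> scale/shift.
    dims = tuple(range(-len(ln.normalized_shape), 0))
    mean = x.mean(dims, keepdim=True)
    diff = x + mean * -1.0                         # f_neg + f_add_0
    var = (diff * diff).mean(dims, keepdim=True)   # f_mul_0 + mean
    inv_std = torch.rsqrt(var + ln.eps)            # f_add_1; rsqrt runs on dequantized values
    return diff * inv_std * ln.weight + ln.bias    # f_mul_1, f_mul_2, f_add_2


ln = nn.LayerNorm((16,))
x = torch.randn(4, 16)
assert torch.allclose(layer_norm_decomposed(x, ln), ln(x), atol=1e-6)
```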

tinynn/graph/quantization/quantizer.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -23,7 +23,7 @@
     FakeQuantizeBFloat16,
     FakeQuantizeTFLite,
 )
-from tinynn.graph.quantization.modules import QGLU, QHardsigmoid, QPReLU, QSiLU
+from tinynn.graph.quantization.modules import QGLU, QHardsigmoid, QPReLU, QSiLU, QLayerNorm, QRMSNorm
 from tinynn.graph.quantization.observer import (
     HistogramObserverKL,
     MinMaxObserver,
@@ -174,6 +174,7 @@
     'pow': None,
     'truediv': None,
     'sqrt': None,
+    'rsqrt': None,
     'atan2': None,
     'atan': None,
     'sin': None,
@@ -263,8 +264,11 @@
     Q_MODULES_MAPPING.update({nn.SiLU: QSiLU})
     FUNCTIONAL_MODULE_MAPPING.update({'silu': nn.SiLU})
 
+if hasattr(nn, 'LayerNorm'):
+    Q_MODULES_MAPPING.update({nn.LayerNorm: QLayerNorm})
+
 if hasattr(nn, 'RMSNorm'):
-    UNSUPPORTED_PYTORCH_QUANTIZATION_OP_LIST.update({nn.RMSNorm: None})
+    Q_MODULES_MAPPING.update({nn.RMSNorm: QRMSNorm})
 
 # Processed QAT fuse rules
 processed_qat_rules = {}
```
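
Conceptually, the new `Q_MODULES_MAPPING` entries tell the quantizer to swap every `nn.LayerNorm` (and, where available, `nn.RMSNorm`) for its decomposed counterpart while rewriting the model. The standalone swap below is only an illustration of that mapping (an assumption about the mechanism; the real quantizer performs the rewrite on its traced graph, not via `named_children`):

```python
import torch.nn as nn

from tinynn.graph.quantization.modules import QLayerNorm, QRMSNorm

SWAP = {nn.LayerNorm: QLayerNorm}
if hasattr(nn, 'RMSNorm'):
    SWAP[nn.RMSNorm] = QRMSNorm


def swap_norms(module: nn.Module) -> None:
    # Recursively replace LayerNorm/RMSNorm children with their Q* counterparts.
    for name, child in module.named_children():
        q_cls = SWAP.get(type(child))
        if q_cls is not None:
            setattr(module, name, q_cls(child))
        else:
            swap_norms(child)
```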
