
Commit 3e28211

Add SeqMSE support to aimet pass (microsoft#2158)
## Describe your changes

Adds sequential MSE support to the aimet quantization pass.

## Checklist before requesting a review

- [ ] Add unit tests for this change.
- [ ] Make sure all tests can pass.
- [ ] Update documents if necessary.
- [ ] Lint and apply fixes to your code by running `lintrunner -a`
- [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes.
- [ ] Is this PR including examples changes? If yes, please remember to update [example documentation](https://github.com/microsoft/Olive/blob/main/docs/source/examples.md) in a follow-up PR.

## (Optional) Issue link

---------

Signed-off-by: Michael Tuttle <mtuttle@qti.qualcomm.com>
1 parent 910cbaf commit 3e28211
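To see the user-facing shape of the change up front: based on the test added in this commit, a pass config that enables the new technique looks roughly like the sketch below (the `data_config` is a placeholder for a real calibration dataset):

```python
# Sketch of an AimetQuantization pass config enabling sequential MSE,
# mirroring the config used in the added test.
config = {
    "precision": "int4",
    "techniques": [
        {
            "name": "seqmse",     # dispatched to aimet_onnx.apply_seq_mse
            "num_candidates": 5,  # encoding candidates swept per weight (default: 20)
        }
    ],
    # "data_config": ...,  # calibration data; seqmse falls back to it when the
    #                      # technique entry itself gives no data_config
}
```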

File tree

3 files changed, +69 -11 lines changed


olive/olive_config.json

Lines changed: 1 addition & 1 deletion

@@ -602,7 +602,7 @@
         }
     },
     "extra_dependencies": {
-        "aimet-onnx": [ "aimet-onnx>=2.12.0" ],
+        "aimet-onnx": [ "aimet-onnx>=2.15.0" ],
         "auto-opt": [ "optimum" ],
         "azureml": [ "azure-ai-ml>=1.11.1", "azure-identity" ],
         "bnb": [ "bitsandbytes", "triton" ],

olive/passes/onnx/aimet_quantization.py

Lines changed: 24 additions & 1 deletion
@@ -171,6 +171,29 @@ def apply( # pylint: disable=arguments-differ
         return sim
 
 
+class SeqMSE(_AimetTechnique):
+    @staticmethod
+    def apply(  # pylint: disable=arguments-differ
+        sim,
+        *,
+        data_config=None,
+        num_candidates: int = 20,
+    ):
+        """Apply the aimet_onnx sequential MSE technique to sim.
+
+        Args:
+            sim: QuantizationSimModel to optimize.
+            data_config: Dataset to use for optimization. If not specified for the technique, will default to the calibration data.
+            num_candidates: Number of encoding candidates to sweep for each weight.
+
+        """
+        from aimet_onnx import apply_seq_mse
+
+        apply_seq_mse(sim, data_config, num_candidates)
+
+        return sim
+
+
 class AimetQuantization(Pass):
     """Quantize ONNX model using aimet-onnx."""
 
@@ -340,6 +363,6 @@ def _run_for_config(
         )
 
         sim.compute_encodings(calib_dataloader)
-        qdq_model = sim.to_onnx_qdq()
+        qdq_model = sim.to_onnx_qdq(prequantize_constants=True)
 
         return model_proto_to_olive_model(qdq_model, output_model_path, config)
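For context on what `apply_seq_mse` does: sequential MSE selects a quantization range for each weight by sweeping `num_candidates` candidate ranges and keeping the one whose quantized layer output is closest, in mean squared error, to the float output. Below is a minimal NumPy sketch of that idea; it is illustrative only, not the aimet-onnx implementation (which optimizes layers sequentially over the QuantizationSimModel):

```python
import numpy as np

def seq_mse_candidate_sweep(weight, activations, num_candidates=20, bitwidth=4):
    """Pick the symmetric weight-quantization scale minimizing output MSE (sketch)."""
    qmax = 2 ** (bitwidth - 1) - 1
    float_out = activations @ weight
    best_scale, best_mse = None, np.inf
    for i in range(1, num_candidates + 1):
        # Candidate range: a growing fraction of the weight's absolute-max range.
        scale = (i / num_candidates) * np.abs(weight).max() / qmax
        # Fake-quantize the weight at this candidate scale.
        q_weight = np.clip(np.round(weight / scale), -qmax - 1, qmax) * scale
        mse = np.mean((activations @ q_weight - float_out) ** 2)
        if mse < best_mse:
            best_scale, best_mse = scale, mse
    return best_scale

# Example: sweep 20 candidates for a random 4-bit linear layer.
w = np.random.randn(16, 8).astype(np.float32)
x = np.random.randn(32, 16).astype(np.float32)
print(seq_mse_candidate_sweep(w, x))
```

Separately, the switch to `to_onnx_qdq(prequantize_constants=True)` makes the exported QDQ model store its initializers already quantized; that appears to be why the test updates below strip a `_q` suffix when mapping quantizer nodes back to tensor names.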

test/passes/onnx/test_aimet_quantization.py

Lines changed: 44 additions & 9 deletions
@@ -155,7 +155,9 @@ def test_aimet_quantization_uses_provided_precisions(tmp_path, precisions):
 
     initializer_dict = {tensor.name: tensor for tensor in model.graph.initializer}
     tensor_to_quantizer = {
-        node.input[0]: node for node in model.graph.node if node.op_type in ("QuantizeLinear", "DequantizeLinear")
+        node.input[0].removesuffix("_q"): node
+        for node in model.graph.node
+        if node.op_type in ("QuantizeLinear", "DequantizeLinear")
     }
 
     # Weight should be symmetrically quantized with precision type
@@ -269,11 +271,11 @@ def test_aimet_quantization_applies_adaround(tmp_path):
     }
     p = create_pass_from_dict(AimetQuantization, config, disable_search=True)
 
-    with patch("aimet_onnx.apply_adaround") as mock_seq_mse:
+    with patch("aimet_onnx.apply_adaround") as mock_adaround:
         out = p.run(input_model, tmp_path)
-    assert mock_seq_mse.call_count == 1
+    assert mock_adaround.call_count == 1
 
-    (_, data, num_iterations, nodes_to_include), _ = mock_seq_mse.call_args
+    (_, data, num_iterations, nodes_to_include), _ = mock_adaround.call_args
     assert isinstance(data, Iterable)
     assert num_iterations == 5
     assert nodes_to_include is None
@@ -305,13 +307,44 @@ def test_aimet_quantization_excludes_adaround_nodes(tmp_path):
     }
     p = create_pass_from_dict(AimetQuantization, config, disable_search=True)
 
-    with patch("aimet_onnx.apply_adaround") as mock_seq_mse:
+    with patch("aimet_onnx.apply_adaround") as mock_adaround:
         p.run(input_model, tmp_path)
-    assert mock_seq_mse.call_count == 1
-    (_, _, _, nodes_to_include), _ = mock_seq_mse.call_args
+    assert mock_adaround.call_count == 1
+    (_, _, _, nodes_to_include), _ = mock_adaround.call_args
     assert not nodes_to_include
 
 
+@pytest.mark.skipif(not IS_LINUX, reason="Only run on linux")
+@pytest.mark.skipif(CUDA_AVAILABLE, reason="Only run on cpu tests")
+def test_aimet_quantization_applies_seq_mse(tmp_path):
+    input_model = dummy_onnx_matmul_model(tmp_path / "dummy_model_mm.onnx")
+    config = {
+        "data_config": DataConfig(
+            name="test_quant_dc_config",
+            load_dataset_config=DataComponentConfig(type="simple_dataset"),
+            dataloader_config=DataComponentConfig(type="_test_quant_dataloader_len_16"),
+        ),
+        "precision": "int4",
+        "techniques": [
+            {
+                "name": "seqmse",
+                "num_candidates": 5,
+            }
+        ],
+    }
+    p = create_pass_from_dict(AimetQuantization, config, disable_search=True)
+
+    with patch("aimet_onnx.apply_seq_mse") as mock_seq_mse:
+        out = p.run(input_model, tmp_path)
+    assert mock_seq_mse.call_count == 1
+
+    (_, data, num_candidates), _ = mock_seq_mse.call_args
+    assert isinstance(data, Iterable)
+    assert num_candidates == 5
+
+    assert out is not None
+
+
 @pytest.mark.skipif(not IS_LINUX, reason="Only run on linux")
 @pytest.mark.skipif(CUDA_AVAILABLE, reason="Only run on cpu tests")
 @pytest.mark.parametrize(
@@ -344,7 +377,7 @@ def test_aimet_quantization_excludes_op_types(tmp_path, op_types, disabled_quant
     model = onnx.load(out.model_path)
 
     tensor_to_quantizer = {
-        tensor: node
+        tensor.removesuffix("_q"): node
         for node in model.graph.node
        for tensor in (node.input[0], node.output[0])
         if node.op_type in ("QuantizeLinear", "DequantizeLinear")
@@ -374,7 +407,9 @@ def test_aimet_quantization_preserves_quantization_in_prequantized_model(tmp_pat
     model = onnx.load(out.model_path)
 
     tensor_to_quantizer = {
-        node.input[0]: node for node in model.graph.node if node.op_type in ("QuantizeLinear", "DequantizeLinear")
+        node.input[0].removesuffix("_q"): node
+        for node in model.graph.node
+        if node.op_type in ("QuantizeLinear", "DequantizeLinear")
     }
 
     weight_quantizer = tensor_to_quantizer["weight_dq"]