[NNCF] FP8/FP4 support

daniil-lyakhov · daniil-lyakhov · commit d6a2aa97f647 · 2025-11-18T17:28:12.000+01:00
diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx
@@ -31,7 +31,7 @@ Check out the help for more options:
 
 ```text
 usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt}] [--trust-remote-code]
-                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4}]
+                                   [--weight-format {fp32,fp16,int8,int4,mxfp4,fp4,fp8_e4m3,nf4,cb4}]
                                    [--quant-mode {int8,f8e4m3,f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2}]
                                    [--library {transformers,diffusers,timm,sentence_transformers,open_clip}]
                                    [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym]
@@ -66,7 +66,7 @@ Optional arguments:
   --trust-remote-code   Allows to use custom code for the modeling hosted in the model repository. This option should
                         only be set for repositories you trust and in which you have read the code, as it will execute
                         on your local machine arbitrary code present in the model repository.
-  --weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4}
+  --weight-format {fp32,fp16,int8,int4,mxfp4,fp4,fp8_e4m3,nf4,cb4}
                         The weight format of the exported model. Option 'cb4' represents a codebook with 16
                         fixed fp8 values in E4M3 format.
   --quant-mode {int8,f8e4m3,f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2}
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
@@ -67,7 +67,7 @@ def parse_args_openvino(parser: "ArgumentParser"):
     optional_group.add_argument(
         "--weight-format",
         type=str,
-        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4", "cb4"],
+        choices=["fp32", "fp16", "int8", "int4", "mxfp4", "fp4", "fp8_e4m3", "nf4", "cb4"],
         default=None,
         help=(
             "The weight format of the exported model. Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format."
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
@@ -686,7 +686,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase):
             Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and
             compressed layers. Providing a dataset is required to run scale estimation.
         dtype (`str`, *optional*):
-            Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4', 'cb4'].
+            Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4', 'cb4', 'fp4', 'fp8_e4m3'].
             Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format.
         gptq (`bool`, *optional*):
             Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the
@@ -879,10 +879,10 @@ def post_init(self):
 
         if self.dtype is None:
             self.dtype = "int4" if self.bits == 4 else "int8"
-        if self.dtype not in ["int4", "int8", "mxfp4", "nf4", "cb4"]:
+        if self.dtype not in ["int4", "int8", "mxfp4", "nf4", "cb4", "fp4", "fp8_e4m3"]:
             raise ValueError(
                 "Weights quantization data type must be one of the following: "
-                f"['int4', 'int8', 'mxfp4', 'nf4', 'cb4'], but found: {self.dtype}."
+                f"['int4', 'int8', 'mxfp4', 'nf4', 'cb4', 'fp4', 'fp8_e4m3'], but found: {self.dtype}."
             )
         if self.dtype in ["mxfp4", "nf4", "cb4"]:
             if self.bits != 4:
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
@@ -494,6 +494,18 @@ class OVCLIExportTestCase(unittest.TestCase):
             "mxfp4",
             {"model": {"int8": 4, "f4e2m1": 72, "f8e8m0": 72}},
         ),
+        (
+            "text-generation-with-past",
+            "opt125m",
+            "fp4",
+            {"model": {"int8": 4, "f4e2m1": 72}},
+        ),
+        (
+            "text-generation-with-past",
+            "opt125m",
+            "fp8_e4m3",
+            {"model": {"int8": 4, "f8e4m3": 72}},
+        ),
         (
             "text-generation-with-past",
             "opt125m",