diff --git a/docs/source/openvino/export.mdx b/docs/source/openvino/export.mdx index f232dba84c..b51ce235e9 100644 --- a/docs/source/openvino/export.mdx +++ b/docs/source/openvino/export.mdx @@ -31,7 +31,7 @@ Check out the help for more options: ```text usage: optimum-cli export openvino [-h] -m MODEL [--task TASK] [--framework {pt}] [--trust-remote-code] - [--weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4}] + [--weight-format {fp32,fp16,int8,int4,mxfp4,fp4,fp8_e4m3,nf4,cb4}] [--quant-mode {int8,f8e4m3,f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2}] [--library {transformers,diffusers,timm,sentence_transformers,open_clip}] [--cache_dir CACHE_DIR] [--pad-token-id PAD_TOKEN_ID] [--ratio RATIO] [--sym] @@ -66,7 +66,7 @@ Optional arguments: --trust-remote-code Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the model repository. - --weight-format {fp32,fp16,int8,int4,mxfp4,nf4,cb4} + --weight-format {fp32,fp16,int8,int4,mxfp4,fp4,fp8_e4m3,nf4,cb4} The weight format of the exported model. Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format. --quant-mode {int8,f8e4m3,f8e5m2,cb4_f8e4m3,int4_f8e4m3,int4_f8e5m2} diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py index cd3280189e..0e0f123178 100644 --- a/optimum/commands/export/openvino.py +++ b/optimum/commands/export/openvino.py @@ -67,7 +67,7 @@ def parse_args_openvino(parser: "ArgumentParser"): optional_group.add_argument( "--weight-format", type=str, - choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4", "cb4"], + choices=["fp32", "fp16", "int8", "int4", "mxfp4", "fp4", "fp8_e4m3", "nf4", "cb4"], default=None, help=( "The weight format of the exported model. Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format." 
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py index 1b810d4fec..417510a432 100644 --- a/optimum/intel/openvino/configuration.py +++ b/optimum/intel/openvino/configuration.py @@ -834,7 +834,7 @@ class OVWeightQuantizationConfig(OVQuantizationConfigBase): Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original and compressed layers. Providing a dataset is required to run scale estimation. dtype (`str`, *optional*): - Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4', 'cb4']. + Data type weights are compressed to. Possible values: ['int4', 'int8', 'mxfp4', 'nf4', 'cb4', 'fp4', 'fp8_e4m3']. Option 'cb4' represents a codebook with 16 fixed fp8 values in E4M3 format. qptq (`bool`, *optional*): Whether to apply GPTQ algorithm. GPTQ optimizes compressed weights in a layer-wise fashion to minimize the @@ -1040,10 +1040,10 @@ def post_init(self): if self.dtype is None: self.dtype = "int4" if self.bits == 4 else "int8" - if self.dtype not in ["int4", "int8", "mxfp4", "nf4", "cb4"]: + if self.dtype not in ["int4", "int8", "mxfp4", "nf4", "cb4", "fp4", "fp8_e4m3"]: raise ValueError( "Weights quantization data type must be one of the following: " - f"['int4', 'int8', 'mxfp4', 'nf4', 'cb4'], but found: {self.dtype}." + f"['int4', 'int8', 'mxfp4', 'nf4', 'cb4', 'fp4', 'fp8_e4m3'], but found: {self.dtype}." 
) if self.dtype in ["mxfp4", "nf4", "cb4"]: if self.bits != 4: diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 4be27f43e5..b93951585c 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -508,6 +508,18 @@ class OVCLIExportTestCase(unittest.TestCase): ] TRANSFORMERS_4BIT_CONFIGURATIONS = [ + ( + "text-generation-with-past", + "llama", + "fp4 --group-size 16", + {"model": {"int8": 4, "f4e2m1": 14}}, + ), + ( + "text-generation-with-past", + "llama", + "fp8_e4m3 --group-size 16", + {"model": {"int8": 4, "f8e4m3": 14}}, + ), ( "text-generation-with-past", "opt125m",