diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 6124b15023e..74ff5e0820a 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -13,18 +13,6 @@ pip install auto-round==0.9.3
 pip install -r requirements.txt
 ```
 
-**Before neural-compressor v3.7 and auto-round v0.9.1 release, please install from source for the latest updates:**
-
-```bash
-# neural-compressor-pt
-INC_PT_ONLY=1 pip install git+https://github.com/intel/neural-compressor.git@master
-# auto-round
-pip install git+https://github.com/intel/auto-round.git@v0.9.3rc
-# other requirements
-pip install -r requirements.txt
-```
-
-
 ## Quantization
 
 ### Demo (`MXFP4`, `MXFP8`, `NVFP4`, `uNVFP4`)
@@ -92,7 +80,8 @@ Here we provide several recipes for Llama3 models. The relative accuracy loss of
 
 #### Llama 3.1 8B MXFP8
 
-AutoRound tuning helps improve the accuracy, `iters` and `nsamples` is higher than default.
+RTN (Round-to-Nearest) is sufficient to preserve accuracy, so tuning is disabled (`--iters 0`).
+
 ```bash
 # Quantize and export AutoRound format
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp8 --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP8
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
index b77a95272b8..19b56908315 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/requirements.txt
@@ -1,8 +1,7 @@
-transformers==4.56.2
-torch==2.7.0
-torchvision==0.22.0
-lm_eval==0.4.9.1
-datasets==3.6.0
-deepspeed==0.17.6
-auto-round>=0.8.0
-neural-compressor-pt>=3.6
+transformers==4.57.3
+torch==2.9.0
+torchvision==0.24.0
+lm_eval==0.4.9.2
+datasets==4.4.2
+auto-round>=0.9.3
+neural-compressor-pt>=3.7
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index 5ac00da274a..14d004b8e8a 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -50,14 +50,13 @@ case "$TOPOLOGY" in
     case "$DTYPE" in
         "mxfp8")
             echo "Running Llama 3.1 8B MXFP8 quantization..."
-            CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 1000 --nsamples 512 --export_path \"$OUTPUT_MODEL\""
+            CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\""
             echo "Executing command: $CMD"
             python quantize.py \
                 --model_name_or_path "$INPUT_MODEL" \
                 $COMMON_ARGS \
                 --dtype MXFP8 \
-                --iters 1000 \
-                --nsamples 512 \
+                --iters 0 \
                 --export_path "$OUTPUT_MODEL"
             ;;
         "mxfp4")