diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md index c5a35774a32..7c607241f0c 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md @@ -4,9 +4,9 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP ```bash pip install neural-compressor-pt==3.7 # auto-round -pip install auto-round==0.9.3 +pip install "auto-round @ git+https://github.com/intel/auto-round.git@ds-fp8kv" # vLLM -git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork +git clone -b ds-fp8kv --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv # other requirements pip install -r requirements.txt @@ -27,6 +27,7 @@ bash run_quant.sh --model $MODEL -t mxfp8 --output_dir ./qmodels - MXFP4 ```bash bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels + ``` - NVFP4 @@ -34,6 +35,12 @@ bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels bash run_quant.sh --model $MODEL -t nvfp4 --output_dir ./qmodels ``` +To enable `fp8 kv cache`, please add `-kv fp8`: +```bash +# w/ fp8 kv +bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels -kv fp8 +``` + ## Evaluation ### Prompt Tests diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py index 3f5f8e857d2..958592d8f57 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py @@ -79,6 +79,7 @@ def quant_model(args): export_format=export_format, output_dir=output_dir, low_gpu_mem_usage=True, + static_kv_dtype=args.static_kv_dtype, reloading=False, ) @@ -111,6 +112,12 @@ def quant_model(args): action="store_true", help="Enable torch compile for the model.", ) + parser.add_argument( + "--static_kv_dtype", + type=str, + default=None, + help="Data type to use KV Cache. e.g. 
fp8", + ) parser.add_argument( "--use_autoround_format", action="store_true", diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index 412d0546391..df59a54a58d 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -8,6 +8,7 @@ SCHEME="mxfp8" TASK_NAME="piqa,hellaswag,mmlu" TP_SIZE=8 BATCH_SIZE=512 +KV_CACHE_DTYPE="auto" # Function to display usage usage() { @@ -42,6 +43,10 @@ while [[ $# -gt 0 ]]; do TP_SIZE="$2" shift 2 ;; + -kv) + KV_CACHE_DTYPE="$2" + shift 2 + ;; -b) BATCH_SIZE="$2" shift 2 @@ -71,7 +76,7 @@ OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval" # Create output directory mkdir -p ${OUTPUT_DIR} - +export NCCL_NVLS_ENABLE=0 # Set environment variables based on the quantization scheme if [[ "$SCHEME" == "mxfp4" ]]; then VLLM_AR_MXFP4_MODULAR_MOE=1 @@ -100,6 +105,13 @@ else exit 1 fi +# for fp8 kv cache +if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then + export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=1 + export VLLM_ATTENTION_BACKEND="FLASHINFER_MLA" + echo "Using FP8 for KV cache" +fi + # Run evaluation echo "Evaluating model: ${MODEL_PATH}" echo "Quantization scheme: ${SCHEME}" @@ -117,10 +129,10 @@ VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \ VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \ VLLM_ENABLE_V1_MULTIPROCESSING=1 \ lm_eval --model vllm \ - --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False" \ + --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \ --tasks $TASK_NAME \ --batch_size $BATCH_SIZE \ --log_samples \ --seed 42 \ --output_path ${OUTPUT_DIR} \ - --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt \ No newline at end of file + --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh index eccb1019249..16fa700cc27 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_generate.sh @@ -8,6 +8,8 @@ set -e QUANT_TYPE="mxfp8" MODEL_PATH="/path/to/quantized_model" TP_SIZE=8 +KV_CACHE_DTYPE="auto" + # Function to display usage usage() { @@ -15,11 +17,13 @@ usage() { echo " -s: Quantization scheme (mxfp4 or mxfp8, default: mxfp8)" echo " -m: Path to quantized model (required)" echo " -tp: Tensor parallelism size (default: 8)" + echo " -kv: Data type for KV cache (default: auto)" echo "" echo "Examples:" echo " $0 -s mxfp4 -m /path/to/my/model -tp 4" echo " $0 -m /path/to/my/model" echo " $0 -s mxfp8 -m /path/to/my/model" + each " $0 -kv fp8 -m /path/to/my/model" } # Parse command line arguments @@ -37,6 +41,10 @@ while [[ $# -gt 0 ]]; do 
             TP_SIZE="$2"
             shift 2
             ;;
+        -kv)
+            KV_CACHE_DTYPE="$2"
+            shift 2
+            ;;
         -h)
             usage
             exit 0
@@ -99,6 +107,13 @@ else
     echo "Using MXFP8 configuration"
 fi
 
+# for fp8 kv cache
+if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
+    export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=1
+    export VLLM_ATTENTION_BACKEND="FLASHINFER_MLA"
+    echo "Using FP8 for KV cache"
+fi
+
 # Common environment variables
 export VLLM_ENABLE_STATIC_MOE=0
 export VLLM_MXFP4_PRE_UNPACK_WEIGHTS=0
@@ -121,4 +136,5 @@ python generate.py \
     --max-num-seqs 4 \
     --max-model-len 2048 \
     --gpu_memory_utilization 0.75 \
-    --no-enable-prefix-caching
\ No newline at end of file
+    --no-enable-prefix-caching \
+    --kv-cache-dtype $KV_CACHE_DTYPE
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
index e1063815120..2ef5b98fd7b 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
@@ -4,11 +4,13 @@ set -e
 MODEL=""
 TARGET=""
 OUTPUT_DIR=""
+STATIC_KV_DTYPE="None"
 
 usage() {
     echo "Usage: $0 --model MODEL -t [mxfp4|mxfp8] --output_dir DIR"
     echo "  --model       Hugging Face model ID or local path"
     echo "  -t            quantization target (e.g. mxfp8, mxfp4)"
+    echo "  -kv           datatype for kv cache (auto, fp8)"
     echo "  --output_dir  output directory for quantized model"
     exit 1
 }
@@ -23,6 +25,10 @@ while [[ $# -gt 0 ]]; do
             TARGET="$2"
             shift 2
             ;;
+        -kv)
+            STATIC_KV_DTYPE="$2"
+            shift 2
+            ;;
         --output_dir)
             OUTPUT_DIR="$2"
             shift 2
@@ -46,4 +52,5 @@ python quantize.py \
     --model "$MODEL" \
     -t "$TARGET" \
     --use_autoround_format \
-    --output_dir "$OUTPUT_DIR"
\ No newline at end of file
+    --output_dir "$OUTPUT_DIR" \
+    $( [ "$STATIC_KV_DTYPE" != "None" ] && echo "--static_kv_dtype $STATIC_KV_DTYPE" )
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
new file mode 100644
index 00000000000..d2fa0d478d0
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
@@ -0,0 +1,10 @@
+pip install -r requirements.txt
+pip install setuptools --upgrade
+pip install packaging --upgrade
+pip install -U "huggingface_hub[cli]"
+# Install vllm
+git clone -b ds-fp8kv --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
+VLLM_USE_PRECOMPILED=1 pip install --editable . -v
+cd ..
+# Uninstall flash_attn to avoid conflicts +pip uninstall flash_attn -y \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md index 4c5183e21ee..33b3d540eec 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md @@ -39,6 +39,7 @@ export MODEL=Qwen/Qwen3-30B-A3B bash run_quant.sh --model $MODEL -t mxfp4 --output_dir ./qmodels -kv "fp8" ``` + ## Evaluation ### Prompt Tests diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py index 549919ae5e5..a449aa3c79e 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/quantize.py @@ -31,6 +31,12 @@ "fp_layers": "lm_head,mlp.gate,self_attn", "iters": 200, }, + "mxfp4_fp8kv": { + "scheme": "MXFP4", + "fp_layers": "lm_head,mlp.gate,self_attn", + "iters": 0, + "static_kv_dtype": "fp8", + }, } @@ -55,7 +61,8 @@ def quant_model(args): convert, prepare, ) - + if args.t == "mxfp4" and args.kv_cache_dtype == "fp8": + args.t = "mxfp4_fp8kv" config = topologies_config[args.t] export_format = "auto_round" if args.use_autoround_format else "llm_compressor" output_dir = f"{args.output_dir}/quantized_model_{args.t}" @@ -115,7 +122,13 @@ def quant_model(args): action="store_true", help="Use AutoRound format for saving the quantized model.", ) - + parser.add_argument( + "--kv_cache_dtype", + type=str, + choices=["fp8", "auto"], + default="auto", + help="Data type for KV cache. 
Options are 'fp8' or 'auto'.", + ) parser.add_argument( "--skip_attn", action="store_true", diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh index d0039e5ecff..20fd5699eec 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh @@ -8,6 +8,7 @@ SCHEME="mxfp8" TASK_NAME="piqa,hellaswag,mmlu" TP_SIZE=8 BATCH_SIZE=512 +KV_CACHE_DTYPE="auto" # Function to display usage usage() { @@ -42,6 +43,10 @@ while [[ $# -gt 0 ]]; do TP_SIZE="$2" shift 2 ;; + -kv) + KV_CACHE_DTYPE="$2" + shift 2 + ;; -b) BATCH_SIZE="$2" shift 2 @@ -93,6 +98,13 @@ else exit 1 fi +# for fp8 kv cache +if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then + export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=1 + export VLLM_ATTENTION_BACKEND="FLASHINFER" + echo "Using FP8 for KV cache" +fi + # Run evaluation echo "Evaluating model: ${MODEL_PATH}" echo "Quantization scheme: ${SCHEME}" @@ -110,7 +122,7 @@ VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \ VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \ VLLM_ENABLE_V1_MULTIPROCESSING=1 \ lm_eval --model vllm \ - --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False" \ + --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \ --tasks $TASK_NAME \ --batch_size $BATCH_SIZE \ --log_samples \ diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_generate.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_generate.sh index 863052d0f68..514fa4fcd3d 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_generate.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_generate.sh @@ -99,6 +99,13 @@ else echo "Using MXFP8 configuration" fi +# for fp8 kv cache +if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then + export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=1 + export VLLM_ATTENTION_BACKEND="FLASHINFER" + echo "Using FP8 for KV cache" +fi + # Common environment variables export VLLM_ENABLE_AR_EXT=1 export VLLM_ENABLE_STATIC_MOE=0 @@ -123,4 +130,4 @@ python generate.py \ --gpu_memory_utilization 0.75 \ --no-enable-prefix-caching \ --enable_expert_parallel \ - --kv-cache-dtype $KV_CACHE_DTYPE \ No newline at end of file + --kv-cache-dtype $KV_CACHE_DTYPE diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh index 050aeb6b077..0cbea9f4b7b 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh @@ -55,4 +55,4 @@ python quantize.py \ -t "$TARGET" \ --use_autoround_format \ 
--output_dir "$OUTPUT_DIR" \ - $( [ "$STATIC_KV_DTYPE" != "None" ] && echo "--static_kv_dtype $STATIC_KV_DTYPE" ) \ No newline at end of file + $( [ "$STATIC_KV_DTYPE" != "None" ] && echo "--static_kv_dtype $STATIC_KV_DTYPE" ) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/setup.sh new file mode 100644 index 00000000000..d2fa0d478d0 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/setup.sh @@ -0,0 +1,10 @@ +pip install -r requirements.txt +pip install setuptools --upgrade +pip install packaging --upgrade +pip install -U "huggingface_hub[cli]" +# Install vllm +git clone -b ds-fp8kv --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork +VLLM_USE_PRECOMPILED=1 pip install --editable . -v +cd .. +# Uninstall flash_attn to avoid conflicts +pip uninstall flash_attn -y \ No newline at end of file