# Add GSM8K evaluation script and AWQ+FP8 results (#2330)
**Merged** · +61 −0 · changes shown from 8 of 14 commits
**Commits (14):**

- `6ed0e44` Add GSM8K evaluation script and AWQ+FP8 results (rtj1)
- `a883c17` Address review feedback: improve sed specificity and add path validation (rtj1)
- `14bc4cc` Update examples/awq/gsm8k_eval.py (rtj1)
- `680d8e7` Merge branch 'main' into add-gsm8k-eval-fp8 (rtj1)
- `1151344` Merge branch 'main' into add-gsm8k-eval-fp8 (HDCharles)
- `151a5b9` Fix code style and formatting issues (rtj1)
- `0f35746` Merge branch 'main' into add-gsm8k-eval-fp8 (brian-dellabetta)
- `9d425dd` Update results with Llama-3-8B evaluation data (rtj1)
- `002b77a` Address review feedback from brian-dellabetta (rtj1)
- `c99cb45` Merge branch 'main' into add-gsm8k-eval-fp8 (HDCharles)
- `6110243` Remove redundant summary line per HDCharles review (rtj1)
- `6601810` Merge branch 'main' into add-gsm8k-eval-fp8 (rtj1)
- `dd7077a` cleaning (HDCharles)
- `344b76d` Add discussion on FP8_BLOCK vs FP8_DYNAMIC performance (HDCharles)
### `RESULTS.md` (new file, +67 lines)

# AWQ + FP8 Quantization Results

Closes #2305

**Model:** Meta-Llama-3-8B-Instruct
**Hardware:** 8x NVIDIA A100-SXM4-80GB
**Date:** Feb 10, 2026
## Summary

Ran the example scripts with both FP8 schemes (FP8_DYNAMIC and FP8_BLOCK) on Meta-Llama-3-8B-Instruct, then evaluated on GSM8K as requested in #2305. FP8_DYNAMIC performs better overall.

This PR adds:
- `gsm8k_eval.py` - evaluation script for running GSM8K benchmarks
- `RESULTS.md` - results and reproducible workflow
## GSM8K Results

| Scheme | Strict Match | Flexible Extract |
|--------|-------------|------------------|
| **FP8_DYNAMIC** | **76.42%** | **76.19%** |
| FP8_BLOCK | 75.21% | 74.98% |

FP8_DYNAMIC wins by ~1.2 percentage points on strict matching. Both achieve similar performance on flexible extraction.

**Evaluation details:**
- 1,319 test samples
- Batch size: 16
- Model: Meta-Llama-3-8B-Instruct
## Model Checkpoints

- FP8_DYNAMIC: https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-awq-asym-fp8-dynamic
- FP8_BLOCK: https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-awq-asym-fp8-block
## Setup

Use the existing example scripts from the repo:

```bash
cd examples/awq
python fp8_dynamic_llama_example.py
python fp8_block_llama_example.py
```
## Evaluation

Use `gsm8k_eval.py` for running benchmarks:

```bash
python gsm8k_eval.py <model_path>
```

Or directly with lm-eval:

```bash
lm_eval \
    --model hf \
    --model_args pretrained=<model_path>,dtype=auto \
    --tasks gsm8k \
    --batch_size 16 \
    --output_path <output_dir>
```

**Important:** Set `batch_size=16` explicitly. The default `auto` picks a batch size of 1 here, which significantly increases evaluation time.
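After a run, the accuracy numbers live in the JSON that lm-eval writes under `--output_path`. Below is a minimal sketch for pulling them out; the `results_*.json` filename pattern and the metric key names (e.g. `exact_match,strict-match`) vary across lm-eval versions, so treat both as assumptions and match on substrings rather than exact keys:

```python
import json
from pathlib import Path


def extract_gsm8k_scores(results: dict) -> dict:
    """Pull the GSM8K accuracy metrics out of a parsed lm-eval results dict.

    Metric key names differ across lm-eval versions (e.g.
    "exact_match,strict-match"), so match on substrings and skip
    stderr entries and non-numeric values.
    """
    task = results["results"]["gsm8k"]
    scores = {}
    for key, value in task.items():
        if "stderr" in key or not isinstance(value, (int, float)):
            continue
        if "strict" in key:
            scores["strict_match"] = value
        elif "flexible" in key:
            scores["flexible_extract"] = value
    return scores


def load_results(output_dir: str) -> dict:
    """Load the newest results_*.json that lm-eval wrote under output_dir."""
    latest = max(
        Path(output_dir).rglob("results_*.json"),
        key=lambda p: p.stat().st_mtime,
    )
    return json.loads(latest.read_text())
```

Usage would be `extract_gsm8k_scores(load_results("<output_dir>"))`, returning fractions such as `{"strict_match": 0.7642, "flexible_extract": 0.7619}`.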
## Recommendation

**Use FP8_DYNAMIC** for AWQ quantization: it preserves accuracy better (76.42% vs 75.21% on GSM8K strict matching) with otherwise similar model characteristics.
### `examples/awq/gsm8k_eval.py` (new file, +61 lines)
| """ | ||
| GSM8K evaluation script for AWQ+FP8 quantized models. | ||
|
|
||
| Usage: | ||
| python gsm8k_eval.py <model_path> | ||
|
|
||
| Example: | ||
| python gsm8k_eval.py ./Qwen2.5-0.5B-Instruct-awq-fp8-dynamic | ||
| """ | ||
|
|
||
| import argparse | ||
| import os | ||
| import subprocess | ||
| import sys | ||
|
|
||
|
|
||
| def evaluate_model(model_path): | ||
| """Run GSM8K eval using lm-eval.""" | ||
| print(f"\nEvaluating {model_path} on GSM8K...") | ||
|
|
||
| # Output dir based on model path | ||
| output_dir = os.path.basename(model_path.rstrip("/")) + "_gsm8k_results" | ||
|
|
||
| # Run lm-eval with batch_size=16 | ||
| # Note: Don't use batch_size=auto, it defaults to 1 which is super slow | ||
| cmd = [ | ||
| "lm_eval", | ||
| "--model", | ||
| "hf", | ||
| "--model_args", | ||
| f"pretrained={model_path},dtype=auto", | ||
| "--tasks", | ||
| "gsm8k", | ||
| "--batch_size", | ||
| "16", | ||
| "--output_path", | ||
| output_dir, | ||
| ] | ||
|
|
||
| try: | ||
| subprocess.run(cmd, check=True) | ||
| print(f"\nResults saved to {output_dir}/") | ||
| except subprocess.CalledProcessError as e: | ||
| print(f"Evaluation failed: {e}") | ||
| sys.exit(1) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| parser = argparse.ArgumentParser(description="Eval quantized models on GSM8K") | ||
| parser.add_argument("model_path", help="Path to quantized model directory") | ||
| args = parser.parse_args() | ||
|
|
||
| if not os.path.isdir(args.model_path): | ||
| print(f"Error: Model path not found: {args.model_path}", file=sys.stderr) | ||
| sys.exit(1) | ||
|
|
||
| if not os.path.isdir(args.model_path): | ||
| print(f"Error: Model path not found: {args.model_path}", file=sys.stderr) | ||
| sys.exit(1) | ||
|
|
||
| evaluate_model(args.model_path) |
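The comparison table in `RESULTS.md` can be regenerated mechanically from score fractions like those lm-eval reports. A small illustrative sketch (the `format_comparison` helper is ours, not part of the PR):

```python
def format_comparison(rows):
    """Render {scheme: {"strict_match": f, "flexible_extract": f}} as a
    markdown table, with fractions shown as percentages."""
    lines = [
        "| Scheme | Strict Match | Flexible Extract |",
        "|--------|-------------|------------------|",
    ]
    for scheme, s in rows.items():
        lines.append(
            f"| **{scheme}** | {s['strict_match']:.2%} | {s['flexible_extract']:.2%} |"
        )
    return "\n".join(lines)


print(format_comparison({
    "FP8_DYNAMIC": {"strict_match": 0.7642, "flexible_extract": 0.7619},
    "FP8_BLOCK": {"strict_match": 0.7521, "flexible_extract": 0.7498},
}))
```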