
Commit a089bfb

Merge branch 'vllm-project:main' into deprecation/torch-dtype-to-dtype
2 parents: 0f8ba84 + ceed4df

File tree

4 files changed (+11 −4 lines)


setup.py

Lines changed: 1 addition & 1 deletion

@@ -157,7 +157,7 @@ def localversion_func(version: ScmVersion) -> str:
     "pytest>=6.0.0",
     "pytest-mock>=3.6.0",
     "pytest-rerunfailures>=13.0",
-    "lm_eval==0.4.9",
+    "lm_eval==0.4.9.2",
     # test dependencies
     "beautifulsoup4~=4.12.3",
     "cmarkgfm>=2024.1.14",
Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# Quantizing models without a model definition
+
+`model_free_ptq` provides a PTQ pathway for data-free schemes (such as FP8 Dynamic Per Token or FP8 Block). Specifically, this pathway removes the requirement for a model definition or the need to load the model through transformers. If you are interested in applying a data-free scheme, there are two key scenarios in which this pathway may make sense for your model:
+
+1. The model does not have a model definition available through transformers. This may be the case for a brand-new model that has not yet landed in transformers.
+2. The model is very large (such as Kimi K2 Thinking) and runs into issues with `oneshot`.
+
+
+`model_free_ptq` works directly with the safetensors in the checkpoint, to which observers are applied, thereby removing the requirement for a model definition or transformers.
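The new document describes the pathway, but this page does not show its call signature. Below is a minimal, hypothetical sketch of how such a data-free PTQ entry point might be invoked; the import path and the parameter names `model_stub`, `save_directory`, `scheme`, and `ignore` are illustrative assumptions, not confirmed by this diff — consult the llm-compressor documentation for the actual interface.

```python
# Hypothetical sketch of a model_free_ptq call; the import path and all
# parameter names below are assumptions, not confirmed by this commit.
from llmcompressor import model_free_ptq  # assumed import location

model_free_ptq(
    # Hugging Face stub or local path of the source checkpoint (assumed name).
    model_stub="moonshotai/Kimi-K2-Thinking",
    # Directory where the quantized safetensors are written (assumed name).
    save_directory="Kimi-K2-Thinking-FP8-BLOCK",
    # One of the data-free schemes named in the document above.
    scheme="FP8_BLOCK",
    # Modules to leave unquantized, e.g. the output head (assumed name).
    ignore=["lm_head"],
)
```

Because the observers run over the checkpoint's safetensors rather than an instantiated module graph, no transformers load is needed, which is presumably what makes this pathway viable for very large models such as Kimi K2 Thinking.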

tests/llmcompressor/modifiers/transform/test_correctness.py

Lines changed: 1 addition & 2 deletions

@@ -45,8 +45,7 @@ def test_apply_correctness(
     with torch.no_grad():
         true_output = model(**input)
 
-    modifier.on_initialize(state)
-    modifier.on_start(state, None)
+    modifier.initialize(state)
 
     with torch.no_grad():
         output = model(**input)

tests/llmcompressor/utils/test_helpers.py

Lines changed: 0 additions & 1 deletion

@@ -174,5 +174,4 @@ def hook(module, args):
     with disable_lm_head(model):
         input = {key: value.to("cuda") for key, value in model.dummy_inputs.items()}
         output = model(**input)
-        assert lm_input_device == torch.device("cuda:0")
         assert output.logits.device == torch.device("meta")
