@@ -83,6 +83,7 @@ def _parse_kwargs(self, **kwargs):
- `repetition_penalty`: float, default 1.05. Repetition penalty
- `max_tokens`: int, default 512. Maximum tokens to generate
- `use_cache`: bool, default True. Whether to use response cache
- `quantization`: str, default None. Quantization method (e.g., 'bitsandbytes', 'awq', 'gptq')
"""

self.model_name = kwargs.get("model", None)
@@ -91,7 +92,8 @@ def _parse_kwargs(self, **kwargs):
self.repetition_penalty = kwargs.get("repetition_penalty", 1.05)
self.max_tokens = kwargs.get("max_tokens", 512)
self.use_cache = kwargs.get("use_cache", True)

self.quantization = kwargs.get("quantization", None)

def inference(self, data):
"""Inference the model

@@ -53,16 +53,19 @@ def _load(self, model):
model : str
Hugging Face style model name. Example: `Qwen/Qwen2.5-0.5B-Instruct`
"""
self.model = LLM(
model=model,
trust_remote_code=True,
dtype="float16",
tensor_parallel_size=self.tensor_parallel_size,
gpu_memory_utilization=self.gpu_memory_utilization,
max_model_len = 8192
#quantization=self.quantization # TODO need to align with vllm API
)

llm_kwargs = {
"model": model,
"trust_remote_code": True,
high

Hardcoding trust_remote_code=True can pose a security risk, as it allows arbitrary code execution from the model's repository. It's highly recommended to make this a configurable parameter that defaults to False. Users should explicitly enable it only when they trust the source of the model.
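The change this comment asks for could look like the following sketch. The class and kwarg names here are assumptions for illustration, not the PR's actual code; the vLLM `LLM` constructor does accept a `trust_remote_code` flag, so a parsed value could be forwarded into `llm_kwargs` directly.

```python
class BaseLLMSketch:
    """Hypothetical stand-in for BaseLLM, illustrating the suggestion only."""

    def _parse_kwargs(self, **kwargs):
        # Default to False: code from a model repository should only run
        # when the user has explicitly opted in for a source they trust.
        self.trust_remote_code = kwargs.get("trust_remote_code", False)


llm = BaseLLMSketch()
llm._parse_kwargs()                        # no opt-in: stays False
llm._parse_kwargs(trust_remote_code=True)  # explicit opt-in
```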

"dtype": "float16",
medium

The dtype is hardcoded to "float16". While this is a common default, some models perform better with "bfloat16" (if supported by the hardware), and certain quantization methods might have specific dtype requirements. To improve flexibility, consider making this a configurable parameter, which could be parsed in BaseLLM with a default of "auto" or "float16".
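One way to implement this suggestion, sketched under the assumption that a `dtype` kwarg is parsed in `BaseLLM` (the class and attribute names are hypothetical):

```python
class DtypeSketch:
    """Hypothetical stand-in illustrating a configurable dtype."""

    # Values vLLM commonly accepts for its dtype argument.
    _ALLOWED = {"auto", "float16", "bfloat16", "float32"}

    def _parse_kwargs(self, **kwargs):
        # "auto" lets the backend derive the dtype from the model config,
        # which also sidesteps quantization-specific dtype requirements.
        dtype = kwargs.get("dtype", "auto")
        if dtype not in self._ALLOWED:
            raise ValueError(f"unsupported dtype: {dtype!r}")
        self.dtype = dtype
```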

"tensor_parallel_size": self.tensor_parallel_size,
"gpu_memory_utilization": self.gpu_memory_utilization,
"max_model_len": 8192
medium

The max_model_len is hardcoded to 8192. This could be restrictive for models with larger context windows or inefficient for models with smaller ones. To improve flexibility, consider making this a configurable parameter. You could add max_model_len to _parse_kwargs in base_llm.py with a sensible default, and then use self.max_model_len here.
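Following this suggestion, `max_model_len` could be parsed alongside the other kwargs and consumed when building the constructor arguments. A sketch (names are hypothetical; the default of 8192 simply mirrors the value currently hardcoded):

```python
class MaxLenSketch:
    """Hypothetical stand-in showing a configurable max_model_len."""

    def _parse_kwargs(self, **kwargs):
        # 8192 mirrors the previously hardcoded value; callers can lower it
        # to save KV-cache memory or raise it for long-context models.
        self.max_model_len = kwargs.get("max_model_len", 8192)

    def build_llm_kwargs(self, model):
        # The parsed value would replace the literal 8192 in _load.
        return {"model": model, "max_model_len": self.max_model_len}
```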

}

if self.quantization:
llm_kwargs["quantization"] = self.quantization

self.model = LLM(**llm_kwargs)
self.sampling_params = SamplingParams(
temperature=self.temperature,
top_p=self.top_p,
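Taken together, the optional-kwarg pattern this PR introduces can be exercised in isolation. The sketch below builds a plain dict rather than calling vLLM's `LLM` constructor, so it runs without vLLM installed:

```python
def build_llm_kwargs(model, quantization=None):
    """Mirror the PR's logic: include 'quantization' only when it is set,
    so vLLM's own default (no quantization) applies otherwise."""
    llm_kwargs = {
        "model": model,
        "trust_remote_code": True,
        "dtype": "float16",
        "max_model_len": 8192,
    }
    if quantization:
        llm_kwargs["quantization"] = quantization
    return llm_kwargs


dense = build_llm_kwargs("Qwen/Qwen2.5-0.5B-Instruct")
quantized = build_llm_kwargs("Qwen/Qwen2.5-0.5B-Instruct", quantization="awq")
```

Because the key is added conditionally, existing callers that never pass `quantization` see no behavior change.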