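Fix comparison: raise vLLM max_model_len from max_context_tokens + 128 to max_context_tokens + 512 in the accuracy, comparison, latency, and memory benchmarks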

pythongiant · pythongiant · commit b6fe2d9ff414 · 2026-04-27T13:45:12.000+05:30
diff --git a/benchmarks_and_experiments/important/accuracy_benchmark.py b/benchmarks_and_experiments/important/accuracy_benchmark.py
@@ -435,7 +435,7 @@ def _run_vllm_prefixcache(samples: List[Dict], model: str, max_new_tokens: int =
     log.info("[vllm_prefixcache accuracy] submitting %d prompts in batches of %d ...", n, _VLLM_BATCH)
     prompts = [_format_prompt(s["context"], s["input"], s.get("choices")) for s in samples]
     llm = LLM(model=model, enable_prefix_caching=True,
-              max_model_len=max_context_tokens + 128,
+              max_model_len=max_context_tokens + 512,
               gpu_memory_utilization=gpu_memory_utilization,
               enforce_eager=enforce_eager,
               max_num_seqs=max_num_seqs)
diff --git a/benchmarks_and_experiments/important/comparison_benchmark.py b/benchmarks_and_experiments/important/comparison_benchmark.py
@@ -353,7 +353,7 @@ def _run_vllm(
 
         tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         llm = LLM(model=self.model_name, enable_prefix_caching=True,
-                  max_model_len=max_context_tokens + 128,
+                  max_model_len=max_context_tokens + 512,
                   gpu_memory_utilization=0.95,
                   enforce_eager=True,
                   max_num_seqs=1)
diff --git a/benchmarks_and_experiments/important/latency_benchmark.py b/benchmarks_and_experiments/important/latency_benchmark.py
@@ -371,7 +371,7 @@ def _measure_vllm_prefixcache(
 
     tokenizer = AutoTokenizer.from_pretrained(model)
     llm = LLM(model=model, enable_prefix_caching=enable_prefix_caching,
-              max_model_len=max_context_tokens + 128,
+              max_model_len=max_context_tokens + 512,
               gpu_memory_utilization=gpu_memory_utilization,
               enforce_eager=enforce_eager,
               max_num_seqs=max_num_seqs)
diff --git a/benchmarks_and_experiments/important/memory_benchmark.py b/benchmarks_and_experiments/important/memory_benchmark.py
@@ -397,7 +397,7 @@ def _measure_vllm_prefixcache(
 
     tokenizer = AutoTokenizer.from_pretrained(model)
     llm = LLM(model=model, enable_prefix_caching=True,
-              max_model_len=max_context_tokens + 128,
+              max_model_len=max_context_tokens + 512,
               gpu_memory_utilization=gpu_memory_utilization,
               enforce_eager=enforce_eager,
               max_num_seqs=max_num_seqs)