EvolvingLMMs-Lab · Luodian · Feb 21, 2026 · Feb 20, 2026
diff --git a/lmms_eval/api/instance.py b/lmms_eval/api/instance.py
@@ -71,7 +71,7 @@ class Instance:
     request_type: Literal["loglikelihood", "generate_until", "generate_until_multi_round", "generate_until_agentic"]
     arguments: tuple
     idx: int
-    metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None))  # TODO: better typehints here
+    metadata: Dict[str, Union[str, int]] = field(default_factory=dict)
     resps: list = field(default_factory=list)
     filtered_resps: dict = field(default_factory=dict)
     raw_filtered_resps: dict = field(default_factory=dict)

diff --git a/lmms_eval/caching/response_cache.py b/lmms_eval/caching/response_cache.py
@@ -6,7 +6,7 @@
 
 Activation: ``python -m lmms_eval --model ... --tasks ... --use_cache ./eval_cache``
 
-Cache key: sha256(request_type, task_name, doc_id, idx, canonical gen_kwargs).
+Cache key: sha256(request_type, task_name, doc_id, idx, canonical gen_kwargs, content_hash).
 Scoped per model: ``{use_cache}/{model_hash}/rank{N}.db``
 """
 
@@ -164,12 +164,14 @@ def fingerprint_callable(fn) -> str:
 
 
 def _extract_content_hash(instance: Instance) -> str:
-    """Hash the text content of loglikelihood args to prevent collisions.
+    """Hash leading text arguments to prevent cache-key collisions.
 
-    For multiple_choice with acc_mutual_info, conditional requests have
-    ``(ctx, continuation, ...)`` while unconditional have ``("", choice)``.
-    Both share the same (task_name, doc_id, idx) so we need this hash
-    to distinguish them.
+    Some flows can issue multiple deterministic requests that share the same
+    ``(task_name, doc_id, idx, gen_kwargs)`` while differing in prompt text.
+    This is common in multi-round / agentic generation loops.
+
+    We hash the leading consecutive string arguments (for example context and
+    continuation) so those requests do not alias to the same cache entry.
     """
     args = instance.args
     text_parts = []
@@ -375,7 +377,7 @@ def execute(self, lm: Any, reqtype: str, requests: List[Instance]) -> list:
                 self._skipped += 1
                 continue
 
-            ch = _extract_content_hash(req) if reqtype == "loglikelihood" else ""
+            ch = _extract_content_hash(req)
             tf = self._task_fingerprints.get(req.task_name, "")
             cache_key = compute_cache_key(
                 request_type=reqtype,
@@ -406,7 +408,7 @@ def execute(self, lm: Any, reqtype: str, requests: List[Instance]) -> list:
                 cacheable = self._extract_cacheable(resp)
                 gen_kwargs = extract_gen_kwargs(req)
                 deterministic = is_deterministic(reqtype, gen_kwargs)
-                ch = _extract_content_hash(req) if reqtype == "loglikelihood" else ""
+                ch = _extract_content_hash(req)
                 tf = self._task_fingerprints.get(req.task_name, "")
                 cache_key = compute_cache_key(request_type=reqtype, task_name=req.task_name, doc_id=req.doc_id, gen_kwargs=gen_kwargs, idx=req.idx, content_hash=ch, task_fingerprint=tf) if deterministic else ""
                 self._log_to_audit(reqtype, req.task_name, req.doc_id, req.idx, gen_kwargs, cacheable, cache_key=cache_key, deterministic=deterministic)

diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
@@ -463,7 +463,12 @@ def _adjust_config(task_dict):
 decontaminate_suffix = "_decontaminate"
 
 
-def _run_generate_until_agentic(lm, requests: list[Instance], agentic_trace_mode: str = "basic") -> list[str]:
+def _run_generate_until_agentic(
+    lm,
+    requests: list[Instance],
+    agentic_trace_mode: str = "basic",
+    response_cache: Optional[ResponseCache] = None,
+) -> list[str]:
     responses: list[str] = []
 
     for req in requests:
@@ -522,7 +527,11 @@ def _agentic_doc_to_messages(_doc):
                     idx=0,
                     metadata=req.metadata,
                 )
-            current_output = lm.generate_until([single_req])[0]
+            if response_cache is not None:
+                current_raw_output = response_cache.execute(lm, "generate_until", [single_req])[0]
+            else:
+                current_raw_output = lm.generate_until([single_req])[0]
+            current_output, _ = unwrap_generation_output(current_raw_output)
             model_outputs.append(current_output)
             final_response = current_output
 
@@ -607,7 +616,7 @@ def _agentic_doc_to_messages(_doc):
 
 @positional_deprecated
 def evaluate(
-    lm: "LM",
+    lm,
     task_dict,
     limit: Optional[int] = None,
     offset: int = 0,
@@ -694,7 +703,7 @@ def evaluate(
         lm.accelerator = Accelerator()
 
     for task_output in eval_tasks:
-        task: Task = task_output.task
+        task = task_output.task
         task_name = task_output.task_name
         task.args = cli_args
 
@@ -790,7 +799,12 @@ def evaluate(
             trace_mode = "basic"
             if cli_args is not None:
                 trace_mode = getattr(cli_args, "agentic_trace_mode", "basic")
-            resps = _run_generate_until_agentic(lm, cloned_reqs, agentic_trace_mode=trace_mode)
+            resps = _run_generate_until_agentic(
+                lm,
+                cloned_reqs,
+                agentic_trace_mode=trace_mode,
+                response_cache=response_cache,
+            )
         elif response_cache is not None:
             resps = response_cache.execute(lm, reqtype, cloned_reqs)
         else:

diff --git a/lmms_eval/models/chat/bagel_lmms_engine.py b/lmms_eval/models/chat/bagel_lmms_engine.py
@@ -62,8 +62,6 @@ def __init__(
         text_temperature: float = 0.3,
         seed: int = 0,
         image_ratio: str = "1:1",
-        continual_mode: bool = True,
-        response_persistent_folder: Optional[str] = None,
         device: Optional[str] = "cuda",
         device_map: Optional[str] = None,
         **kwargs,
@@ -74,7 +72,6 @@ def __init__(
         self.load_in_4bit = load_in_4bit
         self.load_in_8bit = load_in_8bit
         self.show_thinking = show_thinking
-        self.continual_mode = continual_mode
 
         # Generation hyperparameters
         self.cfg_text_scale = cfg_text_scale
@@ -106,7 +103,7 @@ def __init__(
             self.image_shapes = (1024, 1024)
 
         if output_image_dir is None:
-            self.output_image_dir = os.path.join(self.response_persistent_folder, "bagel_generated_images")
+            self.output_image_dir = "./logs/bagel_generated_images"
         else:
             self.output_image_dir = output_image_dir
 

diff --git a/lmms_eval/models/chat/openai.py b/lmms_eval/models/chat/openai.py
@@ -1,4 +1,3 @@
-import json
 import time
 from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
 from typing import List, Union
@@ -71,13 +70,14 @@ def generate_until(self, requests) -> List[GenerationResult]:
         latencies: List[float] = []
         completed_since_adapt = 0
         in_flight = {}
-        doc_uuids: List[Union[str, None]] = [None] * len(reordered_requests)
         max_workers = max(
             1,
             self.adaptive_config.max_concurrency if self.adaptive_concurrency else current_concurrency,
         )
 
-        def process_single_request(local_index: int, payload: dict):
+        def process_single_request(local_index: int, payload: dict | None):
+            if payload is None:
+                return "", local_index, False, False, 0.0, 0, 0, 0
             started_at = time.time()
             rate_limited = False
             last_error_msg = "unknown error"
@@ -170,15 +170,9 @@ def maybe_update_concurrency(force: bool = False) -> None:
             latencies = []
             completed_since_adapt = 0
 
-        def build_payload_for_index(global_index: int):
+        def build_payload_for_index(global_index: int) -> dict:
             req = reordered_requests[global_index]
             _, doc_to_messages, gen_kwargs, doc_id, task, split = req.args
-            doc_uuid = f"{task}___{split}___{doc_id}"
-
-            if self.continual_mode and self.cache_mode == "resume":
-                cached_response = self.response_cache.get(doc_uuid)
-                if cached_response:
-                    return doc_uuid, cached_response, None
 
             chat_messages_raw = doc_to_messages(self.task_dict[task][split][doc_id])
             chat_messages: ChatMessages = ChatMessages(**{"messages": chat_messages_raw})
@@ -199,16 +193,15 @@ def build_payload_for_index(global_index: int):
                 payload["response_format"] = {"type": "text"}
                 payload["max_completion_tokens"] = 5000
 
-            return doc_uuid, None, payload
+            return payload
 
         with ThreadPoolExecutor(max_workers=max_workers) as executor:
             while cursor < len(dispatch_order) or in_flight:
                 while cursor < len(dispatch_order) and len(in_flight) < max(1, current_concurrency):
                     request_index = dispatch_order[cursor]
-                    doc_uuid, cached_response, payload = build_payload_for_index(request_index)
-                    doc_uuids[request_index] = doc_uuid
-                    if cached_response is not None:
-                        responses[request_index] = GenerationResult(text=cached_response, token_counts=TokenCounts())
+                    payload = build_payload_for_index(request_index)
+                    if payload is None:
+                        responses[request_index] = GenerationResult(text="", token_counts=TokenCounts())
                         pbar.update(1)
                         cursor += 1
                         continue
@@ -255,19 +248,13 @@ def build_payload_for_index(global_index: int):
                     if rate_limited:
                         rate_limited_requests += 1
                     completed_since_adapt += 1
-                    if self.continual_mode and doc_uuids[local_index] is not None:
-                        self.response_cache[doc_uuids[local_index]] = response_text
                     totals = get_running_totals()
                     pbar.set_postfix({"tokens": f"{totals['total_tokens']:,}"}, refresh=False)
                     pbar.update(1)
                     maybe_update_concurrency(force=False)
 
         maybe_update_concurrency(force=True)
 
-        if self.continual_mode:
-            with open(self.response_persistent_file, "w") as f:
-                json.dump(self.response_cache, f)
-
         avg_speed = total_tokens / total_latency if total_latency > 0 else 0
         log_metrics(
             total_elapsed_time=total_latency,

diff --git a/lmms_eval/models/simple/claude.py b/lmms_eval/models/simple/claude.py
@@ -1,4 +1,3 @@
-import json
 import os
 import time
 from copy import deepcopy
@@ -43,8 +42,6 @@ def __init__(
         system_prompt: str = "",  # Whether you want some special system prompt here
         modality: str = "image",
         max_frames_num: int = 10,
-        continual_mode: bool = False,
-        response_persistent_folder: str = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -53,24 +50,6 @@ def __init__(
         self.system_prompt = system_prompt
         self.modality = modality
         self.max_frames_num = max_frames_num
-
-        self.continual_mode = continual_mode
-        if self.continual_mode:
-            if response_persistent_folder is None:
-                raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")
-
-            os.makedirs(response_persistent_folder, exist_ok=True)
-            self.response_persistent_folder = response_persistent_folder
-            self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
-
-            if os.path.exists(self.response_persistent_file):
-                with open(self.response_persistent_file, "r") as f:
-                    self.response_cache = json.load(f)
-                self.cache_mode = "resume"
-            else:
-                self.response_cache = {}
-                self.cache_mode = "start"
-
         accelerator = Accelerator()
         if accelerator.num_processes > 1:
             assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
@@ -174,16 +153,6 @@ def generate_until(self, requests) -> List[GenerationResult]:
                 res.append(GenerationResult(text="", token_counts=None))
                 pbar.update(1)
                 continue
-            ###################### CONTINUAL MODE ######################
-            if self.continual_mode is True and self.cache_mode == "resume":
-                doc_uuid = f"{task}___{split}___{doc_id}"
-                if doc_uuid in self.response_cache:
-                    response_text = self.response_cache[doc_uuid]
-                    if response_text:
-                        res.append(GenerationResult(text=response_text, token_counts=None))
-                        pbar.update(1)
-                        continue
-
             visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
             visuals = self.flatten(visuals)
             imgs = []
@@ -271,14 +240,6 @@ def generate_until(self, requests) -> List[GenerationResult]:
             res.append(GenerationResult(text=response_text, token_counts=token_counts))
             pbar.update(1)
 
-            ###################### CONTINUAL MODE ######################
-            if self.continual_mode is True:  # Cache the response
-                response_text = message.content[0].text
-                doc_uuid = f"{task}___{split}___{doc_id}"
-                self.response_cache[doc_uuid] = response_text
-                with open(self.response_persistent_file, "w") as f:
-                    json.dump(self.response_cache, f, indent=4)
-
         pbar.close()
 
         return res

diff --git a/lmms_eval/models/simple/gemini_api.py b/lmms_eval/models/simple/gemini_api.py
@@ -1,5 +1,4 @@
 import io
-import json
 import os
 import pathlib
 import re
@@ -41,38 +40,17 @@ def __init__(
         model_version: str = "gemini-1.5-pro",
         # modality: str = "image",
         timeout: int = 120,
-        continual_mode: bool = True,
-        response_persistent_folder: str = "./logs/gemini_persistent_folder",
         interleave: bool = False,
-        # We will cache the Gemini API response in this path and use it for future requests
         **kwargs,
     ) -> None:
         super().__init__()
         self.model_version = model_version
         self.timeout = timeout
         self.model = genai.GenerativeModel(model_version)
-        self.continual_mode = continual_mode
-        self.response_persistent_file = ""
         self.interleave = interleave
-        # if self.continual_mode and response_persistent_folder is None:
-        #     raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
-        if self.continual_mode:
-            self.response_persistent_folder = response_persistent_folder
-            if not os.path.exists(self.response_persistent_folder):
-                os.makedirs(self.response_persistent_folder)
-            self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")
-
-        if os.path.exists(self.response_persistent_file):
-            with open(self.response_persistent_file, "r") as f:
-                self.response_cache = json.load(f)
-            self.cache_mode = "resume"
-        else:
-            self.response_cache = {}
-            self.cache_mode = "start"
 
         accelerator = Accelerator()
         if accelerator.num_processes > 1:
-            assert self.continual_mode is False, "Continual mode is not supported with distributed inference."
             assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
             self.accelerator = accelerator
             if self.accelerator.is_local_main_process:
@@ -163,15 +141,6 @@ def get_uuid(task, split, doc_id):
                 res.append(GenerationResult(text="", token_counts=None))
                 pbar.update(1)
                 continue
-            if self.continual_mode and self.cache_mode == "resume":
-                doc_uuid = get_uuid(task, split, doc_id)
-                if doc_uuid in self.response_cache:
-                    content = self.response_cache[doc_uuid]
-                    if content:
-                        res.append(GenerationResult(text=content, token_counts=None))
-                        pbar.update(1)
-                        continue
-
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
             if "temperature" not in gen_kwargs:
@@ -239,12 +208,6 @@ def get_uuid(task, split, doc_id):
 
             self.free_video()
 
-            if self.continual_mode is True:  # Cache the response
-                doc_uuid = get_uuid(task, split, doc_id)
-                self.response_cache[doc_uuid] = content
-                with open(self.response_persistent_file, "w") as f:
-                    json.dump(self.response_cache, f)
-
         pbar.close()
         return res