Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lmms_eval/api/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class Instance:
request_type: Literal["loglikelihood", "generate_until", "generate_until_multi_round", "generate_until_agentic"]
arguments: tuple
idx: int
metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here
metadata: Dict[str, Union[str, int]] = field(default_factory=dict)
resps: list = field(default_factory=list)
filtered_resps: dict = field(default_factory=dict)
raw_filtered_resps: dict = field(default_factory=dict)
Expand Down
18 changes: 10 additions & 8 deletions lmms_eval/caching/response_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

Activation: ``python -m lmms_eval --model ... --tasks ... --use_cache ./eval_cache``

Cache key: sha256(request_type, task_name, doc_id, idx, canonical gen_kwargs).
Cache key: sha256(request_type, task_name, doc_id, idx, canonical gen_kwargs, content_hash).
Scoped per model: ``{use_cache}/{model_hash}/rank{N}.db``
"""

Expand Down Expand Up @@ -164,12 +164,14 @@ def fingerprint_callable(fn) -> str:


def _extract_content_hash(instance: Instance) -> str:
"""Hash the text content of loglikelihood args to prevent collisions.
"""Hash leading text arguments to prevent cache-key collisions.

For multiple_choice with acc_mutual_info, conditional requests have
``(ctx, continuation, ...)`` while unconditional have ``("", choice)``.
Both share the same (task_name, doc_id, idx) so we need this hash
to distinguish them.
Some flows can issue multiple deterministic requests that share the same
``(task_name, doc_id, idx, gen_kwargs)`` while differing in prompt text.
This is common in multi-round / agentic generation loops.

We hash the leading consecutive string arguments (for example context and
continuation) so those requests do not alias to the same cache entry.
"""
args = instance.args
text_parts = []
Expand Down Expand Up @@ -375,7 +377,7 @@ def execute(self, lm: Any, reqtype: str, requests: List[Instance]) -> list:
self._skipped += 1
continue

ch = _extract_content_hash(req) if reqtype == "loglikelihood" else ""
ch = _extract_content_hash(req)
tf = self._task_fingerprints.get(req.task_name, "")
cache_key = compute_cache_key(
request_type=reqtype,
Expand Down Expand Up @@ -406,7 +408,7 @@ def execute(self, lm: Any, reqtype: str, requests: List[Instance]) -> list:
cacheable = self._extract_cacheable(resp)
gen_kwargs = extract_gen_kwargs(req)
deterministic = is_deterministic(reqtype, gen_kwargs)
ch = _extract_content_hash(req) if reqtype == "loglikelihood" else ""
ch = _extract_content_hash(req)
tf = self._task_fingerprints.get(req.task_name, "")
cache_key = compute_cache_key(request_type=reqtype, task_name=req.task_name, doc_id=req.doc_id, gen_kwargs=gen_kwargs, idx=req.idx, content_hash=ch, task_fingerprint=tf) if deterministic else ""
self._log_to_audit(reqtype, req.task_name, req.doc_id, req.idx, gen_kwargs, cacheable, cache_key=cache_key, deterministic=deterministic)
Expand Down
24 changes: 19 additions & 5 deletions lmms_eval/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,12 @@ def _adjust_config(task_dict):
decontaminate_suffix = "_decontaminate"


def _run_generate_until_agentic(lm, requests: list[Instance], agentic_trace_mode: str = "basic") -> list[str]:
def _run_generate_until_agentic(
lm,
requests: list[Instance],
agentic_trace_mode: str = "basic",
response_cache: Optional[ResponseCache] = None,
) -> list[str]:
responses: list[str] = []

for req in requests:
Expand Down Expand Up @@ -522,7 +527,11 @@ def _agentic_doc_to_messages(_doc):
idx=0,
metadata=req.metadata,
)
current_output = lm.generate_until([single_req])[0]
if response_cache is not None:
current_raw_output = response_cache.execute(lm, "generate_until", [single_req])[0]
else:
current_raw_output = lm.generate_until([single_req])[0]
current_output, _ = unwrap_generation_output(current_raw_output)
model_outputs.append(current_output)
final_response = current_output

Expand Down Expand Up @@ -607,7 +616,7 @@ def _agentic_doc_to_messages(_doc):

@positional_deprecated
def evaluate(
lm: "LM",
lm,
task_dict,
limit: Optional[int] = None,
offset: int = 0,
Expand Down Expand Up @@ -694,7 +703,7 @@ def evaluate(
lm.accelerator = Accelerator()

for task_output in eval_tasks:
task: Task = task_output.task
task = task_output.task
task_name = task_output.task_name
task.args = cli_args

Expand Down Expand Up @@ -790,7 +799,12 @@ def evaluate(
trace_mode = "basic"
if cli_args is not None:
trace_mode = getattr(cli_args, "agentic_trace_mode", "basic")
resps = _run_generate_until_agentic(lm, cloned_reqs, agentic_trace_mode=trace_mode)
resps = _run_generate_until_agentic(
lm,
cloned_reqs,
agentic_trace_mode=trace_mode,
response_cache=response_cache,
)
elif response_cache is not None:
resps = response_cache.execute(lm, reqtype, cloned_reqs)
else:
Expand Down
5 changes: 1 addition & 4 deletions lmms_eval/models/chat/bagel_lmms_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ def __init__(
text_temperature: float = 0.3,
seed: int = 0,
image_ratio: str = "1:1",
continual_mode: bool = True,
response_persistent_folder: Optional[str] = None,
device: Optional[str] = "cuda",
device_map: Optional[str] = None,
**kwargs,
Expand All @@ -74,7 +72,6 @@ def __init__(
self.load_in_4bit = load_in_4bit
self.load_in_8bit = load_in_8bit
self.show_thinking = show_thinking
self.continual_mode = continual_mode

# Generation hyperparameters
self.cfg_text_scale = cfg_text_scale
Expand Down Expand Up @@ -106,7 +103,7 @@ def __init__(
self.image_shapes = (1024, 1024)

if output_image_dir is None:
self.output_image_dir = os.path.join(self.response_persistent_folder, "bagel_generated_images")
self.output_image_dir = "./logs/bagel_generated_images"
else:
self.output_image_dir = output_image_dir

Expand Down
29 changes: 8 additions & 21 deletions lmms_eval/models/chat/openai.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import time
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait
from typing import List, Union
Expand Down Expand Up @@ -71,13 +70,14 @@ def generate_until(self, requests) -> List[GenerationResult]:
latencies: List[float] = []
completed_since_adapt = 0
in_flight = {}
doc_uuids: List[Union[str, None]] = [None] * len(reordered_requests)
max_workers = max(
1,
self.adaptive_config.max_concurrency if self.adaptive_concurrency else current_concurrency,
)

def process_single_request(local_index: int, payload: dict):
def process_single_request(local_index: int, payload: dict | None):
if payload is None:
return "", local_index, False, False, 0.0, 0, 0, 0
started_at = time.time()
rate_limited = False
last_error_msg = "unknown error"
Expand Down Expand Up @@ -170,15 +170,9 @@ def maybe_update_concurrency(force: bool = False) -> None:
latencies = []
completed_since_adapt = 0

def build_payload_for_index(global_index: int):
def build_payload_for_index(global_index: int) -> dict:
req = reordered_requests[global_index]
_, doc_to_messages, gen_kwargs, doc_id, task, split = req.args
doc_uuid = f"{task}___{split}___{doc_id}"

if self.continual_mode and self.cache_mode == "resume":
cached_response = self.response_cache.get(doc_uuid)
if cached_response:
return doc_uuid, cached_response, None

chat_messages_raw = doc_to_messages(self.task_dict[task][split][doc_id])
chat_messages: ChatMessages = ChatMessages(**{"messages": chat_messages_raw})
Expand All @@ -199,16 +193,15 @@ def build_payload_for_index(global_index: int):
payload["response_format"] = {"type": "text"}
payload["max_completion_tokens"] = 5000

return doc_uuid, None, payload
return payload

with ThreadPoolExecutor(max_workers=max_workers) as executor:
while cursor < len(dispatch_order) or in_flight:
while cursor < len(dispatch_order) and len(in_flight) < max(1, current_concurrency):
request_index = dispatch_order[cursor]
doc_uuid, cached_response, payload = build_payload_for_index(request_index)
doc_uuids[request_index] = doc_uuid
if cached_response is not None:
responses[request_index] = GenerationResult(text=cached_response, token_counts=TokenCounts())
payload = build_payload_for_index(request_index)
if payload is None:
responses[request_index] = GenerationResult(text="", token_counts=TokenCounts())
pbar.update(1)
cursor += 1
continue
Expand Down Expand Up @@ -255,19 +248,13 @@ def build_payload_for_index(global_index: int):
if rate_limited:
rate_limited_requests += 1
completed_since_adapt += 1
if self.continual_mode and doc_uuids[local_index] is not None:
self.response_cache[doc_uuids[local_index]] = response_text
totals = get_running_totals()
pbar.set_postfix({"tokens": f"{totals['total_tokens']:,}"}, refresh=False)
pbar.update(1)
maybe_update_concurrency(force=False)

maybe_update_concurrency(force=True)

if self.continual_mode:
with open(self.response_persistent_file, "w") as f:
json.dump(self.response_cache, f)

avg_speed = total_tokens / total_latency if total_latency > 0 else 0
log_metrics(
total_elapsed_time=total_latency,
Expand Down
39 changes: 0 additions & 39 deletions lmms_eval/models/simple/claude.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import os
import time
from copy import deepcopy
Expand Down Expand Up @@ -43,8 +42,6 @@ def __init__(
system_prompt: str = "", # Whether you want some special system prompt here
modality: str = "image",
max_frames_num: int = 10,
continual_mode: bool = False,
response_persistent_folder: str = None,
**kwargs,
) -> None:
super().__init__()
Expand All @@ -53,24 +50,6 @@ def __init__(
self.system_prompt = system_prompt
self.modality = modality
self.max_frames_num = max_frames_num

self.continual_mode = continual_mode
if self.continual_mode:
if response_persistent_folder is None:
raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.")

os.makedirs(response_persistent_folder, exist_ok=True)
self.response_persistent_folder = response_persistent_folder
self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")

if os.path.exists(self.response_persistent_file):
with open(self.response_persistent_file, "r") as f:
self.response_cache = json.load(f)
self.cache_mode = "resume"
else:
self.response_cache = {}
self.cache_mode = "start"

accelerator = Accelerator()
if accelerator.num_processes > 1:
assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
Expand Down Expand Up @@ -174,16 +153,6 @@ def generate_until(self, requests) -> List[GenerationResult]:
res.append(GenerationResult(text="", token_counts=None))
pbar.update(1)
continue
###################### CONTINUAL MODE ######################
if self.continual_mode is True and self.cache_mode == "resume":
doc_uuid = f"{task}___{split}___{doc_id}"
if doc_uuid in self.response_cache:
response_text = self.response_cache[doc_uuid]
if response_text:
res.append(GenerationResult(text=response_text, token_counts=None))
pbar.update(1)
continue

visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
visuals = self.flatten(visuals)
imgs = []
Expand Down Expand Up @@ -271,14 +240,6 @@ def generate_until(self, requests) -> List[GenerationResult]:
res.append(GenerationResult(text=response_text, token_counts=token_counts))
pbar.update(1)

###################### CONTINUAL MODE ######################
if self.continual_mode is True: # Cache the response
response_text = message.content[0].text
doc_uuid = f"{task}___{split}___{doc_id}"
self.response_cache[doc_uuid] = response_text
with open(self.response_persistent_file, "w") as f:
json.dump(self.response_cache, f, indent=4)

pbar.close()

return res
Expand Down
37 changes: 0 additions & 37 deletions lmms_eval/models/simple/gemini_api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import io
import json
import os
import pathlib
import re
Expand Down Expand Up @@ -41,38 +40,17 @@ def __init__(
model_version: str = "gemini-1.5-pro",
# modality: str = "image",
timeout: int = 120,
continual_mode: bool = True,
response_persistent_folder: str = "./logs/gemini_persistent_folder",
interleave: bool = False,
# We will cache the Gemini API response in this path and use it for future requests
**kwargs,
) -> None:
super().__init__()
self.model_version = model_version
self.timeout = timeout
self.model = genai.GenerativeModel(model_version)
self.continual_mode = continual_mode
self.response_persistent_file = ""
self.interleave = interleave
# if self.continual_mode and response_persistent_folder is None:
# raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. Please provide a valid path.")
if self.continual_mode:
self.response_persistent_folder = response_persistent_folder
if not os.path.exists(self.response_persistent_folder):
os.makedirs(self.response_persistent_folder)
self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json")

if os.path.exists(self.response_persistent_file):
with open(self.response_persistent_file, "r") as f:
self.response_cache = json.load(f)
self.cache_mode = "resume"
else:
self.response_cache = {}
self.cache_mode = "start"

accelerator = Accelerator()
if accelerator.num_processes > 1:
assert self.continual_mode is False, "Continual mode is not supported with distributed inference."
assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
self.accelerator = accelerator
if self.accelerator.is_local_main_process:
Expand Down Expand Up @@ -163,15 +141,6 @@ def get_uuid(task, split, doc_id):
res.append(GenerationResult(text="", token_counts=None))
pbar.update(1)
continue
if self.continual_mode and self.cache_mode == "resume":
doc_uuid = get_uuid(task, split, doc_id)
if doc_uuid in self.response_cache:
content = self.response_cache[doc_uuid]
if content:
res.append(GenerationResult(text=content, token_counts=None))
pbar.update(1)
continue

if "max_new_tokens" not in gen_kwargs:
gen_kwargs["max_new_tokens"] = 1024
if "temperature" not in gen_kwargs:
Expand Down Expand Up @@ -239,12 +208,6 @@ def get_uuid(task, split, doc_id):

self.free_video()

if self.continual_mode is True: # Cache the response
doc_uuid = get_uuid(task, split, doc_id)
self.response_cache[doc_uuid] = content
with open(self.response_persistent_file, "w") as f:
json.dump(self.response_cache, f)

pbar.close()
return res

Expand Down
Loading