diff --git a/examples/models/openrouter_ice_smoke.sh b/examples/models/openrouter_ice_smoke.sh new file mode 100755 index 000000000..4ee6e0991 --- /dev/null +++ b/examples/models/openrouter_ice_smoke.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +set -euo pipefail + +export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set}" + +MODEL_VERSION="${MODEL_VERSION:-google/gemini-2.5-flash-image}" +TASKS="${TASKS:-ice_bench}" +LIMIT="${LIMIT:-1}" +OUTPUT_PATH="${OUTPUT_PATH:-./logs/openrouter_ice_smoke}" +IMAGE_OUTPUT_DIR="${IMAGE_OUTPUT_DIR:-./logs/openrouter_ice_images}" +USE_OFFICIAL_ICE_SAMPLE="${USE_OFFICIAL_ICE_SAMPLE:-1}" + +mkdir -p "${OUTPUT_PATH}" "${IMAGE_OUTPUT_DIR}" + +if [[ "${USE_OFFICIAL_ICE_SAMPLE}" == "1" ]]; then +uv run python - <<'PY' +import json +import zipfile +from pathlib import Path + +from huggingface_hub import hf_hub_download + +zip_path = hf_hub_download( + repo_id="ali-vilab/ICE-Bench", + repo_type="dataset", + filename="dataset.zip", + token=False, +) + +target_jsonl = Path("/tmp/ice_bench_smoke.jsonl") +target_dir = Path("/tmp/ice_bench_smoke_data") +target_dir.mkdir(parents=True, exist_ok=True) + +with zipfile.ZipFile(zip_path) as zf: + with zf.open("data/data.jsonl") as fh: + first = json.loads(next(fh)) + + src_rel = first["SourceImage"] + instruction = first["Instruction"] + item_id = first["ItemID"] + + src_out = target_dir / f"{item_id}_src.png" + with zf.open(src_rel) as src_in: + src_out.write_bytes(src_in.read()) + +record = { + "item_id": item_id, + "instruction": instruction, + "source_image": str(src_out), +} +target_jsonl.write_text(json.dumps(record, ensure_ascii=False) + "\n", encoding="utf-8") +print(f"Prepared smoke data at {target_jsonl}") +print(f"Source image at {src_out}") +PY +fi + +echo "[INFO] Running ICE smoke with model=${MODEL_VERSION} tasks=${TASKS}" + +uv run python -m lmms_eval \ + --model openrouter_image_gen \ + --model_args "model_version=${MODEL_VERSION},output_dir=${IMAGE_OUTPUT_DIR},max_new_tokens=4096,image_size=1024x1024" \ + --tasks "${TASKS}" \ + --batch_size 1 \ + --limit "${LIMIT}" \ + --output_path "${OUTPUT_PATH}" \ + --log_samples \ + --verbosity INFO + +echo "[INFO] Done. Generated images in ${IMAGE_OUTPUT_DIR}/ice_bench" diff --git a/examples/models/openrouter_image_smoke.sh b/examples/models/openrouter_image_smoke.sh new file mode 100755 index 000000000..b0a178a55 --- /dev/null +++ b/examples/models/openrouter_image_smoke.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +set -euo pipefail + +export OPENAI_API_KEY="${OPENAI_API_KEY:-${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set}}" +export OPENAI_API_BASE="${OPENAI_API_BASE:-https://openrouter.ai/api/v1}" + +MODEL_VERSION="${MODEL_VERSION:-google/gemini-2.5-flash-image}" +TASKS="${TASKS:-ice_bench}" +LIMIT="${LIMIT:-1}" +OUTPUT_PATH="${OUTPUT_PATH:-./logs/openrouter_image_smoke}" +IMAGE_OUTPUT_DIR="${IMAGE_OUTPUT_DIR:-./logs/openrouter_image_outputs}" + +echo "[INFO] OpenRouter image smoke" +echo "[INFO] model=${MODEL_VERSION} tasks=${TASKS} limit=${LIMIT}" +echo "[INFO] output_path=${OUTPUT_PATH} image_output_dir=${IMAGE_OUTPUT_DIR}" + +uv run python -m lmms_eval \ + --model openrouter_image_gen \ + --model_args "model_version=${MODEL_VERSION},output_dir=${IMAGE_OUTPUT_DIR},max_new_tokens=900,image_size=1024x1024" \ + --tasks "${TASKS}" \ + --batch_size 1 \ + --limit "${LIMIT}" \ + --output_path "${OUTPUT_PATH}" \ + --log_samples \ + --process_with_media \ + --verbosity INFO + +echo "[INFO] Done. Generated images under: ${IMAGE_OUTPUT_DIR}" diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index b46192725..a13d276a5 100644 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -66,6 +66,7 @@ "ola": "Ola", "omnivinci": "OmniVinci", "openai": "OpenAICompatible", + "openrouter_image_gen": "OpenRouterImageGen", "oryx": "Oryx", "phi3v": "Phi3v", "phi4_multimodal": "Phi4", diff --git a/lmms_eval/models/simple/audio_flamingo_3.py b/lmms_eval/models/simple/audio_flamingo_3.py index b2b5a51cd..d1d030673 100644 --- a/lmms_eval/models/simple/audio_flamingo_3.py +++ b/lmms_eval/models/simple/audio_flamingo_3.py @@ -5,10 +5,10 @@ import numpy as np import soundfile as sf import torch +import transformers from accelerate import Accelerator, DistributedType from loguru import logger as eval_logger from tqdm import tqdm -import transformers from transformers import AutoProcessor try: @@ -53,11 +53,7 @@ def __init__( self.device_map = f"cuda:{accelerator.local_process_index}" if AudioFlamingo3ForConditionalGeneration is None: - raise ImportError( - "AudioFlamingo3ForConditionalGeneration is not available in transformers " - f"{transformers.__version__}. Please upgrade transformers/accelerate in this env, e.g. " - "`pip install -U transformers accelerate`." - ) + raise ImportError("AudioFlamingo3ForConditionalGeneration is not available in transformers " f"{transformers.__version__}. Please upgrade transformers/accelerate in this env, e.g. " "`pip install -U transformers accelerate`.") self._model = AudioFlamingo3ForConditionalGeneration.from_pretrained( pretrained, diff --git a/lmms_eval/models/simple/openrouter_image_gen.py b/lmms_eval/models/simple/openrouter_image_gen.py new file mode 100644 index 000000000..48d0ec157 --- /dev/null +++ b/lmms_eval/models/simple/openrouter_image_gen.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +import base64 +import json +import os +import time +from pathlib import Path +from typing import Any, Optional + +import requests as http_requests +from PIL import Image + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + + +@register_model("openrouter_image_gen") +class OpenRouterImageGen(lmms): + is_simple = True + + def __init__( + self, + model_version: str = "openai/gpt-5-image-mini", + output_dir: str = "./logs/openrouter_image_gen", + max_new_tokens: int = 1024, + temperature: Optional[float] = None, + image_size: str = "1024x1024", + max_retries: int = 3, + timeout: int = 180, + **_: Any, + ) -> None: + super().__init__() + self.model_version = model_version + self.output_dir = output_dir + self.max_new_tokens = max_new_tokens + self.temperature = None if temperature is None else float(temperature) + self.image_size = image_size + self.max_retries = max_retries + self.timeout = timeout + + self.api_key = os.getenv("OPENROUTER_API_KEY") + if not self.api_key: + raise EnvironmentError("OPENROUTER_API_KEY is required for openrouter_image_gen") + + self.base_url = "https://openrouter.ai/api/v1/chat/completions" + self.session = http_requests.Session() + self.session.headers.update( + { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + } + ) + + Path(self.output_dir).mkdir(parents=True, exist_ok=True) + + def _encode_image(self, image: Image.Image) -> str: + from io import BytesIO + + buf = BytesIO() + image.convert("RGB").save(buf, format="PNG") + return base64.b64encode(buf.getvalue()).decode("utf-8") + + def _decode_data_url(self, data_url: str) -> bytes: + marker = "base64," + idx = data_url.find(marker) + if idx == -1: + raise ValueError("Image data URL missing base64 payload") + payload = data_url[idx + len(marker) :] + return base64.b64decode(payload) + + def _extract_images(self, payload: dict[str, Any]) -> list[str]: + out: list[str] = [] + try: + images = payload["choices"][0]["message"].get("images", []) + except (KeyError, IndexError, TypeError): + return out + + for item in images: + if not isinstance(item, dict): + continue + image_url = item.get("image_url", {}) + if not isinstance(image_url, dict): + continue + url = image_url.get("url") + if isinstance(url, str) and url.startswith("data:image"): + out.append(url) + return out + + def _request_generation(self, prompt: str, visuals: list[Image.Image]) -> dict[str, Any]: + content: list[dict[str, Any]] = [{"type": "text", "text": prompt}] + for img in visuals: + b64 = self._encode_image(img) + content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}) + + payload: dict[str, Any] = { + "model": self.model_version, + "messages": [{"role": "user", "content": content}], + "modalities": ["text", "image"], + "image": {"size": self.image_size}, + "max_tokens": self.max_new_tokens, + } + if self.temperature is not None: + payload["temperature"] = self.temperature + + for attempt in range(1, self.max_retries + 1): + try: + resp = self.session.post(self.base_url, json=payload, timeout=self.timeout) + resp.raise_for_status() + return resp.json() + except http_requests.HTTPError as exc: + detail = "" + if exc.response is not None: + detail = exc.response.text + if attempt == self.max_retries: + raise RuntimeError(f"OpenRouter HTTPError: {detail}") from exc + time.sleep(min(2 * attempt, 8)) + except Exception: + if attempt == self.max_retries: + raise + time.sleep(min(2 * attempt, 8)) + raise RuntimeError("Unreachable retry loop") + + def _save_images(self, image_data_urls: list[str], task: str, doc_id: int) -> list[str]: + task_dir = Path(self.output_dir) / str(task).replace("/", "_") + task_dir.mkdir(parents=True, exist_ok=True) + + saved_paths: list[str] = [] + for idx, data_url in enumerate(image_data_urls): + raw = self._decode_data_url(data_url) + path = task_dir / f"{doc_id}_{idx}.png" + path.write_bytes(raw) + saved_paths.append(str(path)) + return saved_paths + + def generate_until(self, requests: list[Instance]) -> list[str]: + outputs: list[str] = [] + for req in requests: + args = req.args + if len(args) < 6: + outputs.append(json.dumps({"text": "", "images": []}, ensure_ascii=False)) + continue + ctx, gen_kwargs, doc_to_visual, doc_id, task, split = args[:6] + prompt = str(ctx) + local_gen_kwargs = dict(gen_kwargs or {}) + + visuals_raw = doc_to_visual(self.task_dict[task][split][doc_id]) + visuals: list[Image.Image] = [] + for item in visuals_raw: + if isinstance(item, Image.Image): + visuals.append(item) + + if "max_new_tokens" in local_gen_kwargs: + self.max_new_tokens = int(local_gen_kwargs["max_new_tokens"]) + if "temperature" in local_gen_kwargs: + value = local_gen_kwargs["temperature"] + self.temperature = None if value is None else float(value) + + try: + data = self._request_generation(prompt=prompt, visuals=visuals) + except Exception: + data = self._request_generation(prompt=prompt, visuals=[]) + image_urls = self._extract_images(data) + saved_images = self._save_images(image_urls, task=str(task), doc_id=int(doc_id)) + + text = "" + try: + text = data["choices"][0]["message"].get("content", "") + except (KeyError, IndexError, TypeError): + text = "" + + result = {"text": text, "images": saved_images} + outputs.append(json.dumps(result, ensure_ascii=False)) + self.cache_hook.add_partial("generate_until", (ctx, local_gen_kwargs), outputs[-1]) + + return outputs + + def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: + raise NotImplementedError("openrouter_image_gen does not support loglikelihood") + + def generate_until_multi_round(self, requests: list[Instance]) -> list[str]: + raise NotImplementedError("openrouter_image_gen does not support multi-round generation") diff --git a/lmms_eval/tasks/ami/utils.py b/lmms_eval/tasks/ami/utils.py index 778d0c3e5..2a07e87eb 100644 --- a/lmms_eval/tasks/ami/utils.py +++ b/lmms_eval/tasks/ami/utils.py @@ -1,8 +1,10 @@ import os import re import string + import numpy as np from loguru import logger as eval_logger + from lmms_eval.llm_judge import ServerConfig, get_server API_TYPE = os.getenv("API_TYPE", "openai") @@ -36,23 +38,23 @@ def remove_punctuation_except_apostrophe(text): def ami_doc_to_audio(doc): """Extract audio from AMI dataset document - + AMI dataset uses AudioDecoder type with get_all_samples() method. Returns audio array and sampling rate (16kHz for AMI). """ audio_file = doc.get("audio") - + if not audio_file: eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}") return [] - + try: # AMI uses AudioDecoder type with get_all_samples() method if hasattr(audio_file, "get_all_samples"): decoded_audio = audio_file.get_all_samples() else: decoded_audio = audio_file - + # Extract array - check for data attribute first (AudioSamples object) if hasattr(decoded_audio, "data"): # AudioSamples object from torchcodec @@ -68,13 +70,13 @@ def ami_doc_to_audio(doc): audio_array = decoded_audio.array else: audio_array = decoded_audio - + # Convert torch tensor to numpy if needed if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"): audio_array = audio_array.cpu().numpy() elif hasattr(audio_array, "numpy"): audio_array = audio_array.numpy() - + # Ensure it's a numpy array and flatten if needed if not isinstance(audio_array, np.ndarray): try: @@ -86,27 +88,28 @@ def ami_doc_to_audio(doc): audio_array = np.array(audio_array.tolist()) else: raise - + # Ensure it's 1D array (flatten if multi-channel) if audio_array.ndim > 1: audio_array = audio_array.flatten() - + # Ensure float32 dtype for librosa compatibility if audio_array.dtype != np.float32: audio_array = audio_array.astype(np.float32) - + # Get sampling rate (AMI is 16kHz) sampling_rate = getattr(audio_file, "_desired_sample_rate", 16000) - + eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}") - + return [{"array": audio_array, "sampling_rate": sampling_rate}] - + except Exception as e: eval_logger.error(f"Error extracting audio: {e}") eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}") # Re-raise to help debug import traceback + eval_logger.error(f"Traceback: {traceback.format_exc()}") return [] @@ -115,14 +118,14 @@ def ami_doc_to_text(doc, lmms_eval_specific_kwargs): """Generate prompt for the audio model""" pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + # Get meeting context if needed meeting_id = get_column_value(doc, ["meeting_id"]) speaker_id = get_column_value(doc, ["speaker_id"]) - + # Default prompt for speech recognition default_prompt = "Please transcribe the following audio. Only provide the transcription without any additional explanation or formatting." - + return f"{pre_prompt}{default_prompt}{post_prompt}" @@ -132,35 +135,35 @@ def ami_process_results_asr(doc, results): Calculates Word Error Rate (WER) - case insensitive. """ scores = [] - + # Get ground truth ground_truth = get_column_value(doc, ["text", "transcript", "transcription"]) if not ground_truth: eval_logger.warning("No ground truth text found in document") return {"wer": 1.0} - + # Normalize: strip and lowercase for case-insensitive comparison ground_truth = ground_truth.strip().lower() - + # Remove all punctuation except apostrophe ground_truth = remove_punctuation_except_apostrophe(ground_truth) - + for pred in results: prediction = pred.strip() if isinstance(pred, str) else str(pred) - + # Extract transcription from various formats prediction = extract_transcription(prediction) - + # Normalize: strip and lowercase for case-insensitive comparison prediction = prediction.strip().lower() - + # Remove all punctuation except apostrophe prediction = remove_punctuation_except_apostrophe(prediction) - + # Calculate Word Error Rate wer = calculate_wer(ground_truth, prediction) scores.append(wer) - + avg_wer = sum(scores) / len(scores) if scores else 1.0 return {"wer": avg_wer} @@ -172,9 +175,9 @@ def extract_transcription(text): """ if not isinstance(text, str): return str(text) - + text = text.strip() - + # Pattern 1: XML-style tags for tag in ["", "", "", "", ""]: closing_tag = tag.replace("<", " 0 else 0.0 eval_logger.info(f"AMI evaluation: {correct_count}/{total_count} correct, accuracy: {accuracy:.4f}") - + return accuracy diff --git a/lmms_eval/tasks/cn_college_listen_mcq/utils.py b/lmms_eval/tasks/cn_college_listen_mcq/utils.py index 758c90b63..508eb43d5 100644 --- a/lmms_eval/tasks/cn_college_listen_mcq/utils.py +++ b/lmms_eval/tasks/cn_college_listen_mcq/utils.py @@ -1,8 +1,10 @@ import os import re + import numpy as np from loguru import logger as eval_logger + def get_column_value(doc, candidates): """Helper function to get value from document with multiple possible column names""" for candidate in candidates: @@ -13,22 +15,22 @@ def get_column_value(doc, candidates): def cn_college_mcq_doc_to_audio(doc): """Extract audio from CN College MCQ dataset - + Dataset uses 'context' field for audio. """ audio_file = doc.get("context") - + if not audio_file: eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}") return [] - + try: # Handle AudioDecoder type (like AMI dataset) if str(type(audio_file).__name__) == "AudioDecoder": decoded_audio = audio_file.get_all_samples() - + eval_logger.debug(f"decoded_audio type: {type(decoded_audio)}, type name: {type(decoded_audio).__name__}") - + # Extract array from AudioSamples or similar objects if hasattr(decoded_audio, "data"): audio_array = decoded_audio.data @@ -42,9 +44,9 @@ def cn_college_mcq_doc_to_audio(doc): else: audio_array = decoded_audio eval_logger.debug("Using decoded_audio directly") - + eval_logger.debug(f"audio_array type before conversion: {type(audio_array)}") - + # Convert torch tensor to numpy if needed if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"): audio_array = audio_array.cpu().numpy() @@ -55,7 +57,7 @@ def cn_college_mcq_doc_to_audio(doc): elif hasattr(audio_array, "detach"): audio_array = audio_array.detach().cpu().numpy() eval_logger.debug("Converted from torch tensor (detach)") - + # Ensure it's numpy array if not isinstance(audio_array, np.ndarray): try: @@ -65,17 +67,17 @@ def cn_college_mcq_doc_to_audio(doc): eval_logger.error(f"Failed to convert to numpy array: {e}") eval_logger.error(f"audio_array type: {type(audio_array)}, value: {audio_array}") return [] - + # Flatten if multi-dimensional if audio_array.ndim > 1: audio_array = audio_array.flatten() eval_logger.debug(f"Flattened to shape: {audio_array.shape}") - + # Ensure float32 dtype if audio_array.dtype != np.float32: audio_array = audio_array.astype(np.float32) eval_logger.debug(f"Converted to float32, dtype: {audio_array.dtype}") - + # Get sampling rate sampling_rate = 16000 # default if hasattr(decoded_audio, "sample_rate"): @@ -84,23 +86,23 @@ def cn_college_mcq_doc_to_audio(doc): sampling_rate = decoded_audio.sampling_rate elif hasattr(audio_file, "_desired_sample_rate"): sampling_rate = audio_file._desired_sample_rate - + eval_logger.debug(f"Final audio shape: {audio_array.shape}, sampling_rate: {sampling_rate}") - + return [{"array": audio_array, "sampling_rate": sampling_rate}] - + # Handle dict-like audio (standard HF format) elif isinstance(audio_file, dict): if "array" in audio_file and "sampling_rate" in audio_file: return [audio_file] - + # Handle direct array elif isinstance(audio_file, (list, np.ndarray)): return [{"array": np.array(audio_file, dtype=np.float32), "sampling_rate": 16000}] - + eval_logger.warning(f"Unknown audio type: {type(audio_file)}") return [] - + except Exception as e: eval_logger.error(f"Error extracting audio: {e}") eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}") @@ -111,14 +113,14 @@ def cn_college_mcq_doc_to_text(doc, lmms_eval_specific_kwargs): """Generate prompt for the audio model""" pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + # Get question and choices instruction = doc.get("instruction", "") choices = doc.get("choices", "") - + # Build prompt prompt = f"{instruction}\n\n{choices}" - + return f"{pre_prompt}{prompt}{post_prompt}" @@ -127,19 +129,20 @@ def cn_college_mcq_process_results(doc, results): Process results for Chinese College Listening MCQ task. Extract the predicted answer and compare with ground truth. """ + def normalize(text): """Normalize text for comparison""" if not isinstance(text, str): text = str(text) return text.lower().strip() - + def extract_answer(response): """Extract answer from model response""" if not response: return None - + response = normalize(response) - + patterns = [ r"answer\s+is\s+([ABCD])", r"answer:\s*([ABCD])", @@ -152,55 +155,52 @@ def extract_answer(response): r"^([ABCD])[\.,。]", r"^([ABCD])$", ] - + for pattern in patterns: match = re.search(pattern, response, re.IGNORECASE) if match: return match.group(1).upper() - + # Check if response starts with A/B/C/D for choice in ["A", "B", "C", "D"]: if response.startswith(choice.lower()): return choice - + return None - + # Get ground truth answer and extract the option letter ground_truth_raw = doc.get("answer", "").strip() # Extract option letter from ground truth (e.g., "(A) Find a place." -> "A") ground_truth = extract_answer(ground_truth_raw) - + # If extraction failed, try direct match if not ground_truth: ground_truth = ground_truth_raw.upper() - + # Extract predicted answer from first result pred = results[0] if results else "" predicted_answer = extract_answer(pred) - + # Calculate accuracy correct = 1 if predicted_answer and predicted_answer == ground_truth else 0 - + # Calculate failure rate (unable to extract valid answer) failure = 1 if predicted_answer is None else 0 - + eval_logger.debug(f"Ground truth raw: {ground_truth_raw}, extracted: {ground_truth}, Predicted: {predicted_answer}, Correct: {correct}") - - return { - "accuracy": correct, - "failure_rate": failure - } + + return {"accuracy": correct, "failure_rate": failure} def cn_college_mcq_aggregate_results(results): """Aggregate results across all samples""" if not results: return 0.0 - + total_count = len(results) correct_count = sum(results) accuracy = correct_count / total_count if total_count > 0 else 0.0 - + eval_logger.info(f"CN College MCQ evaluation: {correct_count}/{total_count} correct, accuracy: {accuracy:.4f}") - + return accuracy diff --git a/lmms_eval/tasks/dream_tts_mcq/utils.py b/lmms_eval/tasks/dream_tts_mcq/utils.py index dfeae3abe..7b267aaae 100644 --- a/lmms_eval/tasks/dream_tts_mcq/utils.py +++ b/lmms_eval/tasks/dream_tts_mcq/utils.py @@ -1,8 +1,10 @@ import os import re + import numpy as np from loguru import logger as eval_logger + def get_column_value(doc, candidates): """Helper function to get value from document with multiple possible column names""" for candidate in candidates: @@ -13,22 +15,22 @@ def get_column_value(doc, candidates): def dream_tts_mcq_doc_to_audio(doc): """Extract audio from DREAM TTS MCQ dataset - + Dataset uses 'context' field for audio. """ audio_file = doc.get("context") - + if not audio_file: eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}") return [] - + try: # Handle AudioDecoder type (like AMI dataset) if str(type(audio_file).__name__) == "AudioDecoder": decoded_audio = audio_file.get_all_samples() - + eval_logger.debug(f"decoded_audio type: {type(decoded_audio)}, type name: {type(decoded_audio).__name__}") - + # Extract array from AudioSamples or similar objects if hasattr(decoded_audio, "data"): audio_array = decoded_audio.data @@ -42,9 +44,9 @@ def dream_tts_mcq_doc_to_audio(doc): else: audio_array = decoded_audio eval_logger.debug("Using decoded_audio directly") - + eval_logger.debug(f"audio_array type before conversion: {type(audio_array)}") - + # Convert torch tensor to numpy if needed if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"): audio_array = audio_array.cpu().numpy() @@ -55,7 +57,7 @@ def dream_tts_mcq_doc_to_audio(doc): elif hasattr(audio_array, "detach"): audio_array = audio_array.detach().cpu().numpy() eval_logger.debug("Converted from torch tensor (detach)") - + # Ensure it's numpy array if not isinstance(audio_array, np.ndarray): try: @@ -65,17 +67,17 @@ def dream_tts_mcq_doc_to_audio(doc): eval_logger.error(f"Failed to convert to numpy array: {e}") eval_logger.error(f"audio_array type: {type(audio_array)}, value: {audio_array}") return [] - + # Flatten if multi-dimensional if audio_array.ndim > 1: audio_array = audio_array.flatten() eval_logger.debug(f"Flattened to shape: {audio_array.shape}") - + # Ensure float32 dtype if audio_array.dtype != np.float32: audio_array = audio_array.astype(np.float32) eval_logger.debug(f"Converted to float32, dtype: {audio_array.dtype}") - + # Get sampling rate sampling_rate = 16000 # default if hasattr(decoded_audio, "sample_rate"): @@ -84,23 +86,23 @@ def dream_tts_mcq_doc_to_audio(doc): sampling_rate = decoded_audio.sampling_rate elif hasattr(audio_file, "_desired_sample_rate"): sampling_rate = audio_file._desired_sample_rate - + eval_logger.debug(f"Final audio shape: {audio_array.shape}, sampling_rate: {sampling_rate}") - + return [{"array": audio_array, "sampling_rate": sampling_rate}] - + # Handle dict-like audio (standard HF format) elif isinstance(audio_file, dict): if "array" in audio_file and "sampling_rate" in audio_file: return [audio_file] - + # Handle direct array elif isinstance(audio_file, (list, np.ndarray)): return [{"array": np.array(audio_file, dtype=np.float32), "sampling_rate": 16000}] - + eval_logger.warning(f"Unknown audio type: {type(audio_file)}") return [] - + except Exception as e: eval_logger.error(f"Error extracting audio: {e}") eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}") @@ -111,14 +113,14 @@ def dream_tts_mcq_doc_to_text(doc, lmms_eval_specific_kwargs): """Generate prompt for the audio model""" pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + # Get question and choices instruction = doc.get("instruction", "") choices = doc.get("choices", "") - + # Build prompt prompt = f"{instruction}\n\n{choices}" - + return f"{pre_prompt}{prompt}{post_prompt}" @@ -127,19 +129,20 @@ def dream_tts_mcq_process_results(doc, results): Process results for DREAM TTS MCQ task. Extract the predicted answer and compare with ground truth. """ + def normalize(text): """Normalize text for comparison""" if not isinstance(text, str): text = str(text) return text.lower().strip() - + def extract_answer(response): """Extract answer from model response""" if not response: return None - + response = normalize(response) - + patterns = [ r"answer\s+is\s+([ABCD])", r"answer:\s*([ABCD])", @@ -152,55 +155,52 @@ def extract_answer(response): r"^([ABCD])[\.,。]", r"^([ABCD])$", ] - + for pattern in patterns: match = re.search(pattern, response, re.IGNORECASE) if match: return match.group(1).upper() - + # Check if response starts with A/B/C/D for choice in ["A", "B", "C", "D"]: if response.startswith(choice.lower()): return choice - + return None - + # Get ground truth answer and extract the option letter ground_truth_raw = doc.get("answer", "").strip() # Extract option letter from ground truth (e.g., "(A) Find a place." -> "A") ground_truth = extract_answer(ground_truth_raw) - + # If extraction failed, try direct match if not ground_truth: ground_truth = ground_truth_raw.upper() - + # Extract predicted answer from first result pred = results[0] if results else "" predicted_answer = extract_answer(pred) - + # Calculate accuracy correct = 1 if predicted_answer and predicted_answer == ground_truth else 0 - + # Calculate failure rate (unable to extract valid answer) failure = 1 if predicted_answer is None else 0 - + eval_logger.debug(f"Ground truth raw: {ground_truth_raw}, extracted: {ground_truth}, Predicted: {predicted_answer}, Correct: {correct}") - - return { - "accuracy": correct, - "failure_rate": failure - } + + return {"accuracy": correct, "failure_rate": failure} def dream_tts_mcq_aggregate_results(results): """Aggregate results across all samples""" if not results: return 0.0 - + total_count = len(results) correct_count = sum(results) accuracy = correct_count / total_count if total_count > 0 else 0.0 - + eval_logger.info(f"DREAM TTS MCQ evaluation: {correct_count}/{total_count} correct, accuracy: {accuracy:.4f}") - + return accuracy diff --git a/lmms_eval/tasks/europal_asr/utils.py b/lmms_eval/tasks/europal_asr/utils.py index 8fab52eb2..b8d74607e 100644 --- a/lmms_eval/tasks/europal_asr/utils.py +++ b/lmms_eval/tasks/europal_asr/utils.py @@ -1,5 +1,6 @@ import os import re + import numpy as np from loguru import logger as eval_logger @@ -14,22 +15,22 @@ def get_column_value(doc, candidates): def europal_asr_doc_to_audio(doc): """Extract audio from europal-asr dataset document - + Returns audio array and sampling rate (16kHz for europal-asr). """ audio_file = doc.get("audio") - + if not audio_file: eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}") return [] - + try: # Handle AudioDecoder type with get_all_samples() method if hasattr(audio_file, "get_all_samples"): decoded_audio = audio_file.get_all_samples() else: decoded_audio = audio_file - + # Extract array - check for data attribute first (AudioSamples object) if hasattr(decoded_audio, "data"): # AudioSamples object from torchcodec @@ -45,13 +46,13 @@ def europal_asr_doc_to_audio(doc): audio_array = decoded_audio.array else: audio_array = decoded_audio - + # Convert torch tensor to numpy if needed if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"): audio_array = audio_array.cpu().numpy() elif hasattr(audio_array, "numpy"): audio_array = audio_array.numpy() - + # Ensure it's a numpy array and flatten if needed if not isinstance(audio_array, np.ndarray): try: @@ -63,22 +64,22 @@ def europal_asr_doc_to_audio(doc): audio_array = np.array(audio_array.tolist()) else: raise - + # Ensure it's 1D array (flatten if multi-channel) if audio_array.ndim > 1: audio_array = audio_array.flatten() - + # Ensure float32 dtype for librosa compatibility if audio_array.dtype != np.float32: audio_array = audio_array.astype(np.float32) - + # Get sampling rate (europal-asr is 16kHz) sampling_rate = getattr(audio_file, "_desired_sample_rate", 16000) - + eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}") - + return [{"array": audio_array, "sampling_rate": sampling_rate}] - + except Exception as e: eval_logger.error(f"Error extracting audio: {e}") eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}") @@ -89,10 +90,10 @@ def europal_asr_doc_to_text(doc, lmms_eval_specific_kwargs): """Generate prompt for the audio model""" pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + # Default prompt for speech recognition default_prompt = "Please transcribe the following audio. Only provide the transcription without any additional explanation or formatting." - + return f"{pre_prompt}{default_prompt}{post_prompt}" @@ -102,27 +103,27 @@ def europal_asr_process_results_asr(doc, results): Calculates Word Error Rate (WER) using a simple implementation. """ scores = [] - + # Get ground truth ground_truth = get_column_value(doc, ["text_verbatim", "transcript", "transcription"]) if not ground_truth: eval_logger.warning("No ground truth text found in document") return {"wer": 1.0} - + ground_truth = ground_truth.strip().upper() - + for pred in results: prediction = pred.strip() if isinstance(pred, str) else str(pred) prediction = prediction.strip() - + # Extract transcription from various formats prediction = extract_transcription(prediction) prediction = prediction.upper() - + # Calculate Word Error Rate wer = calculate_wer(ground_truth, prediction) scores.append(wer) - + avg_wer = sum(scores) / len(scores) if scores else 1.0 return {"wer": avg_wer} @@ -134,9 +135,9 @@ def extract_transcription(text): """ if not isinstance(text, str): return str(text) - + text = text.strip() - + # Pattern 1: XML-style tags for tag in ["", "", "", "", ""]: closing_tag = tag.replace("<", " list[Image.Image]: + src = doc.get("source_image", "") + if isinstance(src, str) and src and os.path.exists(src): + return [Image.open(src).convert("RGB")] + return [] + + +def ice_doc_to_text(doc: dict[str, Any], lmms_eval_specific_kwargs: dict[str, Any] | None = None) -> str: + instruction = str(doc.get("instruction", "")).strip() + if lmms_eval_specific_kwargs: + pre_prompt = str(lmms_eval_specific_kwargs.get("pre_prompt", "")) + post_prompt = str(lmms_eval_specific_kwargs.get("post_prompt", "")) + return f"{pre_prompt}{instruction}{post_prompt}" + return instruction + + +def ice_doc_to_target(doc: dict[str, Any]) -> str: + return str(doc.get("instruction", "")) + + +def ice_process_results(doc: dict[str, Any], results: list[str]) -> dict[str, float]: + if not results: + return {"artifact_saved": 0.0} + + raw = results[0] + try: + parsed = json.loads(raw) + except (json.JSONDecodeError, TypeError): + return {"artifact_saved": 0.0} + + images = parsed.get("images", []) if isinstance(parsed, dict) else [] + if not isinstance(images, list) or not images: + return {"artifact_saved": 0.0} + + first = images[0] + if isinstance(first, str) and os.path.exists(first): + return {"artifact_saved": 1.0} + return {"artifact_saved": 0.0} diff --git a/lmms_eval/tasks/song_describer/utils.py b/lmms_eval/tasks/song_describer/utils.py index 4ac206141..41726827a 100644 --- a/lmms_eval/tasks/song_describer/utils.py +++ b/lmms_eval/tasks/song_describer/utils.py @@ -20,22 +20,22 @@ def get_column_value(doc, candidates): def song_describer_doc_to_audio(doc): """Extract audio from song-describer dataset document - + Returns audio array and sampling rate (16kHz for song-describer). """ audio_file = doc.get("audio_path") or doc.get("audio") - + if not audio_file: eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}") return [] - + try: # Handle different audio formats if hasattr(audio_file, "get_all_samples"): decoded_audio = audio_file.get_all_samples() else: decoded_audio = audio_file - + # Extract array if hasattr(decoded_audio, "data"): audio_array = decoded_audio.data @@ -49,13 +49,13 @@ def song_describer_doc_to_audio(doc): audio_array = temp else: audio_array = decoded_audio - + # Convert torch tensor to numpy if needed if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"): audio_array = audio_array.cpu().numpy() elif hasattr(audio_array, "numpy"): audio_array = audio_array.numpy() - + # Ensure it's a numpy array if not isinstance(audio_array, np.ndarray): try: @@ -66,22 +66,22 @@ def song_describer_doc_to_audio(doc): audio_array = np.array(audio_array.tolist()) else: raise - + # Ensure it's 1D array (flatten if multi-channel) if audio_array.ndim > 1: audio_array = audio_array.flatten() - + # Ensure float32 dtype if audio_array.dtype != np.float32: audio_array = audio_array.astype(np.float32) - + # Get sampling rate (song-describer is 16kHz) sampling_rate = getattr(audio_file, "_desired_sample_rate", 16000) - + eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}") - + return [{"array": audio_array, "sampling_rate": sampling_rate}] - + except Exception as e: eval_logger.error(f"Error extracting audio: {e}") eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}") @@ -92,10 +92,10 @@ def song_describer_doc_to_text(doc, lmms_eval_specific_kwargs): """Generate prompt for music captioning""" pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + # Default prompt for music captioning default_prompt = "Listen to this music and describe what you hear. Include details about the instruments, genre, mood, and any distinctive musical elements." - + return f"{pre_prompt}{default_prompt}{post_prompt}" @@ -122,20 +122,16 @@ def song_describer_doc_to_text(doc, lmms_eval_specific_kwargs): def get_eval_model(): """Lazy load evaluation model""" global _eval_model, _eval_tokenizer - + if _eval_model is None: eval_logger.info(f"Loading evaluation model: {EVAL_MODEL_NAME}") _eval_tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL_NAME, trust_remote_code=True) - _eval_model = AutoModelForCausalLM.from_pretrained( - EVAL_MODEL_NAME, - torch_dtype=torch.bfloat16, - device_map="auto", - trust_remote_code=True - ).eval() + _eval_model = AutoModelForCausalLM.from_pretrained(EVAL_MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True).eval() eval_logger.info("Evaluation model loaded successfully") - + return _eval_model, _eval_tokenizer + eval_prompt = """ [Music Description Task] Reference Description: {ground_truth} @@ -169,23 +165,19 @@ def get_eval_model(): def get_eval(max_tokens: int, content: str): """Call local Qwen model for evaluation""" model, tokenizer = get_eval_model() - + messages = [ {"role": "system", "content": "You are a professional music critic and evaluator. Provide objective and detailed assessments."}, {"role": "user", "content": content}, ] - + try: # Format messages using chat template - text = tokenizer.apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True - ) - + text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + # Tokenize and generate model_inputs = tokenizer([text], return_tensors="pt").to(model.device) - + with torch.no_grad(): generated_ids = model.generate( **model_inputs, @@ -194,20 +186,18 @@ def get_eval(max_tokens: int, content: str): do_sample=True, top_p=0.9, ) - + # Decode only the generated part (excluding input) - generated_ids = [ - output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) - ] - + generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] + response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() - + if response: return response, EVAL_MODEL_NAME - + eval_logger.warning("Empty response from evaluation model") return "", "" - + except Exception as e: eval_logger.error(f"Error during evaluation: {e}") return "", "" @@ -217,18 +207,15 @@ def song_describer_process_results(doc, results): """Process results for song-describer captioning task""" pred = results[0] ground_truth_str = get_column_value(doc, ["text", "caption", "description"]) - + if not ground_truth_str: eval_logger.warning("No ground truth text found in document") return {"eval_score": {"eval_answer": "", "model_name": ""}} - - content = eval_prompt.format( - model_response=pred, - ground_truth=ground_truth_str - ) - + + content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str) + eval_answer, model_name = get_eval(max_tokens=1024, content=content) - + return { "eval_score": {"eval_answer": eval_answer, "model_name": model_name}, } @@ -238,10 +225,10 @@ def song_describer_aggregate_results(results): """Aggregate evaluation scores from local model""" score = 0 valid_count = 0 - + for result in results: eval_answer = result["eval_answer"] - + if not eval_answer: continue @@ -258,5 +245,5 @@ def song_describer_aggregate_results(results): if valid_count == 0: eval_logger.error("No valid evaluation scores found") return 0.0 - + return score / valid_count diff --git a/lmms_eval/tasks/voxpopuli/utils.py b/lmms_eval/tasks/voxpopuli/utils.py index b5ae95ba4..f7712bb36 100644 --- a/lmms_eval/tasks/voxpopuli/utils.py +++ b/lmms_eval/tasks/voxpopuli/utils.py @@ -1,15 +1,12 @@ import os import re -import numpy as np from collections import defaultdict + +import numpy as np from loguru import logger as eval_logger # Language code mapping (voxpopuli uses integer codes) -LANGUAGE_MAP = { - 0: "en", 1: "de", 2: "fr", 3: "es", 4: "pl", 5: "it", - 6: "ro", 7: "hu", 8: "cs", 9: "nl", 10: "fi", 11: "hr", - 12: "sk", 13: "sl", 14: "et", 15: "lt" -} +LANGUAGE_MAP = {0: "en", 1: "de", 2: "fr", 3: "es", 4: "pl", 5: "it", 6: "ro", 7: "hu", 8: "cs", 9: "nl", 10: "fi", 11: "hr", 12: "sk", 13: "sl", 14: "et", 15: "lt"} def _fallback_silent_audio(sampling_rate: int = 16000): @@ -26,7 +23,7 @@ def get_column_value(doc, candidates): def voxpopuli_doc_to_audio(doc): """Extract audio from VoxPopuli dataset document - + VoxPopuli dataset structure: { 'audio': { @@ -38,11 +35,11 @@ def voxpopuli_doc_to_audio(doc): } """ audio_data = doc.get("audio") - + if not audio_data: eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}") return _fallback_silent_audio() - + try: # VoxPopuli audio is already a dict with 'array' and 'sampling_rate' if isinstance(audio_data, dict): @@ -51,7 +48,7 @@ def voxpopuli_doc_to_audio(doc): else: # If it's an AudioDecoder object decoded_audio = audio_data.get_all_samples() if hasattr(audio_data, "get_all_samples") else audio_data - + # Extract array - check for data attribute first (AudioSamples object from torchcodec) if hasattr(decoded_audio, "data"): # AudioSamples object from torchcodec @@ -67,19 +64,19 @@ def voxpopuli_doc_to_audio(doc): audio_array = decoded_audio.array else: audio_array = decoded_audio - + sampling_rate = getattr(audio_data, "_desired_sample_rate", 16000) - + # Convert torch tensor to numpy if needed if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"): audio_array = audio_array.cpu().numpy() elif hasattr(audio_array, "numpy"): audio_array = audio_array.numpy() - + # Ensure it's a numpy array if not isinstance(audio_array, np.ndarray): audio_array = np.array(audio_array) - + # Ensure it's 1D array (flatten if multi-channel) if audio_array.ndim > 1: audio_array = audio_array.flatten() @@ -87,15 +84,15 @@ def voxpopuli_doc_to_audio(doc): if audio_array.size == 0: eval_logger.warning("Decoded audio is empty, using 1s silent fallback audio") return _fallback_silent_audio(int(sampling_rate) if sampling_rate else 16000) - + # Ensure float32 dtype if audio_array.dtype != np.float32: audio_array = audio_array.astype(np.float32) - + eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}") - + return [{"array": audio_array, "sampling_rate": sampling_rate}] - + except Exception as e: eval_logger.error(f"Error extracting audio: {e}") eval_logger.error(f"Audio type: {type(audio_data)}, attributes: {dir(audio_data)}") @@ -106,14 +103,14 @@ def voxpopuli_doc_to_text(doc, lmms_eval_specific_kwargs): """Generate prompt for the audio model""" pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") - + # Get language information language_code = doc.get("language", 0) language = LANGUAGE_MAP.get(language_code, "unknown") - + # Default prompt for speech recognition default_prompt = f"Please transcribe the following audio in {language}." - + return f"{pre_prompt}{default_prompt}{post_prompt}" @@ -123,38 +120,34 @@ def voxpopuli_process_results_asr(doc, results): Calculates Word Error Rate (WER) and stores language information for aggregation. """ scores = [] - + # Get ground truth ground_truth = get_column_value(doc, ["normalized_text", "text", "transcript"]) if not ground_truth: eval_logger.warning("No ground truth text found in document") return {"wer": 1.0, "language": doc.get("language", 0)} - + ground_truth = ground_truth.strip().lower() # VoxPopuli uses lowercase normalized text - + # Get language for later aggregation language_code = doc.get("language", 0) - + for pred in results: prediction = pred.strip() if isinstance(pred, str) else str(pred) prediction = prediction.strip() - + # Extract transcription from various formats prediction = extract_transcription(prediction) prediction = prediction.lower() - + # Calculate Word Error Rate wer = calculate_wer(ground_truth, prediction) scores.append(wer) - + avg_wer = sum(scores) / len(scores) if scores else 1.0 - + # Return both WER and language for aggregation - return { - "wer": avg_wer, - "language": language_code, - "wer_by_language": {language_code: avg_wer} - } + return {"wer": avg_wer, "language": language_code, "wer_by_language": {language_code: avg_wer}} def extract_transcription(text): @@ -164,9 +157,9 @@ def extract_transcription(text): """ if not isinstance(text, str): return str(text) - + text = text.strip() - + # Pattern 1: XML-style tags for tag in ["", "", "", "", ""]: closing_tag = tag.replace("<", "