diff --git a/examples/models/openrouter_ice_smoke.sh b/examples/models/openrouter_ice_smoke.sh
new file mode 100755
index 000000000..4ee6e0991
--- /dev/null
+++ b/examples/models/openrouter_ice_smoke.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+#
+# Smoke test: runs lmms_eval with the openrouter_image_gen model (default task: ice_bench).
+#
+# Required environment:
+#   OPENROUTER_API_KEY       must be set and non-empty; exported for child processes
+# Optional environment (default in parentheses):
+#   MODEL_VERSION            model_version model arg (google/gemini-2.5-flash-image)
+#   TASKS                    value passed to --tasks (ice_bench)
+#   LIMIT                    value passed to --limit (1)
+#   OUTPUT_PATH              value passed to --output_path (./logs/openrouter_ice_smoke)
+#   IMAGE_OUTPUT_DIR         output_dir model arg (./logs/openrouter_ice_images)
+#   USE_OFFICIAL_ICE_SAMPLE  "1" stages one official ICE-Bench sample under /tmp (1)
+
+set -euo pipefail
+
+export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set}"
+
+MODEL_VERSION="${MODEL_VERSION:-google/gemini-2.5-flash-image}"
+TASKS="${TASKS:-ice_bench}"
+LIMIT="${LIMIT:-1}"
+OUTPUT_PATH="${OUTPUT_PATH:-./logs/openrouter_ice_smoke}"
+IMAGE_OUTPUT_DIR="${IMAGE_OUTPUT_DIR:-./logs/openrouter_ice_images}"
+USE_OFFICIAL_ICE_SAMPLE="${USE_OFFICIAL_ICE_SAMPLE:-1}"
+
+mkdir -p -- "${OUTPUT_PATH}" "${IMAGE_OUTPUT_DIR}"
+
+# When enabled, download the official ICE-Bench archive and stage its first record
+# (JSONL entry plus source image) under fixed /tmp paths.
+# NOTE(review): presumably the ice_bench task reads these /tmp paths; confirm before relocating them.
+if [[ "${USE_OFFICIAL_ICE_SAMPLE}" == "1" ]]; then
+  uv run python - <<'PY'
+import json
+import zipfile
+from pathlib import Path
+
+from huggingface_hub import hf_hub_download
+
+zip_path = hf_hub_download(
+    repo_id="ali-vilab/ICE-Bench",
+    repo_type="dataset",
+    filename="dataset.zip",
+    token=False,
+)
+
+target_jsonl = Path("/tmp/ice_bench_smoke.jsonl")
+target_dir = Path("/tmp/ice_bench_smoke_data")
+target_dir.mkdir(parents=True, exist_ok=True)
+
+with zipfile.ZipFile(zip_path) as zf:
+    with zf.open("data/data.jsonl") as fh:
+        first = json.loads(next(fh))
+
+    src_rel = first["SourceImage"]
+    instruction = first["Instruction"]
+    item_id = first["ItemID"]
+
+    src_out = target_dir / f"{item_id}_src.png"
+    with zf.open(src_rel) as src_in:
+        src_out.write_bytes(src_in.read())
+
+record = {
+    "item_id": item_id,
+    "instruction": instruction,
+    "source_image": str(src_out),
+}
+target_jsonl.write_text(json.dumps(record, ensure_ascii=False) + "\n", encoding="utf-8")
+print(f"Prepared smoke data at {target_jsonl}")
+print(f"Source image at {src_out}")
+PY
+fi
+
+echo "[INFO] Running ICE smoke with model=${MODEL_VERSION} tasks=${TASKS}"
+
+uv run python -m lmms_eval \
+  --model openrouter_image_gen \
+  --model_args "model_version=${MODEL_VERSION},output_dir=${IMAGE_OUTPUT_DIR},max_new_tokens=4096,image_size=1024x1024" \
+  --tasks "${TASKS}" \
+  --batch_size 1 \
+  --limit "${LIMIT}" \
+  --output_path "${OUTPUT_PATH}" \
+  --log_samples \
+  --verbosity INFO
+
+echo "[INFO] Done. Generated images under: ${IMAGE_OUTPUT_DIR}"
diff --git a/examples/models/openrouter_image_smoke.sh b/examples/models/openrouter_image_smoke.sh
new file mode 100755
index 000000000..b0a178a55
--- /dev/null
+++ b/examples/models/openrouter_image_smoke.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+#
+# Smoke test: runs lmms_eval with the openrouter_image_gen model and --process_with_media.
+#
+# Required environment:
+#   OPENROUTER_API_KEY  must be set and non-empty (the model reads this variable)
+# Optional environment: MODEL_VERSION, TASKS, LIMIT, OUTPUT_PATH, IMAGE_OUTPUT_DIR,
+#   OPENAI_API_KEY, OPENAI_API_BASE (defaults are assigned below)
+
+set -euo pipefail
+
+# openrouter_image_gen authenticates with OPENROUTER_API_KEY only, so require and
+# export it here; a lone OPENAI_API_KEY would pass a looser check and fail later in Python.
+export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set}"
+# OpenAI-style variables mirror the OpenRouter settings
+# (presumably for OpenAI-compatible components — TODO confirm they are needed).
+export OPENAI_API_KEY="${OPENAI_API_KEY:-${OPENROUTER_API_KEY}}"
+export OPENAI_API_BASE="${OPENAI_API_BASE:-https://openrouter.ai/api/v1}"
+
+MODEL_VERSION="${MODEL_VERSION:-google/gemini-2.5-flash-image}"
+TASKS="${TASKS:-ice_bench}"
+LIMIT="${LIMIT:-1}"
+OUTPUT_PATH="${OUTPUT_PATH:-./logs/openrouter_image_smoke}"
+IMAGE_OUTPUT_DIR="${IMAGE_OUTPUT_DIR:-./logs/openrouter_image_outputs}"
+
+mkdir -p -- "${OUTPUT_PATH}" "${IMAGE_OUTPUT_DIR}"
+
+echo "[INFO] OpenRouter image smoke"
+echo "[INFO] model=${MODEL_VERSION} tasks=${TASKS} limit=${LIMIT}"
+echo "[INFO] output_path=${OUTPUT_PATH} image_output_dir=${IMAGE_OUTPUT_DIR}"
+
+uv run python -m lmms_eval \
+  --model openrouter_image_gen \
+  --model_args "model_version=${MODEL_VERSION},output_dir=${IMAGE_OUTPUT_DIR},max_new_tokens=900,image_size=1024x1024" \
+  --tasks "${TASKS}" \
+  --batch_size 1 \
+  --limit "${LIMIT}" \
+  --output_path "${OUTPUT_PATH}" \
+  --log_samples \
+  --process_with_media \
+  --verbosity INFO
+
+echo "[INFO] Done. Generated images under: ${IMAGE_OUTPUT_DIR}"
diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
index b46192725..a13d276a5 100644
--- a/lmms_eval/models/__init__.py
+++ b/lmms_eval/models/__init__.py
@@ -66,6 +66,7 @@
     "ola": "Ola",
     "omnivinci": "OmniVinci",
     "openai": "OpenAICompatible",
+    "openrouter_image_gen": "OpenRouterImageGen",
     "oryx": "Oryx",
     "phi3v": "Phi3v",
     "phi4_multimodal": "Phi4",
diff --git a/lmms_eval/models/simple/audio_flamingo_3.py b/lmms_eval/models/simple/audio_flamingo_3.py
index b2b5a51cd..d1d030673 100644
--- a/lmms_eval/models/simple/audio_flamingo_3.py
+++ b/lmms_eval/models/simple/audio_flamingo_3.py
@@ -5,10 +5,10 @@
 import numpy as np
 import soundfile as sf
 import torch
+import transformers
 from accelerate import Accelerator, DistributedType
 from loguru import logger as eval_logger
 from tqdm import tqdm
-import transformers
 from transformers import AutoProcessor
 
 try:
@@ -53,11 +53,7 @@ def __init__(
             self.device_map = f"cuda:{accelerator.local_process_index}"
 
         if AudioFlamingo3ForConditionalGeneration is None:
-            raise ImportError(
-                "AudioFlamingo3ForConditionalGeneration is not available in transformers "
-                f"{transformers.__version__}. Please upgrade transformers/accelerate in this env, e.g. "
-                "`pip install -U transformers accelerate`."
-            )
+            raise ImportError("AudioFlamingo3ForConditionalGeneration is not available in transformers " f"{transformers.__version__}. Please upgrade transformers/accelerate in this env, e.g. " "`pip install -U transformers accelerate`.")
 
         self._model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
             pretrained,
diff --git a/lmms_eval/models/simple/openrouter_image_gen.py b/lmms_eval/models/simple/openrouter_image_gen.py
new file mode 100644
index 000000000..48d0ec157
--- /dev/null
+++ b/lmms_eval/models/simple/openrouter_image_gen.py
@@ -0,0 +1,262 @@
+from __future__ import annotations
+
+import base64
+import json
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Any, Optional
+
+import requests as http_requests
+from PIL import Image
+
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+
+logger = logging.getLogger(__name__)
+
+# Sentinel meaning "no per-request override; use the value stored on the instance".
+_INSTANCE_DEFAULT: Any = object()
+
+# 4xx statuses still worth retrying (request timeout, rate limiting).
+_RETRYABLE_CLIENT_STATUSES = frozenset({408, 429})
+
+
+@register_model("openrouter_image_gen")
+class OpenRouterImageGen(lmms):
+    """Image generation through the OpenRouter chat-completions endpoint.
+
+    Every request posts the prompt plus any PIL images from the task's
+    ``doc_to_visual``; returned ``data:image`` URLs are written to
+    ``output_dir/<task>/`` and reported as a JSON string
+    ``{"text": ..., "images": [saved paths]}``.
+    """
+
+    is_simple = True
+
+    def __init__(
+        self,
+        model_version: str = "openai/gpt-5-image-mini",
+        output_dir: str = "./logs/openrouter_image_gen",
+        max_new_tokens: int = 1024,
+        temperature: Optional[float] = None,
+        image_size: str = "1024x1024",
+        max_retries: int = 3,
+        timeout: int = 180,
+        **_: Any,
+    ) -> None:
+        """Store generation settings and open an authenticated HTTP session.
+
+        Args:
+            model_version: value sent as ``model`` in the request payload.
+            output_dir: directory created here; images are saved beneath it.
+            max_new_tokens: default ``max_tokens`` for requests.
+            temperature: default sampling temperature; ``None`` omits the field.
+            image_size: value sent as ``image.size`` in the request payload.
+            max_retries: attempts per request (values below 1 are treated as 1).
+            timeout: timeout passed to ``requests`` for each POST.
+            **_: extra model args are accepted and ignored.
+
+        Raises:
+            EnvironmentError: if ``OPENROUTER_API_KEY`` is unset or empty.
+        """
+        super().__init__()
+        self.model_version = model_version
+        self.output_dir = output_dir
+        self.max_new_tokens = int(max_new_tokens)
+        self.temperature = None if temperature is None else float(temperature)
+        self.image_size = image_size
+        # Guarantee at least one attempt so the retry loop always sends a request.
+        self.max_retries = max(1, int(max_retries))
+        self.timeout = timeout
+
+        self.api_key = os.getenv("OPENROUTER_API_KEY")
+        if not self.api_key:
+            raise EnvironmentError("OPENROUTER_API_KEY is required for openrouter_image_gen")
+
+        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
+        self.session = http_requests.Session()
+        self.session.headers.update(
+            {
+                "Authorization": f"Bearer {self.api_key}",
+                "Content-Type": "application/json",
+            }
+        )
+
+        Path(self.output_dir).mkdir(parents=True, exist_ok=True)
+
+    def _encode_image(self, image: Image.Image) -> str:
+        """Return ``image`` re-encoded as an RGB PNG, base64-encoded to text."""
+        from io import BytesIO
+
+        buf = BytesIO()
+        image.convert("RGB").save(buf, format="PNG")
+        return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+    def _decode_data_url(self, data_url: str) -> bytes:
+        """Decode the base64 payload of a data URL; raise ``ValueError`` if absent."""
+        marker = "base64,"
+        idx = data_url.find(marker)
+        if idx == -1:
+            raise ValueError("Image data URL missing base64 payload")
+        payload = data_url[idx + len(marker) :]
+        return base64.b64decode(payload)
+
+    def _extract_images(self, payload: dict[str, Any]) -> list[str]:
+        """Collect ``data:image`` URLs from the first choice's ``message.images``.
+
+        Malformed or missing structures (including ``"images": null``) yield an
+        empty list instead of raising.
+        """
+        out: list[str] = []
+        try:
+            images = payload["choices"][0]["message"].get("images") or []
+        except (KeyError, IndexError, TypeError, AttributeError):
+            return out
+        if not isinstance(images, list):
+            return out
+
+        for item in images:
+            if not isinstance(item, dict):
+                continue
+            image_url = item.get("image_url", {})
+            if not isinstance(image_url, dict):
+                continue
+            url = image_url.get("url")
+            if isinstance(url, str) and url.startswith("data:image"):
+                out.append(url)
+        return out
+
+    def _request_generation(
+        self,
+        prompt: str,
+        visuals: list[Image.Image],
+        max_new_tokens: Optional[int] = None,
+        temperature: Any = _INSTANCE_DEFAULT,
+    ) -> dict[str, Any]:
+        """POST one generation request and return the decoded JSON response.
+
+        Args:
+            prompt: text part of the user message.
+            visuals: images appended to the message as PNG data URLs.
+            max_new_tokens: per-request ``max_tokens``; ``None`` uses the instance value.
+            temperature: per-request temperature; omitted argument uses the
+                instance value, ``None`` leaves the field out of the payload.
+
+        Raises:
+            RuntimeError: on an HTTP error status once retries are exhausted, or
+                immediately for a 4xx status other than 408/429.
+            Exception: any other failure from the final attempt is re-raised as is.
+        """
+        if max_new_tokens is None:
+            max_new_tokens = self.max_new_tokens
+        if temperature is _INSTANCE_DEFAULT:
+            temperature = self.temperature
+
+        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
+        for img in visuals:
+            b64 = self._encode_image(img)
+            content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}})
+
+        payload: dict[str, Any] = {
+            "model": self.model_version,
+            "messages": [{"role": "user", "content": content}],
+            "modalities": ["text", "image"],
+            "image": {"size": self.image_size},
+            "max_tokens": max_new_tokens,
+        }
+        if temperature is not None:
+            payload["temperature"] = temperature
+
+        for attempt in range(1, self.max_retries + 1):
+            last_attempt = attempt == self.max_retries
+            try:
+                resp = self.session.post(self.base_url, json=payload, timeout=self.timeout)
+                resp.raise_for_status()
+                return resp.json()
+            except http_requests.HTTPError as exc:
+                status = None
+                detail = ""
+                if exc.response is not None:
+                    status = exc.response.status_code
+                    detail = exc.response.text
+                # Client errors such as 400/401/404 cannot succeed on a retry.
+                retryable = status is None or status >= 500 or status in _RETRYABLE_CLIENT_STATUSES
+                if last_attempt or not retryable:
+                    raise RuntimeError(f"OpenRouter HTTPError: {detail}") from exc
+            except Exception:
+                if last_attempt:
+                    raise
+            time.sleep(min(2 * attempt, 8))
+        raise RuntimeError("Unreachable retry loop")
+
+    def _save_images(self, image_data_urls: list[str], task: str, doc_id: int) -> list[str]:
+        """Write each data URL to ``output_dir/<task>/<doc_id>_<idx>.png``; return the paths."""
+        task_dir = Path(self.output_dir) / str(task).replace("/", "_")
+        task_dir.mkdir(parents=True, exist_ok=True)
+
+        saved_paths: list[str] = []
+        for idx, data_url in enumerate(image_data_urls):
+            raw = self._decode_data_url(data_url)
+            path = task_dir / f"{doc_id}_{idx}.png"
+            path.write_bytes(raw)
+            saved_paths.append(str(path))
+        return saved_paths
+
+    def generate_until(self, requests: list[Instance]) -> list[str]:
+        """Generate for each request and return one JSON string per request.
+
+        ``max_new_tokens`` / ``temperature`` found in a request's gen kwargs apply
+        to that request only; they never modify the instance defaults.
+        """
+        outputs: list[str] = []
+        for req in requests:
+            args = req.args
+            if len(args) < 6:
+                outputs.append(json.dumps({"text": "", "images": []}, ensure_ascii=False))
+                continue
+            ctx, gen_kwargs, doc_to_visual, doc_id, task, split = args[:6]
+            prompt = str(ctx)
+            local_gen_kwargs = dict(gen_kwargs or {})
+
+            visuals_raw = doc_to_visual(self.task_dict[task][split][doc_id]) or []
+            visuals = [item for item in visuals_raw if isinstance(item, Image.Image)]
+
+            max_new_tokens = self.max_new_tokens
+            if "max_new_tokens" in local_gen_kwargs:
+                max_new_tokens = int(local_gen_kwargs["max_new_tokens"])
+            temperature = self.temperature
+            if "temperature" in local_gen_kwargs:
+                value = local_gen_kwargs["temperature"]
+                temperature = None if value is None else float(value)
+
+            try:
+                data = self._request_generation(prompt=prompt, visuals=visuals, max_new_tokens=max_new_tokens, temperature=temperature)
+            except Exception as exc:
+                if not visuals:
+                    # Nothing to drop: a text-only retry would repeat the same request.
+                    raise
+                logger.warning("openrouter_image_gen: request with %d image(s) failed (%s); retrying without images", len(visuals), exc)
+                data = self._request_generation(prompt=prompt, visuals=[], max_new_tokens=max_new_tokens, temperature=temperature)
+            image_urls = self._extract_images(data)
+            saved_images = self._save_images(image_urls, task=str(task), doc_id=int(doc_id))
+
+            try:
+                # ``content`` may be null or missing; normalize to an empty string.
+                text = data["choices"][0]["message"].get("content") or ""
+            except (KeyError, IndexError, TypeError, AttributeError):
+                text = ""
+
+            result = {"text": text, "images": saved_images}
+            outputs.append(json.dumps(result, ensure_ascii=False))
+            self.cache_hook.add_partial("generate_until", (ctx, local_gen_kwargs), outputs[-1])
+
+        return outputs
+
+    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
+        raise NotImplementedError("openrouter_image_gen does not support loglikelihood")
+
+    def generate_until_multi_round(self, requests: list[Instance]) -> list[str]:
+        raise NotImplementedError("openrouter_image_gen does not support multi-round generation")
diff --git a/lmms_eval/tasks/ami/utils.py b/lmms_eval/tasks/ami/utils.py
index 778d0c3e5..2a07e87eb 100644
--- a/lmms_eval/tasks/ami/utils.py
+++ b/lmms_eval/tasks/ami/utils.py
@@ -1,8 +1,10 @@
 import os
 import re
 import string
+
 import numpy as np
 from loguru import logger as eval_logger
+
 from lmms_eval.llm_judge import ServerConfig, get_server
 
 API_TYPE = os.getenv("API_TYPE", "openai")
@@ -36,23 +38,23 @@ def remove_punctuation_except_apostrophe(text):
 
 def ami_doc_to_audio(doc):
     """Extract audio from AMI dataset document
-    
+
     AMI dataset uses AudioDecoder type with get_all_samples() method.
     Returns audio array and sampling rate (16kHz for AMI).
     """
     audio_file = doc.get("audio")
-    
+
     if not audio_file:
         eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}")
         return []
-    
+
     try:
         # AMI uses AudioDecoder type with get_all_samples() method
         if hasattr(audio_file, "get_all_samples"):
             decoded_audio = audio_file.get_all_samples()
         else:
             decoded_audio = audio_file
-        
+
         # Extract array - check for data attribute first (AudioSamples object)
         if hasattr(decoded_audio, "data"):
             # AudioSamples object from torchcodec
@@ -68,13 +70,13 @@ def ami_doc_to_audio(doc):
             audio_array = decoded_audio.array
         else:
             audio_array = decoded_audio
-        
+
         # Convert torch tensor to numpy if needed
         if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"):
             audio_array = audio_array.cpu().numpy()
         elif hasattr(audio_array, "numpy"):
             audio_array = audio_array.numpy()
-        
+
         # Ensure it's a numpy array and flatten if needed
         if not isinstance(audio_array, np.ndarray):
             try:
@@ -86,27 +88,28 @@ def ami_doc_to_audio(doc):
                     audio_array = np.array(audio_array.tolist())
                 else:
                     raise
-        
+
         # Ensure it's 1D array (flatten if multi-channel)
         if audio_array.ndim > 1:
             audio_array = audio_array.flatten()
-        
+
         # Ensure float32 dtype for librosa compatibility
         if audio_array.dtype != np.float32:
             audio_array = audio_array.astype(np.float32)
-        
+
         # Get sampling rate (AMI is 16kHz)
         sampling_rate = getattr(audio_file, "_desired_sample_rate", 16000)
-        
+
         eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}")
-        
+
         return [{"array": audio_array, "sampling_rate": sampling_rate}]
-        
+
     except Exception as e:
         eval_logger.error(f"Error extracting audio: {e}")
         eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}")
         # Re-raise to help debug
         import traceback
+
         eval_logger.error(f"Traceback: {traceback.format_exc()}")
         return []
 
@@ -115,14 +118,14 @@ def ami_doc_to_text(doc, lmms_eval_specific_kwargs):
     """Generate prompt for the audio model"""
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     # Get meeting context if needed
     meeting_id = get_column_value(doc, ["meeting_id"])
     speaker_id = get_column_value(doc, ["speaker_id"])
-    
+
     # Default prompt for speech recognition
     default_prompt = "Please transcribe the following audio. Only provide the transcription without any additional explanation or formatting."
-    
+
     return f"{pre_prompt}{default_prompt}{post_prompt}"
 
 
@@ -132,35 +135,35 @@ def ami_process_results_asr(doc, results):
     Calculates Word Error Rate (WER) - case insensitive.
     """
     scores = []
-    
+
     # Get ground truth
     ground_truth = get_column_value(doc, ["text", "transcript", "transcription"])
     if not ground_truth:
         eval_logger.warning("No ground truth text found in document")
         return {"wer": 1.0}
-    
+
     # Normalize: strip and lowercase for case-insensitive comparison
     ground_truth = ground_truth.strip().lower()
-    
+
     # Remove all punctuation except apostrophe
     ground_truth = remove_punctuation_except_apostrophe(ground_truth)
-    
+
     for pred in results:
         prediction = pred.strip() if isinstance(pred, str) else str(pred)
-        
+
         # Extract transcription from various formats
         prediction = extract_transcription(prediction)
-        
+
         # Normalize: strip and lowercase for case-insensitive comparison
         prediction = prediction.strip().lower()
-        
+
         # Remove all punctuation except apostrophe
         prediction = remove_punctuation_except_apostrophe(prediction)
-        
+
         # Calculate Word Error Rate
         wer = calculate_wer(ground_truth, prediction)
         scores.append(wer)
-    
+
     avg_wer = sum(scores) / len(scores) if scores else 1.0
     return {"wer": avg_wer}
 
@@ -172,9 +175,9 @@ def extract_transcription(text):
     """
     if not isinstance(text, str):
         return str(text)
-    
+
     text = text.strip()
-    
+
     # Pattern 1: XML-style tags
     for tag in ["<answer>", "<response>", "<result>", "<transcription>", "<text>"]:
         closing_tag = tag.replace("<", "</")
@@ -182,40 +185,37 @@ def extract_transcription(text):
         match = re.search(pattern, text, re.IGNORECASE)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 2: "The transcription of the audio is:" followed by text in quotes
     patterns = [
         r"(?:the\s+)?transcription\s+(?:of\s+)?(?:the\s+)?audio\s+is\s*:\s*['\"](.+?)['\"]\s*\.?\s*$",
         r"(?:the\s+)?original\s+content\s+(?:of\s+)?(?:this\s+)?audio\s+is\s*:\s*['\"](.+?)['\"]\s*\.?\s*$",
         r"(?:the\s+)?(?:audio|speech)\s+(?:content|transcription|says)\s*:\s*['\"](.+?)['\"]\s*\.?\s*$",
     ]
-    
+
     for pattern in patterns:
         match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 3: Text enclosed in quotes (single or double)
-    quote_patterns = [
-        r"^['\"](.+?)['\"]\s*\.?\s*$",  # Entire text in quotes
-        r"['\"]([^'\"]{20,})['\"]"  # Long text in quotes (at least 20 chars)
-    ]
-    
+    quote_patterns = [r"^['\"](.+?)['\"]\s*\.?\s*$", r"['\"]([^'\"]{20,})['\"]"]  # Entire text in quotes  # Long text in quotes (at least 20 chars)
+
     for pattern in quote_patterns:
         match = re.search(pattern, text, re.DOTALL)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 4: Remove common prefixes
     prefixes_to_remove = [
         r"^(?:here\s+is\s+)?(?:the\s+)?transcription\s*(?:of\s+(?:the\s+)?audio)?\s*:\s*",
         r"^(?:the\s+)?(?:audio|speech)\s+(?:says|contains)\s*:\s*",
         r"^(?:answer|response|result)\s*:\s*",
     ]
-    
+
     for prefix in prefixes_to_remove:
         text = re.sub(prefix, "", text, flags=re.IGNORECASE)
-    
+
     return text.strip()
 
 
@@ -232,28 +232,28 @@ def calculate_wer(reference, hypothesis):
     # Split into words
     ref_words = reference.split()
     hyp_words = hypothesis.split()
-    
+
     # Build edit distance matrix
     n, m = len(ref_words), len(hyp_words)
     dp = [[0] * (m + 1) for _ in range(n + 1)]
-    
+
     # Initialize
     for i in range(n + 1):
         dp[i][0] = i
     for j in range(m + 1):
         dp[0][j] = j
-    
+
     # Dynamic programming
     for i in range(1, n + 1):
         for j in range(1, m + 1):
-            if ref_words[i-1] == hyp_words[j-1]:
-                dp[i][j] = dp[i-1][j-1]
+            if ref_words[i - 1] == hyp_words[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1]
             else:
-                substitution = dp[i-1][j-1] + 1
-                insertion = dp[i][j-1] + 1
-                deletion = dp[i-1][j] + 1
+                substitution = dp[i - 1][j - 1] + 1
+                insertion = dp[i][j - 1] + 1
+                deletion = dp[i - 1][j] + 1
                 dp[i][j] = min(substitution, insertion, deletion)
-    
+
     # Calculate WER
     wer = dp[n][m] / max(n, 1)  # Avoid division by zero
     return wer
@@ -311,13 +311,7 @@ def ami_process_results_llm_judge(doc, results):
 
             custom_config = ServerConfig(model_name=JUDGE_MODEL_VERSION, temperature=0.5, max_tokens=10)
 
-            request = Request(
-                messages=[
-                    {"role": "system", "content": "You are a helpful assistant who evaluates speech recognition quality."},
-                    {"role": "user", "content": formatted_prompt}
-                ],
-                config=custom_config
-            )
+            request = Request(messages=[{"role": "system", "content": "You are a helpful assistant who evaluates speech recognition quality."}, {"role": "user", "content": formatted_prompt}], config=custom_config)
 
             response = server.evaluate(request)
 
@@ -349,16 +343,16 @@ def ami_aggregate_results(results):
         return 0.0
 
     total_count = len(results)
-    
+
     # If results are WER scores, return average
     if all(isinstance(r, (int, float)) for r in results):
         avg_score = sum(results) / total_count
         eval_logger.info(f"AMI evaluation: Average score: {avg_score:.4f}")
         return avg_score
-    
+
     # If results are boolean (correct/incorrect), calculate accuracy
     correct_count = sum(results)
     accuracy = correct_count / total_count if total_count > 0 else 0.0
     eval_logger.info(f"AMI evaluation: {correct_count}/{total_count} correct, accuracy: {accuracy:.4f}")
-    
+
     return accuracy
diff --git a/lmms_eval/tasks/cn_college_listen_mcq/utils.py b/lmms_eval/tasks/cn_college_listen_mcq/utils.py
index 758c90b63..508eb43d5 100644
--- a/lmms_eval/tasks/cn_college_listen_mcq/utils.py
+++ b/lmms_eval/tasks/cn_college_listen_mcq/utils.py
@@ -1,8 +1,10 @@
 import os
 import re
+
 import numpy as np
 from loguru import logger as eval_logger
 
+
 def get_column_value(doc, candidates):
     """Helper function to get value from document with multiple possible column names"""
     for candidate in candidates:
@@ -13,22 +15,22 @@ def get_column_value(doc, candidates):
 
 def cn_college_mcq_doc_to_audio(doc):
     """Extract audio from CN College MCQ dataset
-    
+
     Dataset uses 'context' field for audio.
     """
     audio_file = doc.get("context")
-    
+
     if not audio_file:
         eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}")
         return []
-    
+
     try:
         # Handle AudioDecoder type (like AMI dataset)
         if str(type(audio_file).__name__) == "AudioDecoder":
             decoded_audio = audio_file.get_all_samples()
-            
+
             eval_logger.debug(f"decoded_audio type: {type(decoded_audio)}, type name: {type(decoded_audio).__name__}")
-            
+
             # Extract array from AudioSamples or similar objects
             if hasattr(decoded_audio, "data"):
                 audio_array = decoded_audio.data
@@ -42,9 +44,9 @@ def cn_college_mcq_doc_to_audio(doc):
             else:
                 audio_array = decoded_audio
                 eval_logger.debug("Using decoded_audio directly")
-            
+
             eval_logger.debug(f"audio_array type before conversion: {type(audio_array)}")
-            
+
             # Convert torch tensor to numpy if needed
             if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"):
                 audio_array = audio_array.cpu().numpy()
@@ -55,7 +57,7 @@ def cn_college_mcq_doc_to_audio(doc):
             elif hasattr(audio_array, "detach"):
                 audio_array = audio_array.detach().cpu().numpy()
                 eval_logger.debug("Converted from torch tensor (detach)")
-            
+
             # Ensure it's numpy array
             if not isinstance(audio_array, np.ndarray):
                 try:
@@ -65,17 +67,17 @@ def cn_college_mcq_doc_to_audio(doc):
                     eval_logger.error(f"Failed to convert to numpy array: {e}")
                     eval_logger.error(f"audio_array type: {type(audio_array)}, value: {audio_array}")
                     return []
-            
+
             # Flatten if multi-dimensional
             if audio_array.ndim > 1:
                 audio_array = audio_array.flatten()
                 eval_logger.debug(f"Flattened to shape: {audio_array.shape}")
-            
+
             # Ensure float32 dtype
             if audio_array.dtype != np.float32:
                 audio_array = audio_array.astype(np.float32)
                 eval_logger.debug(f"Converted to float32, dtype: {audio_array.dtype}")
-            
+
             # Get sampling rate
             sampling_rate = 16000  # default
             if hasattr(decoded_audio, "sample_rate"):
@@ -84,23 +86,23 @@ def cn_college_mcq_doc_to_audio(doc):
                 sampling_rate = decoded_audio.sampling_rate
             elif hasattr(audio_file, "_desired_sample_rate"):
                 sampling_rate = audio_file._desired_sample_rate
-            
+
             eval_logger.debug(f"Final audio shape: {audio_array.shape}, sampling_rate: {sampling_rate}")
-            
+
             return [{"array": audio_array, "sampling_rate": sampling_rate}]
-        
+
         # Handle dict-like audio (standard HF format)
         elif isinstance(audio_file, dict):
             if "array" in audio_file and "sampling_rate" in audio_file:
                 return [audio_file]
-        
+
         # Handle direct array
         elif isinstance(audio_file, (list, np.ndarray)):
             return [{"array": np.array(audio_file, dtype=np.float32), "sampling_rate": 16000}]
-        
+
         eval_logger.warning(f"Unknown audio type: {type(audio_file)}")
         return []
-        
+
     except Exception as e:
         eval_logger.error(f"Error extracting audio: {e}")
         eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}")
@@ -111,14 +113,14 @@ def cn_college_mcq_doc_to_text(doc, lmms_eval_specific_kwargs):
     """Generate prompt for the audio model"""
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     # Get question and choices
     instruction = doc.get("instruction", "")
     choices = doc.get("choices", "")
-    
+
     # Build prompt
     prompt = f"{instruction}\n\n{choices}"
-    
+
     return f"{pre_prompt}{prompt}{post_prompt}"
 
 
@@ -127,19 +129,20 @@ def cn_college_mcq_process_results(doc, results):
     Process results for Chinese College Listening MCQ task.
     Extract the predicted answer and compare with ground truth.
     """
+
     def normalize(text):
         """Normalize text for comparison"""
         if not isinstance(text, str):
             text = str(text)
         return text.lower().strip()
-    
+
     def extract_answer(response):
         """Extract answer from model response"""
         if not response:
             return None
-        
+
         response = normalize(response)
-        
+
         patterns = [
             r"answer\s+is\s+([ABCD])",
             r"answer:\s*([ABCD])",
@@ -152,55 +155,52 @@ def extract_answer(response):
             r"^([ABCD])[\.,。]",
             r"^([ABCD])$",
         ]
-        
+
         for pattern in patterns:
             match = re.search(pattern, response, re.IGNORECASE)
             if match:
                 return match.group(1).upper()
-        
+
         # Check if response starts with A/B/C/D
         for choice in ["A", "B", "C", "D"]:
             if response.startswith(choice.lower()):
                 return choice
-        
+
         return None
-    
+
     # Get ground truth answer and extract the option letter
     ground_truth_raw = doc.get("answer", "").strip()
     # Extract option letter from ground truth (e.g., "(A) Find a place." -> "A")
     ground_truth = extract_answer(ground_truth_raw)
-    
+
     # If extraction failed, try direct match
     if not ground_truth:
         ground_truth = ground_truth_raw.upper()
-    
+
     # Extract predicted answer from first result
     pred = results[0] if results else ""
     predicted_answer = extract_answer(pred)
-    
+
     # Calculate accuracy
     correct = 1 if predicted_answer and predicted_answer == ground_truth else 0
-    
+
     # Calculate failure rate (unable to extract valid answer)
     failure = 1 if predicted_answer is None else 0
-    
+
     eval_logger.debug(f"Ground truth raw: {ground_truth_raw}, extracted: {ground_truth}, Predicted: {predicted_answer}, Correct: {correct}")
-    
-    return {
-        "accuracy": correct,
-        "failure_rate": failure
-    }
+
+    return {"accuracy": correct, "failure_rate": failure}
 
 
 def cn_college_mcq_aggregate_results(results):
     """Aggregate results across all samples"""
     if not results:
         return 0.0
-    
+
     total_count = len(results)
     correct_count = sum(results)
     accuracy = correct_count / total_count if total_count > 0 else 0.0
-    
+
     eval_logger.info(f"CN College MCQ evaluation: {correct_count}/{total_count} correct, accuracy: {accuracy:.4f}")
-    
+
     return accuracy
diff --git a/lmms_eval/tasks/dream_tts_mcq/utils.py b/lmms_eval/tasks/dream_tts_mcq/utils.py
index dfeae3abe..7b267aaae 100644
--- a/lmms_eval/tasks/dream_tts_mcq/utils.py
+++ b/lmms_eval/tasks/dream_tts_mcq/utils.py
@@ -1,8 +1,10 @@
 import os
 import re
+
 import numpy as np
 from loguru import logger as eval_logger
 
+
 def get_column_value(doc, candidates):
     """Helper function to get value from document with multiple possible column names"""
     for candidate in candidates:
@@ -13,22 +15,22 @@ def get_column_value(doc, candidates):
 
 def dream_tts_mcq_doc_to_audio(doc):
     """Extract audio from DREAM TTS MCQ dataset
-    
+
     Dataset uses 'context' field for audio.
     """
     audio_file = doc.get("context")
-    
+
     if not audio_file:
         eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}")
         return []
-    
+
     try:
         # Handle AudioDecoder type (like AMI dataset)
         if str(type(audio_file).__name__) == "AudioDecoder":
             decoded_audio = audio_file.get_all_samples()
-            
+
             eval_logger.debug(f"decoded_audio type: {type(decoded_audio)}, type name: {type(decoded_audio).__name__}")
-            
+
             # Extract array from AudioSamples or similar objects
             if hasattr(decoded_audio, "data"):
                 audio_array = decoded_audio.data
@@ -42,9 +44,9 @@ def dream_tts_mcq_doc_to_audio(doc):
             else:
                 audio_array = decoded_audio
                 eval_logger.debug("Using decoded_audio directly")
-            
+
             eval_logger.debug(f"audio_array type before conversion: {type(audio_array)}")
-            
+
             # Convert torch tensor to numpy if needed
             if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"):
                 audio_array = audio_array.cpu().numpy()
@@ -55,7 +57,7 @@ def dream_tts_mcq_doc_to_audio(doc):
             elif hasattr(audio_array, "detach"):
                 audio_array = audio_array.detach().cpu().numpy()
                 eval_logger.debug("Converted from torch tensor (detach)")
-            
+
             # Ensure it's numpy array
             if not isinstance(audio_array, np.ndarray):
                 try:
@@ -65,17 +67,17 @@ def dream_tts_mcq_doc_to_audio(doc):
                     eval_logger.error(f"Failed to convert to numpy array: {e}")
                     eval_logger.error(f"audio_array type: {type(audio_array)}, value: {audio_array}")
                     return []
-            
+
             # Flatten if multi-dimensional
             if audio_array.ndim > 1:
                 audio_array = audio_array.flatten()
                 eval_logger.debug(f"Flattened to shape: {audio_array.shape}")
-            
+
             # Ensure float32 dtype
             if audio_array.dtype != np.float32:
                 audio_array = audio_array.astype(np.float32)
                 eval_logger.debug(f"Converted to float32, dtype: {audio_array.dtype}")
-            
+
             # Get sampling rate
             sampling_rate = 16000  # default
             if hasattr(decoded_audio, "sample_rate"):
@@ -84,23 +86,23 @@ def dream_tts_mcq_doc_to_audio(doc):
                 sampling_rate = decoded_audio.sampling_rate
             elif hasattr(audio_file, "_desired_sample_rate"):
                 sampling_rate = audio_file._desired_sample_rate
-            
+
             eval_logger.debug(f"Final audio shape: {audio_array.shape}, sampling_rate: {sampling_rate}")
-            
+
             return [{"array": audio_array, "sampling_rate": sampling_rate}]
-        
+
         # Handle dict-like audio (standard HF format)
         elif isinstance(audio_file, dict):
             if "array" in audio_file and "sampling_rate" in audio_file:
                 return [audio_file]
-        
+
         # Handle direct array
         elif isinstance(audio_file, (list, np.ndarray)):
             return [{"array": np.array(audio_file, dtype=np.float32), "sampling_rate": 16000}]
-        
+
         eval_logger.warning(f"Unknown audio type: {type(audio_file)}")
         return []
-        
+
     except Exception as e:
         eval_logger.error(f"Error extracting audio: {e}")
         eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}")
@@ -111,14 +113,14 @@ def dream_tts_mcq_doc_to_text(doc, lmms_eval_specific_kwargs):
     """Generate prompt for the audio model"""
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     # Get question and choices
     instruction = doc.get("instruction", "")
     choices = doc.get("choices", "")
-    
+
     # Build prompt
     prompt = f"{instruction}\n\n{choices}"
-    
+
     return f"{pre_prompt}{prompt}{post_prompt}"
 
 
@@ -127,19 +129,20 @@ def dream_tts_mcq_process_results(doc, results):
     Process results for DREAM TTS MCQ task.
     Extract the predicted answer and compare with ground truth.
     """
+
     def normalize(text):
         """Normalize text for comparison"""
         if not isinstance(text, str):
             text = str(text)
         return text.lower().strip()
-    
+
     def extract_answer(response):
         """Extract answer from model response"""
         if not response:
             return None
-        
+
         response = normalize(response)
-        
+
         patterns = [
             r"answer\s+is\s+([ABCD])",
             r"answer:\s*([ABCD])",
@@ -152,55 +155,52 @@ def extract_answer(response):
             r"^([ABCD])[\.,。]",
             r"^([ABCD])$",
         ]
-        
+
         for pattern in patterns:
             match = re.search(pattern, response, re.IGNORECASE)
             if match:
                 return match.group(1).upper()
-        
+
         # Check if response starts with A/B/C/D
         for choice in ["A", "B", "C", "D"]:
             if response.startswith(choice.lower()):
                 return choice
-        
+
         return None
-    
+
     # Get ground truth answer and extract the option letter
     ground_truth_raw = doc.get("answer", "").strip()
     # Extract option letter from ground truth (e.g., "(A) Find a place." -> "A")
     ground_truth = extract_answer(ground_truth_raw)
-    
+
     # If extraction failed, try direct match
     if not ground_truth:
         ground_truth = ground_truth_raw.upper()
-    
+
     # Extract predicted answer from first result
     pred = results[0] if results else ""
     predicted_answer = extract_answer(pred)
-    
+
     # Calculate accuracy
     correct = 1 if predicted_answer and predicted_answer == ground_truth else 0
-    
+
     # Calculate failure rate (unable to extract valid answer)
     failure = 1 if predicted_answer is None else 0
-    
+
     eval_logger.debug(f"Ground truth raw: {ground_truth_raw}, extracted: {ground_truth}, Predicted: {predicted_answer}, Correct: {correct}")
-    
-    return {
-        "accuracy": correct,
-        "failure_rate": failure
-    }
+
+    return {"accuracy": correct, "failure_rate": failure}
 
 
 def dream_tts_mcq_aggregate_results(results):
     """Aggregate results across all samples"""
     if not results:
         return 0.0
-    
+
     total_count = len(results)
     correct_count = sum(results)
     accuracy = correct_count / total_count if total_count > 0 else 0.0
-    
+
     eval_logger.info(f"DREAM TTS MCQ evaluation: {correct_count}/{total_count} correct, accuracy: {accuracy:.4f}")
-    
+
     return accuracy
diff --git a/lmms_eval/tasks/europal_asr/utils.py b/lmms_eval/tasks/europal_asr/utils.py
index 8fab52eb2..b8d74607e 100644
--- a/lmms_eval/tasks/europal_asr/utils.py
+++ b/lmms_eval/tasks/europal_asr/utils.py
@@ -1,5 +1,6 @@
 import os
 import re
+
 import numpy as np
 from loguru import logger as eval_logger
 
@@ -14,22 +15,22 @@ def get_column_value(doc, candidates):
 
 def europal_asr_doc_to_audio(doc):
     """Extract audio from europal-asr dataset document
-    
+
     Returns audio array and sampling rate (16kHz for europal-asr).
     """
     audio_file = doc.get("audio")
-    
+
     if not audio_file:
         eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}")
         return []
-    
+
     try:
         # Handle AudioDecoder type with get_all_samples() method
         if hasattr(audio_file, "get_all_samples"):
             decoded_audio = audio_file.get_all_samples()
         else:
             decoded_audio = audio_file
-        
+
         # Extract array - check for data attribute first (AudioSamples object)
         if hasattr(decoded_audio, "data"):
             # AudioSamples object from torchcodec
@@ -45,13 +46,13 @@ def europal_asr_doc_to_audio(doc):
             audio_array = decoded_audio.array
         else:
             audio_array = decoded_audio
-        
+
         # Convert torch tensor to numpy if needed
         if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"):
             audio_array = audio_array.cpu().numpy()
         elif hasattr(audio_array, "numpy"):
             audio_array = audio_array.numpy()
-        
+
         # Ensure it's a numpy array and flatten if needed
         if not isinstance(audio_array, np.ndarray):
             try:
@@ -63,22 +64,22 @@ def europal_asr_doc_to_audio(doc):
                     audio_array = np.array(audio_array.tolist())
                 else:
                     raise
-        
+
         # Ensure it's 1D array (flatten if multi-channel)
         if audio_array.ndim > 1:
             audio_array = audio_array.flatten()
-        
+
         # Ensure float32 dtype for librosa compatibility
         if audio_array.dtype != np.float32:
             audio_array = audio_array.astype(np.float32)
-        
+
         # Get sampling rate (europal-asr is 16kHz)
         sampling_rate = getattr(audio_file, "_desired_sample_rate", 16000)
-        
+
         eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}")
-        
+
         return [{"array": audio_array, "sampling_rate": sampling_rate}]
-        
+
     except Exception as e:
         eval_logger.error(f"Error extracting audio: {e}")
         eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}")
@@ -89,10 +90,10 @@ def europal_asr_doc_to_text(doc, lmms_eval_specific_kwargs):
     """Generate prompt for the audio model"""
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     # Default prompt for speech recognition
     default_prompt = "Please transcribe the following audio. Only provide the transcription without any additional explanation or formatting."
-    
+
     return f"{pre_prompt}{default_prompt}{post_prompt}"
 
 
@@ -102,27 +103,27 @@ def europal_asr_process_results_asr(doc, results):
     Calculates Word Error Rate (WER) using a simple implementation.
     """
     scores = []
-    
+
     # Get ground truth
     ground_truth = get_column_value(doc, ["text_verbatim", "transcript", "transcription"])
     if not ground_truth:
         eval_logger.warning("No ground truth text found in document")
         return {"wer": 1.0}
-    
+
     ground_truth = ground_truth.strip().upper()
-    
+
     for pred in results:
         prediction = pred.strip() if isinstance(pred, str) else str(pred)
         prediction = prediction.strip()
-        
+
         # Extract transcription from various formats
         prediction = extract_transcription(prediction)
         prediction = prediction.upper()
-        
+
         # Calculate Word Error Rate
         wer = calculate_wer(ground_truth, prediction)
         scores.append(wer)
-    
+
     avg_wer = sum(scores) / len(scores) if scores else 1.0
     return {"wer": avg_wer}
 
@@ -134,9 +135,9 @@ def extract_transcription(text):
     """
     if not isinstance(text, str):
         return str(text)
-    
+
     text = text.strip()
-    
+
     # Pattern 1: XML-style tags
     for tag in ["<answer>", "<response>", "<result>", "<transcription>", "<text>"]:
         closing_tag = tag.replace("<", "</")
@@ -144,40 +145,37 @@ def extract_transcription(text):
         match = re.search(pattern, text, re.IGNORECASE)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 2: "The original content of this audio is:" followed by text in quotes
     patterns = [
         r"(?:the\s+)?original\s+content\s+(?:of\s+)?(?:this\s+)?audio\s+is\s*:\s*['\"](.+?)['\"]\s*$",
         r"(?:the\s+)?(?:audio|speech)\s+(?:content|transcription|says)\s*:\s*['\"](.+?)['\"]\s*$",
         r"transcription\s*:\s*['\"](.+?)['\"]\s*$",
     ]
-    
+
     for pattern in patterns:
         match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 3: Text enclosed in quotes (single or double)
-    quote_patterns = [
-        r"^['\"](.+?)['\"]$",  # Entire text in quotes
-        r"['\"]([^'\"]{20,})['\"]"  # Long text in quotes (at least 20 chars)
-    ]
-    
+    quote_patterns = [r"^['\"](.+?)['\"]$", r"['\"]([^'\"]{20,})['\"]"]  # [0] entire text in quotes; [1] long text in quotes (at least 20 chars)
+
     for pattern in quote_patterns:
         match = re.search(pattern, text, re.DOTALL)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 4: Remove common prefixes
     prefixes_to_remove = [
         r"^(?:here\s+is\s+)?(?:the\s+)?transcription\s*:\s*",
         r"^(?:the\s+)?(?:audio|speech)\s+(?:says|contains)\s*:\s*",
         r"^(?:answer|response|result)\s*:\s*",
     ]
-    
+
     for prefix in prefixes_to_remove:
         text = re.sub(prefix, "", text, flags=re.IGNORECASE)
-    
+
     return text.strip()
 
 
@@ -194,17 +192,17 @@ def calculate_wer(reference, hypothesis):
     # Split into words
     ref_words = reference.split()
     hyp_words = hypothesis.split()
-    
+
     # Build edit distance matrix
     n, m = len(ref_words), len(hyp_words)
     dp = [[0] * (m + 1) for _ in range(n + 1)]
-    
+
     # Initialize
     for i in range(n + 1):
         dp[i][0] = i
     for j in range(m + 1):
         dp[0][j] = j
-    
+
     # Fill matrix
     for i in range(1, n + 1):
         for j in range(1, m + 1):
@@ -215,10 +213,10 @@ def calculate_wer(reference, hypothesis):
                 insertion = dp[i][j - 1] + 1
                 deletion = dp[i - 1][j] + 1
                 dp[i][j] = min(substitution, insertion, deletion)
-    
+
     # Calculate WER
     if n == 0:
         return 0.0 if m == 0 else 1.0
-    
+
     wer = dp[n][m] / n
     return wer
diff --git a/lmms_eval/tasks/ice_bench/README.md b/lmms_eval/tasks/ice_bench/README.md
new file mode 100644
index 000000000..eb5abdb0f
--- /dev/null
+++ b/lmms_eval/tasks/ice_bench/README.md
@@ -0,0 +1,9 @@
+# ICE-Bench
+
+This task folder provides a lightweight ICE-Bench integration path for smoke validation.
+
+- Task: `ice_bench`
+- Source: official ICE-Bench dataset payload format (`ali-vilab/ICE-Bench`)
+- Dataset file expected by YAML: `/tmp/ice_bench_smoke.jsonl`
+
+`examples/models/openrouter_ice_smoke.sh` can bootstrap one official sample into that file and then run an end-to-end image generation/editing smoke test that saves the generated artifacts locally.
diff --git a/lmms_eval/tasks/ice_bench/ice_bench.yaml b/lmms_eval/tasks/ice_bench/ice_bench.yaml
new file mode 100644
index 000000000..1faa58e85
--- /dev/null
+++ b/lmms_eval/tasks/ice_bench/ice_bench.yaml
@@ -0,0 +1,23 @@
+dataset_path: json  # presumably the generic Hugging Face `json` dataset loader — confirm
+dataset_kwargs:
+  data_files:
+    train: /tmp/ice_bench_smoke.jsonl  # expected to exist already; examples/models/openrouter_ice_smoke.sh can write it
+
+task: "ice_bench"
+test_split: train  # the only split declared under data_files above
+output_type: generate_until
+
+doc_to_visual: !function utils.ice_doc_to_visual
+doc_to_text: !function utils.ice_doc_to_text
+doc_to_target: !function utils.ice_doc_to_target
+
+process_results: !function utils.ice_process_results
+
+metric_list:
+  - metric: artifact_saved  # key returned by utils.ice_process_results (0.0 or 1.0 per sample)
+    aggregation: mean
+    higher_is_better: true
+
+metadata:
+  - version: 0.1
+    description: "ICE-Bench single-sample smoke using official dataset payload"
diff --git a/lmms_eval/tasks/ice_bench/utils.py b/lmms_eval/tasks/ice_bench/utils.py
new file mode 100644
index 000000000..b925e3bb2
--- /dev/null
+++ b/lmms_eval/tasks/ice_bench/utils.py
@@ -0,0 +1,73 @@
+from __future__ import annotations
+
+import json
+import os
+from typing import Any
+
+from PIL import Image
+
+
+def _as_text(value: Any) -> str:
+    """Return ``str(value)``, mapping ``None`` to an empty string."""
+    return "" if value is None else str(value)
+
+
+def ice_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]:
+    """Load the source image named by ``doc["source_image"]``.
+
+    Returns a one-element list with the image converted to RGB, or an empty
+    list when the field is missing, empty, not a string, or not a regular file.
+    """
+    src = doc.get("source_image", "")
+    if not (isinstance(src, str) and src and os.path.isfile(src)):
+        return []
+    # Image.open is lazy; convert() yields an independent copy, so the file
+    # handle can be released as soon as the with-block exits.
+    with Image.open(src) as img:
+        return [img.convert("RGB")]
+
+
+def ice_doc_to_text(doc: dict[str, Any], lmms_eval_specific_kwargs: dict[str, Any] | None = None) -> str:
+    """Build the prompt from the stripped ``instruction`` field.
+
+    When ``lmms_eval_specific_kwargs`` is non-empty, its ``pre_prompt`` and
+    ``post_prompt`` values are placed around the instruction. Missing or null
+    values become empty strings rather than the literal text ``"None"``.
+    """
+    instruction = _as_text(doc.get("instruction")).strip()
+    if lmms_eval_specific_kwargs:
+        pre_prompt = _as_text(lmms_eval_specific_kwargs.get("pre_prompt"))
+        post_prompt = _as_text(lmms_eval_specific_kwargs.get("post_prompt"))
+        return f"{pre_prompt}{instruction}{post_prompt}"
+    return instruction
+
+
+def ice_doc_to_target(doc: dict[str, Any]) -> str:
+    """Return the unstripped instruction text (empty string when missing or null)."""
+    return _as_text(doc.get("instruction"))
+
+
+def ice_process_results(doc: dict[str, Any], results: list[str]) -> dict[str, float]:
+    """Score one sample: 1.0 when the first reported image exists on disk.
+
+    ``results[0]`` is parsed as JSON and is expected to be an object whose
+    ``images`` entry is a non-empty list of file paths. Anything else (no
+    results, unparsable payload, wrong shape, path that is not a regular file)
+    scores 0.0.
+    """
+    if not results:
+        return {"artifact_saved": 0.0}
+
+    try:
+        parsed = json.loads(results[0])
+    except (ValueError, TypeError):
+        # ValueError covers json.JSONDecodeError and undecodable bytes.
+        return {"artifact_saved": 0.0}
+
+    images = parsed.get("images") if isinstance(parsed, dict) else None
+    if not isinstance(images, list) or not images:
+        return {"artifact_saved": 0.0}
+
+    first = images[0]
+    found = isinstance(first, str) and os.path.isfile(first)
+    return {"artifact_saved": 1.0 if found else 0.0}
diff --git a/lmms_eval/tasks/song_describer/utils.py b/lmms_eval/tasks/song_describer/utils.py
index 4ac206141..41726827a 100644
--- a/lmms_eval/tasks/song_describer/utils.py
+++ b/lmms_eval/tasks/song_describer/utils.py
@@ -20,22 +20,22 @@ def get_column_value(doc, candidates):
 
 def song_describer_doc_to_audio(doc):
     """Extract audio from song-describer dataset document
-    
+
     Returns audio array and sampling rate (16kHz for song-describer).
     """
     audio_file = doc.get("audio_path") or doc.get("audio")
-    
+
     if not audio_file:
         eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}")
         return []
-    
+
     try:
         # Handle different audio formats
         if hasattr(audio_file, "get_all_samples"):
             decoded_audio = audio_file.get_all_samples()
         else:
             decoded_audio = audio_file
-        
+
         # Extract array
         if hasattr(decoded_audio, "data"):
             audio_array = decoded_audio.data
@@ -49,13 +49,13 @@ def song_describer_doc_to_audio(doc):
                 audio_array = temp
         else:
             audio_array = decoded_audio
-        
+
         # Convert torch tensor to numpy if needed
         if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"):
             audio_array = audio_array.cpu().numpy()
         elif hasattr(audio_array, "numpy"):
             audio_array = audio_array.numpy()
-        
+
         # Ensure it's a numpy array
         if not isinstance(audio_array, np.ndarray):
             try:
@@ -66,22 +66,22 @@ def song_describer_doc_to_audio(doc):
                     audio_array = np.array(audio_array.tolist())
                 else:
                     raise
-        
+
         # Ensure it's 1D array (flatten if multi-channel)
         if audio_array.ndim > 1:
             audio_array = audio_array.flatten()
-        
+
         # Ensure float32 dtype
         if audio_array.dtype != np.float32:
             audio_array = audio_array.astype(np.float32)
-        
+
         # Get sampling rate (song-describer is 16kHz)
         sampling_rate = getattr(audio_file, "_desired_sample_rate", 16000)
-        
+
         eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}")
-        
+
         return [{"array": audio_array, "sampling_rate": sampling_rate}]
-        
+
     except Exception as e:
         eval_logger.error(f"Error extracting audio: {e}")
         eval_logger.error(f"Audio type: {type(audio_file)}, attributes: {dir(audio_file)}")
@@ -92,10 +92,10 @@ def song_describer_doc_to_text(doc, lmms_eval_specific_kwargs):
     """Generate prompt for music captioning"""
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     # Default prompt for music captioning
     default_prompt = "Listen to this music and describe what you hear. Include details about the instruments, genre, mood, and any distinctive musical elements."
-    
+
     return f"{pre_prompt}{default_prompt}{post_prompt}"
 
 
@@ -122,20 +122,16 @@ def song_describer_doc_to_text(doc, lmms_eval_specific_kwargs):
 def get_eval_model():
     """Lazy load evaluation model"""
     global _eval_model, _eval_tokenizer
-    
+
     if _eval_model is None:
         eval_logger.info(f"Loading evaluation model: {EVAL_MODEL_NAME}")
         _eval_tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL_NAME, trust_remote_code=True)
-        _eval_model = AutoModelForCausalLM.from_pretrained(
-            EVAL_MODEL_NAME,
-            torch_dtype=torch.bfloat16,
-            device_map="auto",
-            trust_remote_code=True
-        ).eval()
+        _eval_model = AutoModelForCausalLM.from_pretrained(EVAL_MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True).eval()
         eval_logger.info("Evaluation model loaded successfully")
-    
+
     return _eval_model, _eval_tokenizer
 
+
 eval_prompt = """
             [Music Description Task]
             Reference Description: {ground_truth}
@@ -169,23 +165,19 @@ def get_eval_model():
 def get_eval(max_tokens: int, content: str):
     """Call local Qwen model for evaluation"""
     model, tokenizer = get_eval_model()
-    
+
     messages = [
         {"role": "system", "content": "You are a professional music critic and evaluator. Provide objective and detailed assessments."},
         {"role": "user", "content": content},
     ]
-    
+
     try:
         # Format messages using chat template
-        text = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        
+        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
         # Tokenize and generate
         model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-        
+
         with torch.no_grad():
             generated_ids = model.generate(
                 **model_inputs,
@@ -194,20 +186,18 @@ def get_eval(max_tokens: int, content: str):
                 do_sample=True,
                 top_p=0.9,
             )
-        
+
         # Decode only the generated part (excluding input)
-        generated_ids = [
-            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-        ]
-        
+        generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
+
         response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-        
+
         if response:
             return response, EVAL_MODEL_NAME
-        
+
         eval_logger.warning("Empty response from evaluation model")
         return "", ""
-        
+
     except Exception as e:
         eval_logger.error(f"Error during evaluation: {e}")
         return "", ""
@@ -217,18 +207,15 @@ def song_describer_process_results(doc, results):
     """Process results for song-describer captioning task"""
     pred = results[0]
     ground_truth_str = get_column_value(doc, ["text", "caption", "description"])
-    
+
     if not ground_truth_str:
         eval_logger.warning("No ground truth text found in document")
         return {"eval_score": {"eval_answer": "", "model_name": ""}}
-    
-    content = eval_prompt.format(
-        model_response=pred,
-        ground_truth=ground_truth_str
-    )
-    
+
+    content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str)
+
     eval_answer, model_name = get_eval(max_tokens=1024, content=content)
-    
+
     return {
         "eval_score": {"eval_answer": eval_answer, "model_name": model_name},
     }
@@ -238,10 +225,10 @@ def song_describer_aggregate_results(results):
     """Aggregate evaluation scores from local model"""
     score = 0
     valid_count = 0
-    
+
     for result in results:
         eval_answer = result["eval_answer"]
-        
+
         if not eval_answer:
             continue
 
@@ -258,5 +245,5 @@ def song_describer_aggregate_results(results):
     if valid_count == 0:
         eval_logger.error("No valid evaluation scores found")
         return 0.0
-    
+
     return score / valid_count
diff --git a/lmms_eval/tasks/voxpopuli/utils.py b/lmms_eval/tasks/voxpopuli/utils.py
index b5ae95ba4..f7712bb36 100644
--- a/lmms_eval/tasks/voxpopuli/utils.py
+++ b/lmms_eval/tasks/voxpopuli/utils.py
@@ -1,15 +1,12 @@
 import os
 import re
-import numpy as np
 from collections import defaultdict
+
+import numpy as np
 from loguru import logger as eval_logger
 
 # Language code mapping (voxpopuli uses integer codes)
-LANGUAGE_MAP = {
-    0: "en", 1: "de", 2: "fr", 3: "es", 4: "pl", 5: "it",
-    6: "ro", 7: "hu", 8: "cs", 9: "nl", 10: "fi", 11: "hr",
-    12: "sk", 13: "sl", 14: "et", 15: "lt"
-}
+LANGUAGE_MAP = {0: "en", 1: "de", 2: "fr", 3: "es", 4: "pl", 5: "it", 6: "ro", 7: "hu", 8: "cs", 9: "nl", 10: "fi", 11: "hr", 12: "sk", 13: "sl", 14: "et", 15: "lt"}
 
 
 def _fallback_silent_audio(sampling_rate: int = 16000):
@@ -26,7 +23,7 @@ def get_column_value(doc, candidates):
 
 def voxpopuli_doc_to_audio(doc):
     """Extract audio from VoxPopuli dataset document
-    
+
     VoxPopuli dataset structure:
     {
       'audio': {
@@ -38,11 +35,11 @@ def voxpopuli_doc_to_audio(doc):
     }
     """
     audio_data = doc.get("audio")
-    
+
     if not audio_data:
         eval_logger.warning(f"No audio found in document. Available keys: {list(doc.keys())}")
         return _fallback_silent_audio()
-    
+
     try:
         # VoxPopuli audio is already a dict with 'array' and 'sampling_rate'
         if isinstance(audio_data, dict):
@@ -51,7 +48,7 @@ def voxpopuli_doc_to_audio(doc):
         else:
             # If it's an AudioDecoder object
             decoded_audio = audio_data.get_all_samples() if hasattr(audio_data, "get_all_samples") else audio_data
-            
+
             # Extract array - check for data attribute first (AudioSamples object from torchcodec)
             if hasattr(decoded_audio, "data"):
                 # AudioSamples object from torchcodec
@@ -67,19 +64,19 @@ def voxpopuli_doc_to_audio(doc):
                 audio_array = decoded_audio.array
             else:
                 audio_array = decoded_audio
-            
+
             sampling_rate = getattr(audio_data, "_desired_sample_rate", 16000)
-        
+
         # Convert torch tensor to numpy if needed
         if hasattr(audio_array, "cpu") and hasattr(audio_array, "numpy"):
             audio_array = audio_array.cpu().numpy()
         elif hasattr(audio_array, "numpy"):
             audio_array = audio_array.numpy()
-        
+
         # Ensure it's a numpy array
         if not isinstance(audio_array, np.ndarray):
             audio_array = np.array(audio_array)
-        
+
         # Ensure it's 1D array (flatten if multi-channel)
         if audio_array.ndim > 1:
             audio_array = audio_array.flatten()
@@ -87,15 +84,15 @@ def voxpopuli_doc_to_audio(doc):
         if audio_array.size == 0:
             eval_logger.warning("Decoded audio is empty, using 1s silent fallback audio")
             return _fallback_silent_audio(int(sampling_rate) if sampling_rate else 16000)
-        
+
         # Ensure float32 dtype
         if audio_array.dtype != np.float32:
             audio_array = audio_array.astype(np.float32)
-        
+
         eval_logger.debug(f"Audio array shape: {audio_array.shape}, dtype: {audio_array.dtype}, sampling_rate: {sampling_rate}")
-        
+
         return [{"array": audio_array, "sampling_rate": sampling_rate}]
-        
+
     except Exception as e:
         eval_logger.error(f"Error extracting audio: {e}")
         eval_logger.error(f"Audio type: {type(audio_data)}, attributes: {dir(audio_data)}")
@@ -106,14 +103,14 @@ def voxpopuli_doc_to_text(doc, lmms_eval_specific_kwargs):
     """Generate prompt for the audio model"""
     pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
     post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
-    
+
     # Get language information
     language_code = doc.get("language", 0)
     language = LANGUAGE_MAP.get(language_code, "unknown")
-    
+
     # Default prompt for speech recognition
     default_prompt = f"Please transcribe the following audio in {language}."
-    
+
     return f"{pre_prompt}{default_prompt}{post_prompt}"
 
 
@@ -123,38 +120,34 @@ def voxpopuli_process_results_asr(doc, results):
     Calculates Word Error Rate (WER) and stores language information for aggregation.
     """
     scores = []
-    
+
     # Get ground truth
     ground_truth = get_column_value(doc, ["normalized_text", "text", "transcript"])
     if not ground_truth:
         eval_logger.warning("No ground truth text found in document")
         return {"wer": 1.0, "language": doc.get("language", 0)}
-    
+
     ground_truth = ground_truth.strip().lower()  # VoxPopuli uses lowercase normalized text
-    
+
     # Get language for later aggregation
     language_code = doc.get("language", 0)
-    
+
     for pred in results:
         prediction = pred.strip() if isinstance(pred, str) else str(pred)
         prediction = prediction.strip()
-        
+
         # Extract transcription from various formats
         prediction = extract_transcription(prediction)
         prediction = prediction.lower()
-        
+
         # Calculate Word Error Rate
         wer = calculate_wer(ground_truth, prediction)
         scores.append(wer)
-    
+
     avg_wer = sum(scores) / len(scores) if scores else 1.0
-    
+
     # Return both WER and language for aggregation
-    return {
-        "wer": avg_wer,
-        "language": language_code,
-        "wer_by_language": {language_code: avg_wer}
-    }
+    return {"wer": avg_wer, "language": language_code, "wer_by_language": {language_code: avg_wer}}
 
 
 def extract_transcription(text):
@@ -164,9 +157,9 @@ def extract_transcription(text):
     """
     if not isinstance(text, str):
         return str(text)
-    
+
     text = text.strip()
-    
+
     # Pattern 1: XML-style tags
     for tag in ["<answer>", "<response>", "<result>", "<transcription>", "<text>"]:
         closing_tag = tag.replace("<", "</")
@@ -174,40 +167,37 @@ def extract_transcription(text):
         match = re.search(pattern, text, re.IGNORECASE)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 2: "The original content of this audio is:" followed by text in quotes
     patterns = [
         r"(?:the\s+)?original\s+content\s+(?:of\s+)?(?:this\s+)?audio\s+is\s*:\s*['\"](.+?)['\"]\s*$",
         r"(?:the\s+)?(?:audio|speech)\s+(?:content|transcription|says)\s*:\s*['\"](.+?)['\"]\s*$",
         r"transcription\s*:\s*['\"](.+?)['\"]\s*$",
     ]
-    
+
     for pattern in patterns:
         match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 3: Text enclosed in quotes (single or double)
-    quote_patterns = [
-        r"^['\"](.+?)['\"]$",  # Entire text in quotes
-        r"['\"]([^'\"]{20,})['\"]"  # Long text in quotes (at least 20 chars)
-    ]
-    
+    quote_patterns = [r"^['\"](.+?)['\"]$", r"['\"]([^'\"]{20,})['\"]"]  # [0] entire text in quotes; [1] long text in quotes (at least 20 chars)
+
     for pattern in quote_patterns:
         match = re.search(pattern, text, re.DOTALL)
         if match:
             return match.group(1).strip()
-    
+
     # Pattern 4: Remove common prefixes
     prefixes_to_remove = [
         r"^(?:here\s+is\s+)?(?:the\s+)?transcription\s*:\s*",
         r"^(?:the\s+)?(?:audio|speech)\s+(?:says|contains)\s*:\s*",
         r"^(?:answer|response|result)\s*:\s*",
     ]
-    
+
     for prefix in prefixes_to_remove:
         text = re.sub(prefix, "", text, flags=re.IGNORECASE)
-    
+
     return text.strip()
 
 
@@ -224,28 +214,28 @@ def calculate_wer(reference, hypothesis):
     # Split into words
     ref_words = reference.split()
     hyp_words = hypothesis.split()
-    
+
     # Build edit distance matrix
     n, m = len(ref_words), len(hyp_words)
     dp = [[0] * (m + 1) for _ in range(n + 1)]
-    
+
     # Initialize
     for i in range(n + 1):
         dp[i][0] = i
     for j in range(m + 1):
         dp[0][j] = j
-    
+
     # Dynamic programming
     for i in range(1, n + 1):
         for j in range(1, m + 1):
-            if ref_words[i-1] == hyp_words[j-1]:
-                dp[i][j] = dp[i-1][j-1]
+            if ref_words[i - 1] == hyp_words[j - 1]:
+                dp[i][j] = dp[i - 1][j - 1]
             else:
-                substitution = dp[i-1][j-1] + 1
-                insertion = dp[i][j-1] + 1
-                deletion = dp[i-1][j] + 1
+                substitution = dp[i - 1][j - 1] + 1
+                insertion = dp[i][j - 1] + 1
+                deletion = dp[i - 1][j] + 1
                 dp[i][j] = min(substitution, insertion, deletion)
-    
+
     # Calculate WER
     wer = dp[n][m] / max(n, 1)  # Avoid division by zero
     return wer
@@ -255,15 +245,15 @@ def voxpopuli_aggregate_results_overall(results):
     """Aggregate overall WER results for VoxPopuli evaluation"""
     if not results:
         return 0.0
-    
+
     total_count = len(results)
-    
+
     # Calculate overall average WER
     if all(isinstance(r, (int, float)) for r in results):
         avg_wer = sum(results) / total_count
         eval_logger.info(f"VoxPopuli Overall WER: {avg_wer:.4f}")
         return avg_wer
-    
+
     return 0.0
 
 
@@ -274,17 +264,17 @@ def voxpopuli_aggregate_results_by_language(results):
     """
     if not results:
         return {}
-    
+
     # Group results by language
     language_results = defaultdict(list)
-    
+
     for result in results:
         if isinstance(result, dict) and "wer_by_language" in result:
             for lang_code, wer in result["wer_by_language"].items():
                 language_results[lang_code].append(wer)
         elif isinstance(result, dict) and "language" in result and "wer" in result:
             language_results[result["language"]].append(result["wer"])
-    
+
     # Calculate average WER per language
     language_wer = {}
     for lang_code, wer_scores in language_results.items():
@@ -292,12 +282,12 @@ def voxpopuli_aggregate_results_by_language(results):
         lang_name = LANGUAGE_MAP.get(lang_code, f"lang_{lang_code}")
         language_wer[lang_name] = avg_wer
         eval_logger.info(f"VoxPopuli WER for {lang_name}: {avg_wer:.4f} ({len(wer_scores)} samples)")
-    
+
     # Also log overall average
     all_wers = [wer for wers in language_results.values() for wer in wers]
     if all_wers:
         overall_avg = sum(all_wers) / len(all_wers)
         eval_logger.info(f"VoxPopuli Overall Average WER: {overall_avg:.4f}")
         language_wer["overall"] = overall_avg
-    
+
     return language_wer