Skip to content

Commit ddd3b98

Browse files
feat: add ICE-Bench OpenRouter smoke pipeline (#1198)
* feat: add ICE-Bench OpenRouter image smoke pipeline * style: auto-fix lint (black + isort) --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 0139667 commit ddd3b98

7 files changed

Lines changed: 361 additions & 0 deletions

File tree

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env bash
# ICE-Bench smoke run against the `openrouter_image_gen` model.
#
# Optionally bootstraps ONE official ICE-Bench sample into
# /tmp/ice_bench_smoke.jsonl (the file the `ice_bench` task YAML loads) and
# then runs lmms_eval end to end, saving generated images locally.
#
# Environment overrides (all optional except the API key):
#   OPENROUTER_API_KEY       required; read by the openrouter_image_gen model
#   MODEL_VERSION            OpenRouter model id
#   TASKS                    lmms_eval task list
#   LIMIT                    number of documents to evaluate
#   OUTPUT_PATH              lmms_eval log/result directory
#   IMAGE_OUTPUT_DIR         directory the model writes generated images to
#   USE_OFFICIAL_ICE_SAMPLE  "1" to (re)create the one-sample smoke dataset

set -euo pipefail

# Fail fast with a clear message, and export so the Python child sees the key.
export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set}"

MODEL_VERSION="${MODEL_VERSION:-google/gemini-2.5-flash-image}"
TASKS="${TASKS:-ice_bench}"
LIMIT="${LIMIT:-1}"
OUTPUT_PATH="${OUTPUT_PATH:-./logs/openrouter_ice_smoke}"
IMAGE_OUTPUT_DIR="${IMAGE_OUTPUT_DIR:-./logs/openrouter_ice_images}"
USE_OFFICIAL_ICE_SAMPLE="${USE_OFFICIAL_ICE_SAMPLE:-1}"

mkdir -p "${OUTPUT_PATH}" "${IMAGE_OUTPUT_DIR}"

if [[ "${USE_OFFICIAL_ICE_SAMPLE}" == "1" ]]; then
    # Quoted heredoc delimiter: the shell performs no expansion inside the Python code.
    uv run python - <<'PY'
import json
import zipfile
from pathlib import Path

from huggingface_hub import hf_hub_download

zip_path = hf_hub_download(
    repo_id="ali-vilab/ICE-Bench",
    repo_type="dataset",
    filename="dataset.zip",
    token=False,
)

target_jsonl = Path("/tmp/ice_bench_smoke.jsonl")
target_dir = Path("/tmp/ice_bench_smoke_data")
target_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(zip_path) as zf:
    # Only the first record is needed for a single-sample smoke run.
    with zf.open("data/data.jsonl") as fh:
        first_line = next(fh, None)
    if first_line is None or not first_line.strip():
        raise SystemExit("ICE-Bench data/data.jsonl is empty; cannot prepare smoke sample")
    first = json.loads(first_line)

    src_rel = first["SourceImage"]
    instruction = first["Instruction"]
    item_id = first["ItemID"]

    # Bytes are copied verbatim, so keep the archive member's real extension
    # (falling back to .png) instead of mislabelling e.g. a JPEG as PNG.
    suffix = Path(src_rel).suffix or ".png"
    src_out = target_dir / f"{item_id}_src{suffix}"
    with zf.open(src_rel) as src_in:
        src_out.write_bytes(src_in.read())

record = {
    "item_id": item_id,
    "instruction": instruction,
    "source_image": str(src_out),
}
target_jsonl.write_text(json.dumps(record, ensure_ascii=False) + "\n", encoding="utf-8")
print(f"Prepared smoke data at {target_jsonl}")
print(f"Source image at {src_out}")
PY
fi

echo "[INFO] Running ICE smoke with model=${MODEL_VERSION} tasks=${TASKS}"

uv run python -m lmms_eval \
    --model openrouter_image_gen \
    --model_args "model_version=${MODEL_VERSION},output_dir=${IMAGE_OUTPUT_DIR},max_new_tokens=4096,image_size=1024x1024" \
    --tasks "${TASKS}" \
    --batch_size 1 \
    --limit "${LIMIT}" \
    --output_path "${OUTPUT_PATH}" \
    --log_samples \
    --verbosity INFO

# The model writes one sub-directory per task, and TASKS is configurable, so
# report the root directory rather than a hard-coded task name.
echo "[INFO] Done. Generated images under: ${IMAGE_OUTPUT_DIR}"
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#!/usr/bin/env bash
# Generic smoke run for the `openrouter_image_gen` model through lmms_eval.
#
# Environment overrides (all optional except the API key):
#   OPENROUTER_API_KEY  required; the openrouter_image_gen model reads this variable
#   OPENAI_API_KEY      defaults to OPENROUTER_API_KEY
#   OPENAI_API_BASE     defaults to the OpenRouter endpoint
#   MODEL_VERSION, TASKS, LIMIT, OUTPUT_PATH, IMAGE_OUTPUT_DIR

set -euo pipefail

# The model authenticates with OPENROUTER_API_KEY only, so require and export
# it explicitly. Previously a set OPENAI_API_KEY let this check pass while the
# model still failed later, and an unexported key never reached the child.
export OPENROUTER_API_KEY="${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set}"

# Kept for backward compatibility with OpenAI-compatible tooling in the same environment.
export OPENAI_API_KEY="${OPENAI_API_KEY:-${OPENROUTER_API_KEY}}"
export OPENAI_API_BASE="${OPENAI_API_BASE:-https://openrouter.ai/api/v1}"

MODEL_VERSION="${MODEL_VERSION:-google/gemini-2.5-flash-image}"
TASKS="${TASKS:-ice_bench}"
LIMIT="${LIMIT:-1}"
OUTPUT_PATH="${OUTPUT_PATH:-./logs/openrouter_image_smoke}"
IMAGE_OUTPUT_DIR="${IMAGE_OUTPUT_DIR:-./logs/openrouter_image_outputs}"

echo "[INFO] OpenRouter image smoke"
echo "[INFO] model=${MODEL_VERSION} tasks=${TASKS} limit=${LIMIT}"
echo "[INFO] output_path=${OUTPUT_PATH} image_output_dir=${IMAGE_OUTPUT_DIR}"

uv run python -m lmms_eval \
    --model openrouter_image_gen \
    --model_args "model_version=${MODEL_VERSION},output_dir=${IMAGE_OUTPUT_DIR},max_new_tokens=900,image_size=1024x1024" \
    --tasks "${TASKS}" \
    --batch_size 1 \
    --limit "${LIMIT}" \
    --output_path "${OUTPUT_PATH}" \
    --log_samples \
    --process_with_media \
    --verbosity INFO

echo "[INFO] Done. Generated images under: ${IMAGE_OUTPUT_DIR}"

lmms_eval/models/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
"ola": "Ola",
6767
"omnivinci": "OmniVinci",
6868
"openai": "OpenAICompatible",
69+
"openrouter_image_gen": "OpenRouterImageGen",
6970
"oryx": "Oryx",
7071
"phi3v": "Phi3v",
7172
"phi4_multimodal": "Phi4",
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
from __future__ import annotations
2+
3+
import base64
4+
import json
5+
import os
6+
import time
7+
from pathlib import Path
8+
from typing import Any, Optional
9+
10+
import requests as http_requests
11+
from PIL import Image
12+
13+
from lmms_eval.api.instance import Instance
14+
from lmms_eval.api.model import lmms
15+
from lmms_eval.api.registry import register_model
16+
17+
18+
@register_model("openrouter_image_gen")
class OpenRouterImageGen(lmms):
    """Image generation/editing model served through OpenRouter chat completions.

    Each request is sent as a single user message (text prompt plus any input
    images as PNG data URLs) with ``modalities=["text", "image"]``. Images
    returned as data URLs are written to ``<output_dir>/<task>/<doc_id>_<idx>.png``
    and every ``generate_until`` result is a JSON string of the form
    ``{"text": <str>, "images": [<saved path>, ...]}``.
    """

    is_simple = True

    # Sentinel meaning "no per-call temperature given"; ``None`` itself is a
    # valid value that means "omit temperature from the request".
    _UNSET: Any = object()

    def __init__(
        self,
        model_version: str = "openai/gpt-5-image-mini",
        output_dir: str = "./logs/openrouter_image_gen",
        max_new_tokens: int = 1024,
        temperature: Optional[float] = None,
        image_size: str = "1024x1024",
        max_retries: int = 3,
        timeout: int = 180,
        **_: Any,
    ) -> None:
        """Configure the client and create the output directory.

        Args:
            model_version: OpenRouter model identifier.
            output_dir: Root directory for generated images.
            max_new_tokens: Default ``max_tokens`` sent with each request.
            temperature: Default sampling temperature; ``None`` omits the field.
            image_size: Value sent as ``image.size`` in the request payload.
            max_retries: Attempts per request; values below 1 are treated as 1.
            timeout: Per-request timeout in seconds passed to ``requests``.
            **_: Extra model arguments are accepted and ignored.

        Raises:
            EnvironmentError: If ``OPENROUTER_API_KEY`` is not set.
        """
        super().__init__()
        self.model_version = model_version
        self.output_dir = output_dir
        self.max_new_tokens = int(max_new_tokens)
        self.temperature = None if temperature is None else float(temperature)
        self.image_size = image_size
        # At least one attempt, otherwise the retry loop would never issue a request.
        self.max_retries = max(1, int(max_retries))
        self.timeout = timeout

        self.api_key = os.getenv("OPENROUTER_API_KEY")
        if not self.api_key:
            raise EnvironmentError("OPENROUTER_API_KEY is required for openrouter_image_gen")

        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.session = http_requests.Session()
        self.session.headers.update(
            {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            }
        )

        Path(self.output_dir).mkdir(parents=True, exist_ok=True)

    def _encode_image(self, image: Image.Image) -> str:
        """Return ``image`` converted to RGB and encoded as base64 PNG text."""
        from io import BytesIO

        buf = BytesIO()
        image.convert("RGB").save(buf, format="PNG")
        return base64.b64encode(buf.getvalue()).decode("utf-8")

    def _decode_data_url(self, data_url: str) -> bytes:
        """Decode the base64 payload of a ``data:...;base64,...`` URL.

        Raises:
            ValueError: If the URL has no ``base64,`` marker.
        """
        marker = "base64,"
        idx = data_url.find(marker)
        if idx == -1:
            raise ValueError("Image data URL missing base64 payload")
        payload = data_url[idx + len(marker) :]
        return base64.b64decode(payload)

    def _extract_images(self, payload: dict[str, Any]) -> list[str]:
        """Collect ``data:image`` URLs from ``choices[0].message.images``.

        Malformed or missing structures yield an empty list instead of raising.
        """
        try:
            # ``or []`` covers an explicit ``"images": null`` in the response.
            images = payload["choices"][0]["message"].get("images") or []
        except (KeyError, IndexError, TypeError, AttributeError):
            return []

        out: list[str] = []
        for item in images:
            if not isinstance(item, dict):
                continue
            image_url = item.get("image_url", {})
            if not isinstance(image_url, dict):
                continue
            url = image_url.get("url")
            if isinstance(url, str) and url.startswith("data:image"):
                out.append(url)
        return out

    def _request_generation(
        self,
        prompt: str,
        visuals: list[Image.Image],
        max_new_tokens: Optional[int] = None,
        temperature: Any = _UNSET,
    ) -> dict[str, Any]:
        """POST one generation request and return the decoded JSON response.

        Args:
            prompt: Text part of the user message.
            visuals: Input images appended to the message as PNG data URLs.
            max_new_tokens: Per-call ``max_tokens``; defaults to the instance value.
            temperature: Per-call temperature; defaults to the instance value.
                ``None`` omits the field from the payload.

        Raises:
            RuntimeError: On an HTTP error that is not retryable or that
                persists through the last attempt (response body in the message).
            Exception: Any other error raised on the last attempt is re-raised.
        """
        if max_new_tokens is None:
            max_new_tokens = self.max_new_tokens
        if temperature is self._UNSET:
            temperature = self.temperature

        content: list[dict[str, Any]] = [{"type": "text", "text": prompt}]
        for img in visuals:
            b64 = self._encode_image(img)
            content.append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}})

        payload: dict[str, Any] = {
            "model": self.model_version,
            "messages": [{"role": "user", "content": content}],
            "modalities": ["text", "image"],
            "image": {"size": self.image_size},
            "max_tokens": max_new_tokens,
        }
        if temperature is not None:
            payload["temperature"] = temperature

        attempts = max(1, int(self.max_retries))
        for attempt in range(1, attempts + 1):
            is_last = attempt == attempts
            try:
                resp = self.session.post(self.base_url, json=payload, timeout=self.timeout)
                resp.raise_for_status()
                return resp.json()
            except http_requests.HTTPError as exc:
                status = None
                detail = ""
                if exc.response is not None:
                    status = exc.response.status_code
                    detail = exc.response.text
                # Client errors (bad request, auth, unknown model) will not fix
                # themselves; only timeouts, rate limits and server errors are retried.
                retryable = status is None or status in (408, 429) or status >= 500
                if is_last or not retryable:
                    raise RuntimeError(f"OpenRouter HTTPError: {detail}") from exc
            except Exception:
                # Connection problems, timeouts, undecodable bodies: best-effort retry.
                if is_last:
                    raise
            time.sleep(min(2 * attempt, 8))
        raise RuntimeError("Unreachable retry loop")

    def _save_images(self, image_data_urls: list[str], task: str, doc_id: int) -> list[str]:
        """Write each data URL to ``<output_dir>/<task>/<doc_id>_<idx>.png``.

        Returns:
            The saved file paths, in the same order as ``image_data_urls``.
        """
        task_dir = Path(self.output_dir) / str(task).replace("/", "_")
        task_dir.mkdir(parents=True, exist_ok=True)

        saved_paths: list[str] = []
        for idx, data_url in enumerate(image_data_urls):
            raw = self._decode_data_url(data_url)
            path = task_dir / f"{doc_id}_{idx}.png"
            path.write_bytes(raw)
            saved_paths.append(str(path))
        return saved_paths

    def generate_until(self, requests: list[Instance]) -> list[str]:
        """Generate one JSON result string per request.

        Each request's ``args`` must hold at least
        ``(ctx, gen_kwargs, doc_to_visual, doc_id, task, split)``; shorter
        tuples produce an empty placeholder result. ``max_new_tokens`` and
        ``temperature`` in ``gen_kwargs`` apply to that request only.
        """
        import logging

        logger = logging.getLogger(__name__)

        outputs: list[str] = []
        for req in requests:
            args = req.args
            if len(args) < 6:
                outputs.append(json.dumps({"text": "", "images": []}, ensure_ascii=False))
                continue
            ctx, gen_kwargs, doc_to_visual, doc_id, task, split = args[:6]
            prompt = str(ctx)
            local_gen_kwargs = dict(gen_kwargs or {})

            visuals_raw = doc_to_visual(self.task_dict[task][split][doc_id]) or []
            visuals: list[Image.Image] = [item for item in visuals_raw if isinstance(item, Image.Image)]

            # Resolve overrides into locals so one request's gen_kwargs never
            # leak into later requests through instance state.
            max_new_tokens = self.max_new_tokens
            if "max_new_tokens" in local_gen_kwargs:
                max_new_tokens = int(local_gen_kwargs["max_new_tokens"])
            temperature = self.temperature
            if "temperature" in local_gen_kwargs:
                value = local_gen_kwargs["temperature"]
                temperature = None if value is None else float(value)

            try:
                data = self._request_generation(
                    prompt=prompt,
                    visuals=visuals,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                )
            except Exception as exc:
                # A text-only retry is only meaningful if images were sent.
                if not visuals:
                    raise
                logger.warning(
                    "openrouter_image_gen: request with %d input image(s) failed (%s); retrying without images",
                    len(visuals),
                    exc,
                )
                data = self._request_generation(
                    prompt=prompt,
                    visuals=[],
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                )
            image_urls = self._extract_images(data)
            saved_images = self._save_images(image_urls, task=str(task), doc_id=int(doc_id))

            try:
                # Image-only replies may carry ``"content": null``.
                text = data["choices"][0]["message"].get("content", "") or ""
            except (KeyError, IndexError, TypeError, AttributeError):
                text = ""

            result = {"text": text, "images": saved_images}
            outputs.append(json.dumps(result, ensure_ascii=False))
            self.cache_hook.add_partial("generate_until", (ctx, local_gen_kwargs), outputs[-1])

        return outputs

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        """Not supported by this image-generation backend."""
        raise NotImplementedError("openrouter_image_gen does not support loglikelihood")

    def generate_until_multi_round(self, requests: list[Instance]) -> list[str]:
        """Not supported by this image-generation backend."""
        raise NotImplementedError("openrouter_image_gen does not support multi-round generation")
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# ICE-Bench
2+
3+
This task folder provides a lightweight ICE-Bench integration path for smoke validation.
4+
5+
- Task: `ice_bench`
6+
- Source: official ICE-Bench dataset payload format (`ali-vilab/ICE-Bench`)
7+
- Dataset file expected by YAML: `/tmp/ice_bench_smoke.jsonl`
8+
9+
`examples/models/openrouter_ice_smoke.sh` bootstraps one official sample into that file and then runs an end-to-end image generation/editing smoke test, saving the generated images locally.
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# ICE-Bench smoke task: loads a local JSONL file through the generic `json`
# dataset builder instead of a hosted dataset.
dataset_path: json
dataset_kwargs:
  data_files:
    # The single local file is exposed as the `train` split.
    train: /tmp/ice_bench_smoke.jsonl

task: "ice_bench"
# Must match the split name declared under data_files above.
test_split: train
output_type: generate_until

# Document adapters implemented in utils.py next to this file.
doc_to_visual: !function utils.ice_doc_to_visual
doc_to_text: !function utils.ice_doc_to_text
doc_to_target: !function utils.ice_doc_to_target

process_results: !function utils.ice_process_results

metric_list:
  # Key returned by utils.ice_process_results (0.0 or 1.0 per document).
  - metric: artifact_saved
    aggregation: mean
    higher_is_better: true

metadata:
  - version: 0.1
    description: "ICE-Bench single-sample smoke using official dataset payload"

lmms_eval/tasks/ice_bench/utils.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from __future__ import annotations
2+
3+
import json
4+
import os
5+
from typing import Any
6+
7+
from PIL import Image
8+
9+
10+
def ice_doc_to_visual(doc: dict[str, Any]) -> list[Image.Image]:
    """Load the document's source image as a one-element list of RGB images.

    Args:
        doc: Dataset record; ``source_image`` is read as a local file path.

    Returns:
        ``[image]`` when ``source_image`` is a non-empty string naming an
        existing regular file, otherwise an empty list.
    """
    src = doc.get("source_image", "")
    # isfile (not exists) so a directory path yields [] instead of crashing in Image.open.
    if not (isinstance(src, str) and src and os.path.isfile(src)):
        return []
    # Image.open is lazy and keeps the file open; convert() loads the pixels
    # into a new image, so the handle can be closed on leaving the block.
    with Image.open(src) as img:
        return [img.convert("RGB")]
15+
16+
17+
def ice_doc_to_text(doc: dict[str, Any], lmms_eval_specific_kwargs: dict[str, Any] | None = None) -> str:
    """Build the prompt text from the document's instruction.

    Args:
        doc: Dataset record; ``instruction`` is used as the prompt body.
        lmms_eval_specific_kwargs: Optional mapping whose ``pre_prompt`` and
            ``post_prompt`` values are placed around the instruction.

    Returns:
        The stripped instruction, wrapped in the pre/post prompts when given.
    """
    # ``or ""`` keeps a JSON null from being rendered as the literal "None".
    instruction = str(doc.get("instruction") or "").strip()
    if not lmms_eval_specific_kwargs:
        return instruction
    pre_prompt = str(lmms_eval_specific_kwargs.get("pre_prompt") or "")
    post_prompt = str(lmms_eval_specific_kwargs.get("post_prompt") or "")
    return f"{pre_prompt}{instruction}{post_prompt}"
24+
25+
26+
def ice_doc_to_target(doc: dict[str, Any]) -> str:
    """Return the document's instruction as the target string.

    A missing or null ``instruction`` yields ``""`` rather than the literal
    string "None".
    """
    return str(doc.get("instruction") or "")
28+
29+
30+
def ice_process_results(doc: dict[str, Any], results: list[str]) -> dict[str, float]:
    """Score whether the model saved at least one image artifact.

    Args:
        doc: Dataset record (unused).
        results: Model outputs; the first entry is parsed as a JSON object
            with an ``images`` list of file paths.

    Returns:
        ``{"artifact_saved": 1.0}`` when the first listed image path is an
        existing regular file, otherwise ``{"artifact_saved": 0.0}``.
    """
    missing = {"artifact_saved": 0.0}
    if not results:
        return missing

    try:
        parsed = json.loads(results[0])
    except (json.JSONDecodeError, TypeError):
        return missing
    if not isinstance(parsed, dict):
        return missing

    images = parsed.get("images", [])
    if not isinstance(images, list) or not images:
        return missing

    first = images[0]
    # isfile rather than exists: a directory is not a saved image artifact.
    if isinstance(first, str) and os.path.isfile(first):
        return {"artifact_saved": 1.0}
    return missing

0 commit comments

Comments
 (0)