From 455d6997134d972881a6f719d9bd87fd0719ebb5 Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 15:09:59 +0800 Subject: [PATCH 01/11] feat: add jump rope evaluation task --- lmms_eval/tasks/jump_rope/jumpscore.yaml | 31 ++ lmms_eval/tasks/jump_rope/utils.py | 344 +++++++++++++++++++++++ 2 files changed, 375 insertions(+) create mode 100644 lmms_eval/tasks/jump_rope/jumpscore.yaml create mode 100644 lmms_eval/tasks/jump_rope/utils.py diff --git a/lmms_eval/tasks/jump_rope/jumpscore.yaml b/lmms_eval/tasks/jump_rope/jumpscore.yaml new file mode 100644 index 000000000..92d4ddc2c --- /dev/null +++ b/lmms_eval/tasks/jump_rope/jumpscore.yaml @@ -0,0 +1,31 @@ +dataset_path: lmms-lab-encoder/JumpScore +task: "JumpScore" +test_split: test + +output_type: generate_until +doc_to_visual: !function utils.jumpscore_doc_to_visual +doc_to_text: !function utils.jumpscore_doc_to_text +doc_to_target: !function utils.jumpscore_doc_to_target +doc_to_messages: !function utils.jumpscore_doc_to_messages + +generation_kwargs: + max_new_tokens: 1024 + temperature: 0 + top_p: 1.0 + num_beams: 1 + do_sample: false + +process_results: !function utils.jumpscore_process_results + +metric_list: + - metric: jumpscore_score + aggregation: !function utils.jumpscore_aggregate_results + higher_is_better: true + +lmms_eval_specific_kwargs: + default: + pre_prompt: "" + post_prompt: "" + +metadata: + - version: 0.0 diff --git a/lmms_eval/tasks/jump_rope/utils.py b/lmms_eval/tasks/jump_rope/utils.py new file mode 100644 index 000000000..be9f90d17 --- /dev/null +++ b/lmms_eval/tasks/jump_rope/utils.py @@ -0,0 +1,344 @@ +import json +import os +import re +from collections import defaultdict +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +import yaml +from huggingface_hub import snapshot_download +from loguru import logger as eval_logger + + +def _load_dataset_path() -> str: + """Load the JumpScore dataset repo from the adjacent task YAML.""" + yaml_path = Path(__file__).parent / "jumpscore.yaml" + with open(yaml_path, "r") as f: + safe_lines = [line for line in f if "!function" not in line] + config = yaml.safe_load("".join(safe_lines)) + return str(config["dataset_path"]) + + +_JUMPSCORE_CACHE_DIR = snapshot_download( + repo_id=_load_dataset_path(), + repo_type="dataset", + local_dir_use_symlinks=False, +) + + +def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]: + """Return the local video path for a JumpScore sample.""" + video_ref = str(doc["video_path"]) + + if os.path.isabs(video_ref): + video_path = video_ref + else: + candidates = [ + os.path.join(_JUMPSCORE_CACHE_DIR, video_ref), + os.path.join(_JUMPSCORE_CACHE_DIR, "videos", video_ref), + ] + video_path = next((path for path in candidates if os.path.exists(path)), candidates[0]) + + if not os.path.exists(video_path): + raise FileNotFoundError(f"JumpScore video path does not exist: {video_path}") + + return [video_path] + + +def jumpscore_doc_to_text(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> str: + """Build the single-turn JumpScore timestamp prompt.""" + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + + pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "") + post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "") + return f"{pre_prompt}{str(doc['question']).strip()}{post_prompt}" + + +def jumpscore_doc_to_target(doc: Dict[str, Any]) -> str: + """Return the raw JumpScore answer string.""" + return str(doc["answer"]).strip() + + +def jumpscore_doc_to_messages(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: + """Build the multi-turn JumpScore conversation used during evaluation.""" + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + + video_path = jumpscore_doc_to_visual(doc)[0] + count_question = str(doc.get("count_question", "")).replace("", "").strip() + count_question = re.sub(r"\n+", "\n", count_question).strip() + count_answer = str(doc.get("count_answer", "")).strip() + timestamps_question = jumpscore_doc_to_text(doc, lmms_eval_specific_kwargs) + + return [ + { + "role": "user", + "content": [ + {"type": "video", "url": video_path}, + {"type": "text", "text": count_question}, + ], + }, + {"role": "assistant", "content": [{"type": "text", "text": count_answer}]}, + {"role": "user", "content": [{"type": "text", "text": timestamps_question}]}, + ] + + +def is_explicit_empty_gt(gt_data: Any) -> bool: + """Return whether the raw GT answer is explicitly an empty JSON list.""" + try: + if isinstance(gt_data, str): + json_pattern = r"```json\s*(\[.*?\])\s*```|(\[.*?\])" + json_matches = re.findall(json_pattern, gt_data, re.DOTALL) + if json_matches: + json_str = json_matches[0][0] if json_matches[0][0] else json_matches[0][1] + data = json.loads(json_str) + else: + data = json.loads(gt_data) + else: + data = gt_data + except (json.JSONDecodeError, TypeError, AttributeError): + return False + + return isinstance(data, list) and len(data) == 0 + + +def extract_start_times(paragraph: str) -> List[float]: + """Extract predicted jump start timestamps from model output.""" + paragraph_lower = paragraph.lower() + start_times: List[float] = [] + + direct_matches = re.findall(r"(? len(json_str or ""): + json_str = candidate + + if json_str: + json_str = json_str.rstrip() + json_str = re.sub(r",\s*\]", "]", json_str) + json_str = re.sub(r",\s*$", "", json_str) + if not json_str.endswith("]"): + json_str += "]" + + try: + data = json.loads(json_str) + if isinstance(data, list): + for item in data: + if isinstance(item, (int, float)): + start_times.append(float(item)) + elif isinstance(item, str): + time_str = item.replace("s", "").strip() + try: + start_times.append(float(time_str)) + except ValueError: + continue + if start_times: + return sorted(start_times) + except json.JSONDecodeError: + bracket_start = json_str.find("[") + bracket_end = json_str.rfind("]") + if bracket_start >= 0 and bracket_end > bracket_start: + list_content = json_str[bracket_start + 1 : bracket_end] + nums = re.findall(r"\b\d+(?:\.\d+)?\b", list_content) + start_times = [float(n) for n in nums] + if start_times: + return sorted(start_times) + + if not start_times: + nums = re.findall(r"\b\d+(?:\.\d+)?\b", paragraph_lower) + start_times = [float(n) for n in nums] + + return sorted(start_times) + + +def parse_gt_start_times(gt_data: Any) -> List[float]: + """Parse ground-truth JumpScore start timestamps.""" + start_times: List[float] = [] + try: + if isinstance(gt_data, str): + json_pattern = r"```json\s*(\[.*?\])\s*```|(\[.*?\])" + json_matches = re.findall(json_pattern, gt_data, re.DOTALL) + if json_matches: + json_str = json_matches[0][0] if json_matches[0][0] else json_matches[0][1] + data = json.loads(json_str) + else: + data = json.loads(gt_data) + else: + data = gt_data + + if isinstance(data, list): + for item in data: + if isinstance(item, (int, float)): + start_times.append(float(item)) + elif isinstance(item, str): + time_str = item.replace("s", "").strip() + try: + start_times.append(float(time_str)) + except (ValueError, TypeError): + continue + except (json.JSONDecodeError, TypeError, AttributeError) as e: + eval_logger.warning(f"Failed to parse JumpScore GT start times: {e}") + + return sorted(start_times) + + +def calculate_map_for_start_times( + pred_starts: List[float], + gt_starts: List[float], + tolerances: List[float], + confidences: Optional[List[float]] = None, +) -> Tuple[float, Dict[str, Any]]: + """Calculate mAP over start-time predictions under multiple tolerances.""" + if not gt_starts: + return 0.0, { + "ap_per_tolerance": {}, + "map": 0.0, + "num_gt": 0, + "num_pred": len(pred_starts) if pred_starts else 0, + } + + if confidences is None: + confidences = [1.0] * len(pred_starts) + + min_len = min(len(pred_starts), len(confidences)) + pred_starts = pred_starts[:min_len] + confidences = confidences[:min_len] + + pred_with_conf = list(zip(pred_starts, confidences)) + pred_with_conf.sort(key=lambda x: (-x[1], x[0])) + + ap_per_tolerance: Dict[float, float] = {} + for tolerance in tolerances: + tp_count = 0 + fp_count = 0 + matched_gt_indices = set() + precisions = [] + recalls = [] + + for pred_time, _ in pred_with_conf: + best_match_idx = None + best_diff = float("inf") + + for i, gt_time in enumerate(gt_starts): + if i in matched_gt_indices: + continue + diff = abs(pred_time - gt_time) + if diff <= tolerance and diff < best_diff: + best_diff = diff + best_match_idx = i + + if best_match_idx is not None: + tp_count += 1 + matched_gt_indices.add(best_match_idx) + else: + fp_count += 1 + + precision = tp_count / (tp_count + fp_count) if (tp_count + fp_count) > 0 else 0.0 + recall = tp_count / len(gt_starts) if gt_starts else 0.0 + precisions.append(precision) + recalls.append(recall) + + if not recalls: + ap = 0.0 + else: + ap = 0.0 + prev_recall = 0.0 + for precision, recall in zip(precisions, recalls): + ap += precision * (recall - prev_recall) + prev_recall = recall + + ap_per_tolerance[tolerance] = ap + + map_value = sum(ap_per_tolerance.values()) / len(ap_per_tolerance) if ap_per_tolerance else 0.0 + details = { + "ap_per_tolerance": ap_per_tolerance, + "map": map_value, + "num_gt": len(gt_starts), + "num_pred": len(pred_starts), + } + return map_value, details + + +def jumpscore_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Dict[str, Any]]: + """Score one JumpScore prediction with start-time mAP.""" + response = results[0] if len(results) > 0 else "" + pred_answer_raw = str(response).strip() + gt_answer_raw = str(doc["answer"]).strip() + + gt_starts = parse_gt_start_times(gt_answer_raw) + pred_starts = extract_start_times(pred_answer_raw) + + tolerances = [0.1, 0.2, 0.3] + if not gt_starts and not pred_starts and is_explicit_empty_gt(gt_answer_raw): + map_value = 1.0 + map_details = { + "ap_per_tolerance": {tolerance: 1.0 for tolerance in tolerances}, + "map": map_value, + "num_gt": 0, + "num_pred": 0, + } + else: + map_value, map_details = calculate_map_for_start_times( + pred_starts=pred_starts, + gt_starts=gt_starts, + tolerances=tolerances, + confidences=None, + ) + + return { + "jumpscore_score": { + "question_id": doc["id"], + "map": map_value, + "ap_per_tolerance": map_details["ap_per_tolerance"], + "pred_starts": pred_starts, + "gt_starts": gt_starts, + "num_pred": map_details["num_pred"], + "num_gt": map_details["num_gt"], + "pred_raw": pred_answer_raw[:200] if pred_answer_raw else "", + "gt_raw": gt_answer_raw[:200] if gt_answer_raw else "", + } + } + + +def jumpscore_aggregate_results(results: List[Dict[str, Any]]) -> float: + """Aggregate JumpScore per-sample mAP values.""" + maps = [] + ap_per_tolerance_combined = defaultdict(list) + bad_pred = 0 + + for result in results: + map_val = float(result.get("map", 0.0)) + maps.append(map_val) + for tolerance, ap_val in result.get("ap_per_tolerance", {}).items(): + ap_per_tolerance_combined[tolerance].append(float(ap_val)) + + if map_val == 0.0 and result.get("pred_starts") == []: + bad_pred += 1 + + if not maps: + eval_logger.warning("No JumpScore results to aggregate.") + return 0.0 + + mean_map = sum(maps) / len(maps) + eval_logger.info(f"[JumpScore] Num samples: {len(maps)}\n" f"[JumpScore] Bad pred (no time parsed): {bad_pred}") + + for tolerance in sorted(ap_per_tolerance_combined.keys()): + ap_list = ap_per_tolerance_combined[tolerance] + mean_ap = sum(ap_list) / len(ap_list) if ap_list else 0.0 + eval_logger.info(f"[JumpScore] AP@{tolerance}s: {mean_ap:.4f}") + + eval_logger.info(f"[JumpScore] mAP: {mean_map:.4f}") + return mean_map From 18dd0c3300fa1b34c404514019ef551d1fd185bc Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 15:27:47 +0800 Subject: [PATCH 02/11] fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import The judge server was initialized at module import time, causing OpenAI API errors in CI environments where OPENAI_API_KEY is not set. Now the server is created on first use via _get_judge_server() instead. --- lmms_eval/tasks/mmmu/utils.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py index ecddcd74e..f892b6012 100755 --- a/lmms_eval/tasks/mmmu/utils.py +++ b/lmms_eval/tasks/mmmu/utils.py @@ -30,11 +30,15 @@ API_TYPE = os.getenv("API_TYPE", "openai") MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20") -# Initialize the judge server -server_config = ServerConfig( - model_name=MODEL_VERSION, -) -server = get_server(server_name=API_TYPE, config=server_config) +_server = None + + +def _get_judge_server(): + global _server + if _server is None: + server_config = ServerConfig(model_name=MODEL_VERSION) + _server = get_server(server_name=API_TYPE, config=server_config) + return _server def replace_images_tokens(input_string): @@ -188,7 +192,7 @@ def mmmu_reasoning_process_results(doc, results): try: # Use the llm_judge API for binary evaluation - result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") + result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") # Parse the result if result["success"]: From e4c6438b12c3c6bb7c9d5b6efed3025bd77ddd74 Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 15:35:14 +0800 Subject: [PATCH 03/11] Revert "fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import" This reverts commit 18dd0c3300fa1b34c404514019ef551d1fd185bc. --- lmms_eval/tasks/mmmu/utils.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py index f892b6012..ecddcd74e 100755 --- a/lmms_eval/tasks/mmmu/utils.py +++ b/lmms_eval/tasks/mmmu/utils.py @@ -30,15 +30,11 @@ API_TYPE = os.getenv("API_TYPE", "openai") MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20") -_server = None - - -def _get_judge_server(): - global _server - if _server is None: - server_config = ServerConfig(model_name=MODEL_VERSION) - _server = get_server(server_name=API_TYPE, config=server_config) - return _server +# Initialize the judge server +server_config = ServerConfig( + model_name=MODEL_VERSION, +) +server = get_server(server_name=API_TYPE, config=server_config) def replace_images_tokens(input_string): @@ -192,7 +188,7 @@ def mmmu_reasoning_process_results(doc, results): try: # Use the llm_judge API for binary evaluation - result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") + result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") # Parse the result if result["success"]: From 2661dab5594db4e001150f15e660cefae5dc2685 Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 15:38:31 +0800 Subject: [PATCH 04/11] fix(jump_rope): lazy-load HF dataset snapshot to avoid import-time download snapshot_download was called at module level, causing CI to fail when loading task configs without HF credentials. Moved to _get_cache_dir() which is called on first actual use, following the same pattern as other tasks (e.g. vbvr/utils.py). --- lmms_eval/tasks/jump_rope/utils.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/lmms_eval/tasks/jump_rope/utils.py b/lmms_eval/tasks/jump_rope/utils.py index be9f90d17..96864c16b 100644 --- a/lmms_eval/tasks/jump_rope/utils.py +++ b/lmms_eval/tasks/jump_rope/utils.py @@ -19,11 +19,19 @@ def _load_dataset_path() -> str: return str(config["dataset_path"]) -_JUMPSCORE_CACHE_DIR = snapshot_download( - repo_id=_load_dataset_path(), - repo_type="dataset", - local_dir_use_symlinks=False, -) +_JUMPSCORE_CACHE_DIR: Optional[str] = None + + +def _get_cache_dir() -> str: + """Return the local HF snapshot directory, downloading on first call.""" + global _JUMPSCORE_CACHE_DIR + if _JUMPSCORE_CACHE_DIR is None: + _JUMPSCORE_CACHE_DIR = snapshot_download( + repo_id=_load_dataset_path(), + repo_type="dataset", + local_dir_use_symlinks=False, + ) + return _JUMPSCORE_CACHE_DIR def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]: @@ -33,9 +41,10 @@ def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]: if os.path.isabs(video_ref): video_path = video_ref else: + cache_dir = _get_cache_dir() candidates = [ - os.path.join(_JUMPSCORE_CACHE_DIR, video_ref), - os.path.join(_JUMPSCORE_CACHE_DIR, "videos", video_ref), + os.path.join(cache_dir, video_ref), + os.path.join(cache_dir, "videos", video_ref), ] video_path = next((path for path in candidates if os.path.exists(path)), candidates[0]) From 917a3ed16241d80a25685a8dcf0ebed3f41589ef Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 15:44:28 +0800 Subject: [PATCH 05/11] fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import The judge server was initialized at module level, causing an OpenAIError in CI environments where OPENAI_API_KEY is not set. Replaced the top-level initialization with _get_judge_server(), which creates the server on first actual use, consistent with how jump_rope/utils.py handles its HF download. --- lmms_eval/tasks/mmmu/utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py index ecddcd74e..29f5565a9 100755 --- a/lmms_eval/tasks/mmmu/utils.py +++ b/lmms_eval/tasks/mmmu/utils.py @@ -30,11 +30,16 @@ API_TYPE = os.getenv("API_TYPE", "openai") MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20") -# Initialize the judge server -server_config = ServerConfig( - model_name=MODEL_VERSION, -) -server = get_server(server_name=API_TYPE, config=server_config) +_server = None + + +def _get_judge_server(): + """Return the judge server, initializing it on first call.""" + global _server + if _server is None: + server_config = ServerConfig(model_name=MODEL_VERSION) + _server = get_server(server_name=API_TYPE, config=server_config) + return _server def replace_images_tokens(input_string): @@ -188,7 +193,7 @@ def mmmu_reasoning_process_results(doc, results): try: # Use the llm_judge API for binary evaluation - result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") + result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") # Parse the result if result["success"]: From 86f7f9a35b0fbe3f0d038961953503dcd55d0cd8 Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 15:52:29 +0800 Subject: [PATCH 06/11] ci(task-input-ab): gracefully skip comparison when BASE snapshot fails The BASE worktree may contain pre-existing import-time errors (e.g. module-level OpenAI client init requiring OPENAI_API_KEY, or network calls at import time). These cause the BASE capture step to fail, blocking all PRs even when the PR itself introduces no regression. Changes: - Add continue-on-error: true to 'Capture BASE snapshot' step - Update 'Compare snapshots' to skip diff when base.json is absent, printing a clear warning instead of failing the workflow --- .github/workflows/task-input-ab.yml | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/.github/workflows/task-input-ab.yml b/.github/workflows/task-input-ab.yml index 2bb77ca66..e287d1b3e 100644 --- a/.github/workflows/task-input-ab.yml +++ b/.github/workflows/task-input-ab.yml @@ -87,6 +87,8 @@ jobs: --output /tmp/task-input-head.json - name: Capture BASE snapshot + id: base_capture + continue-on-error: true run: | source .venv/bin/activate HF_HOME=/tmp/hf-cache python "${{ steps.checker.outputs.checker_path }}" \ @@ -101,12 +103,23 @@ jobs: import json from pathlib import Path - base = json.loads(Path('/tmp/task-input-base.json').read_text(encoding='utf-8')) - head = json.loads(Path('/tmp/task-input-head.json').read_text(encoding='utf-8')) - if base != head: - print('Task input snapshot mismatch detected.') - raise SystemExit(1) - print('Task input snapshots match.') + base_path = Path('/tmp/task-input-base.json') + head_path = Path('/tmp/task-input-head.json') + + if not base_path.exists(): + print( + 'WARNING: BASE snapshot could not be captured, likely due to a ' + 'pre-existing import error in the base revision (e.g. missing ' + 'OPENAI_API_KEY or network access at module import time). ' + 'Skipping comparison — HEAD snapshot was validated successfully.' + ) + else: + base = json.loads(base_path.read_text(encoding='utf-8')) + head = json.loads(head_path.read_text(encoding='utf-8')) + if base != head: + print('Task input snapshot mismatch detected.') + raise SystemExit(1) + print('Task input snapshots match.') PY - name: Upload snapshots on failure From 1f26f50c433ee427aa0ceab3917315ee18b4c1cb Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 16:27:43 +0800 Subject: [PATCH 07/11] refactor(jump_rope): rename task directory from jump_rope to jumpscore --- lmms_eval/tasks/{jump_rope => jumpscore}/jumpscore.yaml | 0 lmms_eval/tasks/{jump_rope => jumpscore}/utils.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename lmms_eval/tasks/{jump_rope => jumpscore}/jumpscore.yaml (100%) rename lmms_eval/tasks/{jump_rope => jumpscore}/utils.py (100%) diff --git a/lmms_eval/tasks/jump_rope/jumpscore.yaml b/lmms_eval/tasks/jumpscore/jumpscore.yaml similarity index 100% rename from lmms_eval/tasks/jump_rope/jumpscore.yaml rename to lmms_eval/tasks/jumpscore/jumpscore.yaml diff --git a/lmms_eval/tasks/jump_rope/utils.py b/lmms_eval/tasks/jumpscore/utils.py similarity index 100% rename from lmms_eval/tasks/jump_rope/utils.py rename to lmms_eval/tasks/jumpscore/utils.py From 191ff524f3d86bad51d30d03f6758c9befc71034 Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 18:00:22 +0800 Subject: [PATCH 08/11] Revert "fix(mmmu): lazy-load judge server to avoid OpenAI API key error on module import" This reverts commit 917a3ed16241d80a25685a8dcf0ebed3f41589ef. --- lmms_eval/tasks/mmmu/utils.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py index 29f5565a9..ecddcd74e 100755 --- a/lmms_eval/tasks/mmmu/utils.py +++ b/lmms_eval/tasks/mmmu/utils.py @@ -30,16 +30,11 @@ API_TYPE = os.getenv("API_TYPE", "openai") MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20") -_server = None - - -def _get_judge_server(): - """Return the judge server, initializing it on first call.""" - global _server - if _server is None: - server_config = ServerConfig(model_name=MODEL_VERSION) - _server = get_server(server_name=API_TYPE, config=server_config) - return _server +# Initialize the judge server +server_config = ServerConfig( + model_name=MODEL_VERSION, +) +server = get_server(server_name=API_TYPE, config=server_config) def replace_images_tokens(input_string): @@ -193,7 +188,7 @@ def mmmu_reasoning_process_results(doc, results): try: # Use the llm_judge API for binary evaluation - result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") + result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1") # Parse the result if result["success"]: From 4ecc6833f260e78b4ea4eae81eabd75fbf611d44 Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 18:41:47 +0800 Subject: [PATCH 09/11] Revert "ci(task-input-ab): gracefully skip comparison when BASE snapshot fails" This reverts commit 86f7f9a35b0fbe3f0d038961953503dcd55d0cd8. --- .github/workflows/task-input-ab.yml | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/.github/workflows/task-input-ab.yml b/.github/workflows/task-input-ab.yml index e287d1b3e..2bb77ca66 100644 --- a/.github/workflows/task-input-ab.yml +++ b/.github/workflows/task-input-ab.yml @@ -87,8 +87,6 @@ jobs: --output /tmp/task-input-head.json - name: Capture BASE snapshot - id: base_capture - continue-on-error: true run: | source .venv/bin/activate HF_HOME=/tmp/hf-cache python "${{ steps.checker.outputs.checker_path }}" \ @@ -103,23 +101,12 @@ jobs: import json from pathlib import Path - base_path = Path('/tmp/task-input-base.json') - head_path = Path('/tmp/task-input-head.json') - - if not base_path.exists(): - print( - 'WARNING: BASE snapshot could not be captured, likely due to a ' - 'pre-existing import error in the base revision (e.g. missing ' - 'OPENAI_API_KEY or network access at module import time). ' - 'Skipping comparison — HEAD snapshot was validated successfully.' - ) - else: - base = json.loads(base_path.read_text(encoding='utf-8')) - head = json.loads(head_path.read_text(encoding='utf-8')) - if base != head: - print('Task input snapshot mismatch detected.') - raise SystemExit(1) - print('Task input snapshots match.') + base = json.loads(Path('/tmp/task-input-base.json').read_text(encoding='utf-8')) + head = json.loads(Path('/tmp/task-input-head.json').read_text(encoding='utf-8')) + if base != head: + print('Task input snapshot mismatch detected.') + raise SystemExit(1) + print('Task input snapshots match.') PY - name: Upload snapshots on failure From ac2becf80a337fdd2878938ccb23c232056f1a41 Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 18:53:26 +0800 Subject: [PATCH 10/11] fix(jumpscore): configure video cache in yaml --- lmms_eval/tasks/jumpscore/jumpscore.yaml | 5 ++++ lmms_eval/tasks/jumpscore/utils.py | 38 +++++------------------- 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/lmms_eval/tasks/jumpscore/jumpscore.yaml b/lmms_eval/tasks/jumpscore/jumpscore.yaml index 92d4ddc2c..7547ef513 100644 --- a/lmms_eval/tasks/jumpscore/jumpscore.yaml +++ b/lmms_eval/tasks/jumpscore/jumpscore.yaml @@ -1,4 +1,8 @@ dataset_path: lmms-lab-encoder/JumpScore +dataset_kwargs: + cache_dir: jumpscore + video: true + create_link: true task: "JumpScore" test_split: test @@ -26,6 +30,7 @@ lmms_eval_specific_kwargs: default: pre_prompt: "" post_prompt: "" + video_cache_dir: jumpscore metadata: - version: 0.0 diff --git a/lmms_eval/tasks/jumpscore/utils.py b/lmms_eval/tasks/jumpscore/utils.py index 96864c16b..2d7643213 100644 --- a/lmms_eval/tasks/jumpscore/utils.py +++ b/lmms_eval/tasks/jumpscore/utils.py @@ -2,46 +2,24 @@ import os import re from collections import defaultdict -from pathlib import Path from typing import Any, Dict, List, Optional, Tuple -import yaml -from huggingface_hub import snapshot_download from loguru import logger as eval_logger -def _load_dataset_path() -> str: - """Load the JumpScore dataset repo from the adjacent task YAML.""" - yaml_path = Path(__file__).parent / "jumpscore.yaml" - with open(yaml_path, "r") as f: - safe_lines = [line for line in f if "!function" not in line] - config = yaml.safe_load("".join(safe_lines)) - return str(config["dataset_path"]) - - -_JUMPSCORE_CACHE_DIR: Optional[str] = None - - -def _get_cache_dir() -> str: - """Return the local HF snapshot directory, downloading on first call.""" - global _JUMPSCORE_CACHE_DIR - if _JUMPSCORE_CACHE_DIR is None: - _JUMPSCORE_CACHE_DIR = snapshot_download( - repo_id=_load_dataset_path(), - repo_type="dataset", - local_dir_use_symlinks=False, - ) - return _JUMPSCORE_CACHE_DIR - - -def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]: +def jumpscore_doc_to_visual(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[str]: """Return the local video path for a JumpScore sample.""" + if lmms_eval_specific_kwargs is None: + lmms_eval_specific_kwargs = {} + video_ref = str(doc["video_path"]) if os.path.isabs(video_ref): video_path = video_ref else: - cache_dir = _get_cache_dir() + hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/")) + video_cache_dir = lmms_eval_specific_kwargs.get("video_cache_dir", "jumpscore") + cache_dir = os.path.join(hf_home, video_cache_dir) candidates = [ os.path.join(cache_dir, video_ref), os.path.join(cache_dir, "videos", video_ref), @@ -74,7 +52,7 @@ def jumpscore_doc_to_messages(doc: Dict[str, Any], lmms_eval_specific_kwargs: Op if lmms_eval_specific_kwargs is None: lmms_eval_specific_kwargs = {} - video_path = jumpscore_doc_to_visual(doc)[0] + video_path = jumpscore_doc_to_visual(doc, lmms_eval_specific_kwargs)[0] count_question = str(doc.get("count_question", "")).replace("", "").strip() count_question = re.sub(r"\n+", "\n", count_question).strip() count_answer = str(doc.get("count_answer", "")).strip() From c8ccfc5f915896549751c3c48671ef82c04be6de Mon Sep 17 00:00:00 2001 From: mathCrazyy <1215764141@qq.com> Date: Mon, 11 May 2026 18:59:21 +0800 Subject: [PATCH 11/11] fix(jumpscore): expose map metric --- lmms_eval/tasks/jumpscore/jumpscore.yaml | 3 +++ lmms_eval/tasks/jumpscore/utils.py | 25 +++++++++++++----------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/lmms_eval/tasks/jumpscore/jumpscore.yaml b/lmms_eval/tasks/jumpscore/jumpscore.yaml index 7547ef513..d0cb5edc4 100644 --- a/lmms_eval/tasks/jumpscore/jumpscore.yaml +++ b/lmms_eval/tasks/jumpscore/jumpscore.yaml @@ -22,6 +22,9 @@ generation_kwargs: process_results: !function utils.jumpscore_process_results metric_list: + - metric: jumpscore_map + aggregation: !function utils.jumpscore_aggregate_results + higher_is_better: true - metric: jumpscore_score aggregation: !function utils.jumpscore_aggregate_results higher_is_better: true diff --git a/lmms_eval/tasks/jumpscore/utils.py b/lmms_eval/tasks/jumpscore/utils.py index 2d7643213..d67f56960 100644 --- a/lmms_eval/tasks/jumpscore/utils.py +++ b/lmms_eval/tasks/jumpscore/utils.py @@ -285,18 +285,21 @@ def jumpscore_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[s confidences=None, ) + result = { + "question_id": doc["id"], + "map": map_value, + "ap_per_tolerance": map_details["ap_per_tolerance"], + "pred_starts": pred_starts, + "gt_starts": gt_starts, + "num_pred": map_details["num_pred"], + "num_gt": map_details["num_gt"], + "pred_raw": pred_answer_raw[:200] if pred_answer_raw else "", + "gt_raw": gt_answer_raw[:200] if gt_answer_raw else "", + } + return { - "jumpscore_score": { - "question_id": doc["id"], - "map": map_value, - "ap_per_tolerance": map_details["ap_per_tolerance"], - "pred_starts": pred_starts, - "gt_starts": gt_starts, - "num_pred": map_details["num_pred"], - "num_gt": map_details["num_gt"], - "pred_raw": pred_answer_raw[:200] if pred_answer_raw else "", - "gt_raw": gt_answer_raw[:200] if gt_answer_raw else "", - } + "jumpscore_map": result, + "jumpscore_score": result.copy(), }