From 455d6997134d972881a6f719d9bd87fd0719ebb5 Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 15:09:59 +0800
Subject: [PATCH 01/11] feat: add jump rope evaluation task

---
 lmms_eval/tasks/jump_rope/jumpscore.yaml |  31 ++
 lmms_eval/tasks/jump_rope/utils.py       | 344 +++++++++++++++++++++++
 2 files changed, 375 insertions(+)
 create mode 100644 lmms_eval/tasks/jump_rope/jumpscore.yaml
 create mode 100644 lmms_eval/tasks/jump_rope/utils.py

diff --git a/lmms_eval/tasks/jump_rope/jumpscore.yaml b/lmms_eval/tasks/jump_rope/jumpscore.yaml
new file mode 100644
index 000000000..92d4ddc2c
--- /dev/null
+++ b/lmms_eval/tasks/jump_rope/jumpscore.yaml
@@ -0,0 +1,31 @@
+dataset_path: lmms-lab-encoder/JumpScore
+task: "JumpScore"
+test_split: test
+
+output_type: generate_until
+doc_to_visual: !function utils.jumpscore_doc_to_visual
+doc_to_text: !function utils.jumpscore_doc_to_text
+doc_to_target: !function utils.jumpscore_doc_to_target
+doc_to_messages: !function utils.jumpscore_doc_to_messages
+
+generation_kwargs:
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+
+process_results: !function utils.jumpscore_process_results
+
+metric_list:
+  - metric: jumpscore_score
+    aggregation: !function utils.jumpscore_aggregate_results
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: ""
+
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/jump_rope/utils.py b/lmms_eval/tasks/jump_rope/utils.py
new file mode 100644
index 000000000..be9f90d17
--- /dev/null
+++ b/lmms_eval/tasks/jump_rope/utils.py
@@ -0,0 +1,344 @@
+import json
+import os
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import yaml
+from huggingface_hub import snapshot_download
+from loguru import logger as eval_logger
+
+
+def _load_dataset_path() -> str:
+    """Load the JumpScore dataset repo from the adjacent task YAML."""
+    yaml_path = Path(__file__).parent / "jumpscore.yaml"
+    with open(yaml_path, "r") as f:
+        safe_lines = [line for line in f if "!function" not in line]
+    config = yaml.safe_load("".join(safe_lines))
+    return str(config["dataset_path"])
+
+
+_JUMPSCORE_CACHE_DIR = snapshot_download(
+    repo_id=_load_dataset_path(),
+    repo_type="dataset",
+    local_dir_use_symlinks=False,
+)
+
+
+def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]:
+    """Return the local video path for a JumpScore sample."""
+    video_ref = str(doc["video_path"])
+
+    if os.path.isabs(video_ref):
+        video_path = video_ref
+    else:
+        candidates = [
+            os.path.join(_JUMPSCORE_CACHE_DIR, video_ref),
+            os.path.join(_JUMPSCORE_CACHE_DIR, "videos", video_ref),
+        ]
+        video_path = next((path for path in candidates if os.path.exists(path)), candidates[0])
+
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"JumpScore video path does not exist: {video_path}")
+
+    return [video_path]
+
+
+def jumpscore_doc_to_text(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> str:
+    """Return the stripped ``question`` field wrapped in the optional pre/post prompts."""
+    prompt_kwargs = lmms_eval_specific_kwargs if lmms_eval_specific_kwargs is not None else {}
+    question_text = str(doc["question"]).strip()
+
+    # Missing prompt keys fall back to empty strings, leaving the question unchanged.
+    prefix, suffix = (prompt_kwargs.get(key, "") for key in ("pre_prompt", "post_prompt"))
+    return f"{prefix}{question_text}{suffix}"
+
+
+def jumpscore_doc_to_target(doc: Dict[str, Any]) -> str:
+    """Return the sample's ``answer`` field as a whitespace-stripped string."""
+    return str.strip(str(doc["answer"]))
+
+
+def jumpscore_doc_to_messages(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
+    """Build a three-turn chat: video plus count question, the stored count answer as assistant, then the timestamp question."""
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
+
+    video_path = jumpscore_doc_to_visual(doc)[0]
+    count_question = str(doc.get("count_question", "")).replace("<image>", "").strip()
+    count_question = re.sub(r"\n+", "\n", count_question).strip()
+    count_answer = str(doc.get("count_answer", "")).strip()
+    timestamps_question = jumpscore_doc_to_text(doc, lmms_eval_specific_kwargs)
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "video", "url": video_path},
+                {"type": "text", "text": count_question},
+            ],
+        },
+        {"role": "assistant", "content": [{"type": "text", "text": count_answer}]},
+        {"role": "user", "content": [{"type": "text", "text": timestamps_question}]},
+    ]
+
+
+def is_explicit_empty_gt(gt_data: Any) -> bool:
+    """Tell whether the ground truth decodes to a literal empty list (``[]``)."""
+    if not isinstance(gt_data, str):
+        # Already-decoded values are judged as they are.
+        return isinstance(gt_data, list) and not gt_data
+
+    # First bracketed span wins: a fenced ```json block or a bare [...] run.
+    first_list = re.search(r"```json\s*(\[.*?\])\s*```|(\[.*?\])", gt_data, re.DOTALL)
+    payload = gt_data if first_list is None else (first_list.group(1) or first_list.group(2))
+
+    try:
+        decoded = json.loads(payload)
+    except (json.JSONDecodeError, TypeError, AttributeError):
+        # Undecodable text is not an explicit empty answer.
+        return False
+
+    return isinstance(decoded, list) and not decoded
+
+
+def extract_start_times(paragraph: str) -> List[float]:
+    """Extract predicted jump start timestamps from model output.
+
+    Strategies run in order and the first one that yields a result wins:
+    numbers suffixed with ``s`` (``1.2s``), then the longest bracketed list
+    (strict JSON, else a lenient number scan), then every bare number.
+    An explicit empty list (``[]``) is honored as "no jumps" rather than
+    falling through to the bare-number scan of the surrounding prose.
+
+    Returns the timestamps as floats in ascending order (possibly empty).
+    """
+    paragraph_lower = paragraph.lower()
+    start_times: List[float] = []
+
+    direct_matches = re.findall(r"(?<!\d)(\d+(?:\.\d+)?)\s*s\b", paragraph_lower)
+    if direct_matches:
+        # The pattern only captures digits[.digits], so float() cannot fail.
+        return sorted(float(ts_str) for ts_str in direct_matches)
+
+    json_pattern = r"```json\s*(\[.*?\])\s*```|(\[.*?\])"
+    json_matches = re.findall(json_pattern, paragraph_lower, re.DOTALL)
+    if json_matches:
+        json_str = None
+        for match in json_matches:
+            candidate = match[0] if match[0] else match[1]
+            if candidate and len(candidate) > len(json_str or ""):
+                json_str = candidate
+
+        if json_str:
+            json_str = json_str.rstrip()
+            json_str = re.sub(r",\s*\]", "]", json_str)
+            json_str = re.sub(r",\s*$", "", json_str)
+            if not json_str.endswith("]"):
+                json_str += "]"
+
+            try:
+                data = json.loads(json_str)
+                if isinstance(data, list):
+                    for item in data:
+                        if isinstance(item, (int, float)):
+                            start_times.append(float(item))
+                        elif isinstance(item, str):
+                            time_str = item.replace("s", "").strip()
+                            try:
+                                start_times.append(float(time_str))
+                            except ValueError:
+                                continue
+                    # An empty decoded list is an explicit "no jumps" answer; keep it.
+                    if start_times or not data:
+                        return sorted(start_times)
+            except json.JSONDecodeError:
+                bracket_start, bracket_end = json_str.find("["), json_str.rfind("]")
+                if bracket_start >= 0 and bracket_end > bracket_start:
+                    list_content = json_str[bracket_start + 1 : bracket_end]
+                    start_times = [float(n) for n in re.findall(r"\b\d+(?:\.\d+)?\b", list_content)]
+                    if start_times:
+                        return sorted(start_times)
+
+    return sorted(float(n) for n in re.findall(r"\b\d+(?:\.\d+)?\b", paragraph_lower))
+
+
+def parse_gt_start_times(gt_data: Any) -> List[float]:
+    """Decode the ground-truth answer into an ascending list of start times."""
+    data = gt_data
+    if isinstance(gt_data, str):
+        # Prefer the first bracketed span (fenced ```json or bare); else parse the whole string.
+        found = re.search(r"```json\s*(\[.*?\])\s*```|(\[.*?\])", gt_data, re.DOTALL)
+        raw_json = gt_data if found is None else (found.group(1) or found.group(2))
+        try:
+            data = json.loads(raw_json)
+        except (json.JSONDecodeError, TypeError, AttributeError) as e:
+            eval_logger.warning(f"Failed to parse JumpScore GT start times: {e}")
+            return []
+
+    if not isinstance(data, list):
+        # Scalars, dicts, and None carry no timestamps.
+        return []
+
+    parsed: List[float] = []
+    for entry in data:
+        if isinstance(entry, (int, float)):
+            parsed.append(float(entry))
+        elif isinstance(entry, str):
+            # Entries such as "1.2s" lose every "s" before conversion.
+            try:
+                parsed.append(float(entry.replace("s", "").strip()))
+            except (ValueError, TypeError):
+                continue
+
+    return sorted(parsed)
+
+
+def calculate_map_for_start_times(
+    pred_starts: List[float],
+    gt_starts: List[float],
+    tolerances: List[float],
+    confidences: Optional[List[float]] = None,
+) -> Tuple[float, Dict[str, Any]]:
+    """Calculate mAP over start-time predictions under multiple tolerances.
+
+    Predictions are ranked by descending confidence (ties: earlier time first)
+    and each is greedily matched to the nearest still-unmatched ground-truth
+    start within ``tolerance`` seconds (inclusive). AP sums precision times
+    the recall gained at each rank; mAP is the mean AP over ``tolerances``.
+
+    Args:
+        pred_starts: Predicted start times in seconds.
+        gt_starts: Ground-truth start times in seconds.
+        tolerances: Maximum absolute differences that still count as a match.
+        confidences: Optional score per prediction; all 1.0 when omitted.
+
+    Returns:
+        ``(map_value, details)``; ``details`` holds ``ap_per_tolerance``, ``map``,
+        ``num_gt`` and ``num_pred``. Empty ground truth scores 0.0.
+    """
+    if not gt_starts:
+        return 0.0, {
+            "ap_per_tolerance": {},
+            "map": 0.0,
+            "num_gt": 0,
+            "num_pred": len(pred_starts) if pred_starts else 0,
+        }
+
+    if confidences is None:
+        confidences = [1.0] * len(pred_starts)
+
+    # zip() drops unpaired trailing items, so mismatched lengths are truncated.
+    ranked = sorted(zip(pred_starts, confidences), key=lambda x: (-x[1], x[0]))
+
+    ap_per_tolerance: Dict[float, float] = {}
+    for tolerance in tolerances:
+        # Absorb float rounding so boundary hits match: abs(1.3 - 1.2) > 0.1 in binary.
+        limit = tolerance + 1e-9
+        tp_count = 0
+        matched_gt_indices = set()
+        ap = prev_recall = 0.0
+
+        for rank, (pred_time, _) in enumerate(ranked, start=1):
+            best_match_idx = None
+            best_diff = float("inf")
+            for i, gt_time in enumerate(gt_starts):
+                if i in matched_gt_indices:
+                    continue
+                diff = abs(pred_time - gt_time)
+                if diff <= limit and diff < best_diff:
+                    best_diff = diff
+                    best_match_idx = i
+
+            if best_match_idx is not None:
+                tp_count += 1
+                matched_gt_indices.add(best_match_idx)
+
+            # rank == TP + FP so far, so tp_count / rank is precision at this rank.
+            recall = tp_count / len(gt_starts)
+            ap += (tp_count / rank) * (recall - prev_recall)
+            prev_recall = recall
+
+        ap_per_tolerance[tolerance] = ap
+
+    map_value = sum(ap_per_tolerance.values()) / len(ap_per_tolerance) if ap_per_tolerance else 0.0
+    details = {
+        "ap_per_tolerance": ap_per_tolerance,
+        "map": map_value,
+        "num_gt": len(gt_starts),
+        "num_pred": len(ranked),
+    }
+    return map_value, details
+
+
+def jumpscore_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[str, Dict[str, Any]]:
+    """Score one prediction with start-time mAP at 0.1/0.2/0.3 tolerances; an empty prediction for an explicitly empty GT scores 1.0."""
+    response = results[0] if len(results) > 0 else ""
+    pred_answer_raw = str(response).strip()
+    gt_answer_raw = str(doc["answer"]).strip()
+
+    gt_starts = parse_gt_start_times(gt_answer_raw)
+    pred_starts = extract_start_times(pred_answer_raw)
+
+    tolerances = [0.1, 0.2, 0.3]
+    if not gt_starts and not pred_starts and is_explicit_empty_gt(gt_answer_raw):
+        map_value = 1.0
+        map_details = {
+            "ap_per_tolerance": {tolerance: 1.0 for tolerance in tolerances},
+            "map": map_value,
+            "num_gt": 0,
+            "num_pred": 0,
+        }
+    else:
+        map_value, map_details = calculate_map_for_start_times(
+            pred_starts=pred_starts,
+            gt_starts=gt_starts,
+            tolerances=tolerances,
+            confidences=None,
+        )
+
+    return {
+        "jumpscore_score": {
+            "question_id": doc["id"],
+            "map": map_value,
+            "ap_per_tolerance": map_details["ap_per_tolerance"],
+            "pred_starts": pred_starts,
+            "gt_starts": gt_starts,
+            "num_pred": map_details["num_pred"],
+            "num_gt": map_details["num_gt"],
+            "pred_raw": pred_answer_raw[:200] if pred_answer_raw else "",
+            "gt_raw": gt_answer_raw[:200] if gt_answer_raw else "",
+        }
+    }
+
+
+def jumpscore_aggregate_results(results: List[Dict[str, Any]]) -> float:
+    """Return the mean per-sample mAP, logging per-tolerance AP and the unparsed-output count."""
+    if not results:
+        eval_logger.warning("No JumpScore results to aggregate.")
+        return 0.0
+
+    maps = [float(entry.get("map", 0.0)) for entry in results]
+    # "Bad" samples scored zero and had no timestamp parsed from the model output.
+    bad_pred = sum(
+        1 for entry, score in zip(results, maps) if score == 0.0 and entry.get("pred_starts") == []
+    )
+
+    # Pool each tolerance's AP values across samples.
+    pooled_ap = defaultdict(list)
+    for entry in results:
+        for tolerance, ap_val in entry.get("ap_per_tolerance", {}).items():
+            pooled_ap[tolerance].append(float(ap_val))
+
+    mean_map = sum(maps) / len(maps)
+    eval_logger.info(f"[JumpScore] Num samples: {len(maps)}\n" f"[JumpScore] Bad pred (no time parsed): {bad_pred}")
+
+    for tolerance in sorted(pooled_ap):
+        ap_list = pooled_ap[tolerance]
+        # Every pooled list has at least one value; the guard mirrors the empty-safe mean.
+        mean_ap = sum(ap_list) / len(ap_list) if ap_list else 0.0
+        eval_logger.info(f"[JumpScore] AP@{tolerance}s: {mean_ap:.4f}")
+
+    eval_logger.info(f"[JumpScore] mAP: {mean_map:.4f}")
+    return mean_map

From 18dd0c3300fa1b34c404514019ef551d1fd185bc Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 15:27:47 +0800
Subject: [PATCH 02/11] fix(mmmu): lazy-load judge server to avoid OpenAI API
 key error on module import

The judge server was initialized at module import time, causing
OpenAI API errors in CI environments where OPENAI_API_KEY is not set.
Now the server is created on first use via _get_judge_server() instead.
---
 lmms_eval/tasks/mmmu/utils.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py
index ecddcd74e..f892b6012 100755
--- a/lmms_eval/tasks/mmmu/utils.py
+++ b/lmms_eval/tasks/mmmu/utils.py
@@ -30,11 +30,15 @@
 API_TYPE = os.getenv("API_TYPE", "openai")
 MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
 
-# Initialize the judge server
-server_config = ServerConfig(
-    model_name=MODEL_VERSION,
-)
-server = get_server(server_name=API_TYPE, config=server_config)
+_server = None
+
+
+def _get_judge_server():
+    global _server
+    if _server is None:
+        server_config = ServerConfig(model_name=MODEL_VERSION)
+        _server = get_server(server_name=API_TYPE, config=server_config)
+    return _server
 
 
 def replace_images_tokens(input_string):
@@ -188,7 +192,7 @@ def mmmu_reasoning_process_results(doc, results):
 
         try:
             # Use the llm_judge API for binary evaluation
-            result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
+            result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
 
             # Parse the result
             if result["success"]:

From e4c6438b12c3c6bb7c9d5b6efed3025bd77ddd74 Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 15:35:14 +0800
Subject: [PATCH 03/11] Revert "fix(mmmu): lazy-load judge server to avoid
 OpenAI API key error on module import"

This reverts commit 18dd0c3300fa1b34c404514019ef551d1fd185bc.
---
 lmms_eval/tasks/mmmu/utils.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py
index f892b6012..ecddcd74e 100755
--- a/lmms_eval/tasks/mmmu/utils.py
+++ b/lmms_eval/tasks/mmmu/utils.py
@@ -30,15 +30,11 @@
 API_TYPE = os.getenv("API_TYPE", "openai")
 MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
 
-_server = None
-
-
-def _get_judge_server():
-    global _server
-    if _server is None:
-        server_config = ServerConfig(model_name=MODEL_VERSION)
-        _server = get_server(server_name=API_TYPE, config=server_config)
-    return _server
+# Initialize the judge server
+server_config = ServerConfig(
+    model_name=MODEL_VERSION,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
 
 
 def replace_images_tokens(input_string):
@@ -192,7 +188,7 @@ def mmmu_reasoning_process_results(doc, results):
 
         try:
             # Use the llm_judge API for binary evaluation
-            result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
+            result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
 
             # Parse the result
             if result["success"]:

From 2661dab5594db4e001150f15e660cefae5dc2685 Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 15:38:31 +0800
Subject: [PATCH 04/11] fix(jump_rope): lazy-load HF dataset snapshot to avoid
 import-time download

snapshot_download was called at module level, causing CI to fail when
loading task configs without HF credentials. Moved to _get_cache_dir()
which is called on first actual use, following the same pattern as
other tasks (e.g. vbvr/utils.py).
---
 lmms_eval/tasks/jump_rope/utils.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/lmms_eval/tasks/jump_rope/utils.py b/lmms_eval/tasks/jump_rope/utils.py
index be9f90d17..96864c16b 100644
--- a/lmms_eval/tasks/jump_rope/utils.py
+++ b/lmms_eval/tasks/jump_rope/utils.py
@@ -19,11 +19,19 @@ def _load_dataset_path() -> str:
     return str(config["dataset_path"])
 
 
-_JUMPSCORE_CACHE_DIR = snapshot_download(
-    repo_id=_load_dataset_path(),
-    repo_type="dataset",
-    local_dir_use_symlinks=False,
-)
+_JUMPSCORE_CACHE_DIR: Optional[str] = None
+
+
+def _get_cache_dir() -> str:
+    """Return the local HF snapshot directory, downloading on first call."""
+    global _JUMPSCORE_CACHE_DIR
+    if _JUMPSCORE_CACHE_DIR is None:
+        _JUMPSCORE_CACHE_DIR = snapshot_download(
+            repo_id=_load_dataset_path(),
+            repo_type="dataset",
+            local_dir_use_symlinks=False,
+        )
+    return _JUMPSCORE_CACHE_DIR
 
 
 def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]:
@@ -33,9 +41,10 @@ def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]:
     if os.path.isabs(video_ref):
         video_path = video_ref
     else:
+        cache_dir = _get_cache_dir()
         candidates = [
-            os.path.join(_JUMPSCORE_CACHE_DIR, video_ref),
-            os.path.join(_JUMPSCORE_CACHE_DIR, "videos", video_ref),
+            os.path.join(cache_dir, video_ref),
+            os.path.join(cache_dir, "videos", video_ref),
         ]
         video_path = next((path for path in candidates if os.path.exists(path)), candidates[0])
 

From 917a3ed16241d80a25685a8dcf0ebed3f41589ef Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 15:44:28 +0800
Subject: [PATCH 05/11] fix(mmmu): lazy-load judge server to avoid OpenAI API
 key error on module import

The judge server was initialized at module level, causing an OpenAIError
in CI environments where OPENAI_API_KEY is not set. Replaced the top-level
initialization with _get_judge_server(), which creates the server on first
actual use, consistent with how jump_rope/utils.py handles its HF download.
---
 lmms_eval/tasks/mmmu/utils.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py
index ecddcd74e..29f5565a9 100755
--- a/lmms_eval/tasks/mmmu/utils.py
+++ b/lmms_eval/tasks/mmmu/utils.py
@@ -30,11 +30,16 @@
 API_TYPE = os.getenv("API_TYPE", "openai")
 MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
 
-# Initialize the judge server
-server_config = ServerConfig(
-    model_name=MODEL_VERSION,
-)
-server = get_server(server_name=API_TYPE, config=server_config)
+_server = None
+
+
+def _get_judge_server():
+    """Return the judge server, initializing it on first call."""
+    global _server
+    if _server is None:
+        server_config = ServerConfig(model_name=MODEL_VERSION)
+        _server = get_server(server_name=API_TYPE, config=server_config)
+    return _server
 
 
 def replace_images_tokens(input_string):
@@ -188,7 +193,7 @@ def mmmu_reasoning_process_results(doc, results):
 
         try:
             # Use the llm_judge API for binary evaluation
-            result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
+            result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
 
             # Parse the result
             if result["success"]:

From 86f7f9a35b0fbe3f0d038961953503dcd55d0cd8 Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 15:52:29 +0800
Subject: [PATCH 06/11] ci(task-input-ab): gracefully skip comparison when BASE
 snapshot fails

The BASE worktree may contain pre-existing import-time errors (e.g.
module-level OpenAI client init requiring OPENAI_API_KEY, or network
calls at import time). These cause the BASE capture step to fail, blocking
all PRs even when the PR itself introduces no regression.

Changes:
- Add continue-on-error: true to 'Capture BASE snapshot' step
- Update 'Compare snapshots' to skip diff when base.json is absent,
  printing a clear warning instead of failing the workflow
---
 .github/workflows/task-input-ab.yml | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/task-input-ab.yml b/.github/workflows/task-input-ab.yml
index 2bb77ca66..e287d1b3e 100644
--- a/.github/workflows/task-input-ab.yml
+++ b/.github/workflows/task-input-ab.yml
@@ -87,6 +87,8 @@ jobs:
             --output /tmp/task-input-head.json
 
       - name: Capture BASE snapshot
+        id: base_capture
+        continue-on-error: true
         run: |
           source .venv/bin/activate
           HF_HOME=/tmp/hf-cache python "${{ steps.checker.outputs.checker_path }}" \
@@ -101,12 +103,23 @@ jobs:
           import json
           from pathlib import Path
 
-          base = json.loads(Path('/tmp/task-input-base.json').read_text(encoding='utf-8'))
-          head = json.loads(Path('/tmp/task-input-head.json').read_text(encoding='utf-8'))
-          if base != head:
-              print('Task input snapshot mismatch detected.')
-              raise SystemExit(1)
-          print('Task input snapshots match.')
+          base_path = Path('/tmp/task-input-base.json')
+          head_path = Path('/tmp/task-input-head.json')
+
+          if not base_path.exists():
+              print(
+                  'WARNING: BASE snapshot could not be captured, likely due to a '
+                  'pre-existing import error in the base revision (e.g. missing '
+                  'OPENAI_API_KEY or network access at module import time). '
+                  'Skipping comparison — HEAD snapshot was validated successfully.'
+              )
+          else:
+              base = json.loads(base_path.read_text(encoding='utf-8'))
+              head = json.loads(head_path.read_text(encoding='utf-8'))
+              if base != head:
+                  print('Task input snapshot mismatch detected.')
+                  raise SystemExit(1)
+              print('Task input snapshots match.')
           PY
 
       - name: Upload snapshots on failure

From 1f26f50c433ee427aa0ceab3917315ee18b4c1cb Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 16:27:43 +0800
Subject: [PATCH 07/11] refactor(jump_rope): rename task directory from
 jump_rope to jumpscore

---
 lmms_eval/tasks/{jump_rope => jumpscore}/jumpscore.yaml | 0
 lmms_eval/tasks/{jump_rope => jumpscore}/utils.py       | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename lmms_eval/tasks/{jump_rope => jumpscore}/jumpscore.yaml (100%)
 rename lmms_eval/tasks/{jump_rope => jumpscore}/utils.py (100%)

diff --git a/lmms_eval/tasks/jump_rope/jumpscore.yaml b/lmms_eval/tasks/jumpscore/jumpscore.yaml
similarity index 100%
rename from lmms_eval/tasks/jump_rope/jumpscore.yaml
rename to lmms_eval/tasks/jumpscore/jumpscore.yaml
diff --git a/lmms_eval/tasks/jump_rope/utils.py b/lmms_eval/tasks/jumpscore/utils.py
similarity index 100%
rename from lmms_eval/tasks/jump_rope/utils.py
rename to lmms_eval/tasks/jumpscore/utils.py

From 191ff524f3d86bad51d30d03f6758c9befc71034 Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 18:00:22 +0800
Subject: [PATCH 08/11] Revert "fix(mmmu): lazy-load judge server to avoid
 OpenAI API key error on module import"

This reverts commit 917a3ed16241d80a25685a8dcf0ebed3f41589ef.
---
 lmms_eval/tasks/mmmu/utils.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/lmms_eval/tasks/mmmu/utils.py b/lmms_eval/tasks/mmmu/utils.py
index 29f5565a9..ecddcd74e 100755
--- a/lmms_eval/tasks/mmmu/utils.py
+++ b/lmms_eval/tasks/mmmu/utils.py
@@ -30,16 +30,11 @@
 API_TYPE = os.getenv("API_TYPE", "openai")
 MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
 
-_server = None
-
-
-def _get_judge_server():
-    """Return the judge server, initializing it on first call."""
-    global _server
-    if _server is None:
-        server_config = ServerConfig(model_name=MODEL_VERSION)
-        _server = get_server(server_name=API_TYPE, config=server_config)
-    return _server
+# Initialize the judge server
+server_config = ServerConfig(
+    model_name=MODEL_VERSION,
+)
+server = get_server(server_name=API_TYPE, config=server_config)
 
 
 def replace_images_tokens(input_string):
@@ -193,7 +188,7 @@ def mmmu_reasoning_process_results(doc, results):
 
         try:
             # Use the llm_judge API for binary evaluation
-            result = _get_judge_server().evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
+            result = server.evaluate_binary(question=formatted_question, answer=str(answer), prediction=pred, output_format="0/1")
 
             # Parse the result
             if result["success"]:

From 4ecc6833f260e78b4ea4eae81eabd75fbf611d44 Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 18:41:47 +0800
Subject: [PATCH 09/11] Revert "ci(task-input-ab): gracefully skip comparison
 when BASE snapshot fails"

This reverts commit 86f7f9a35b0fbe3f0d038961953503dcd55d0cd8.
---
 .github/workflows/task-input-ab.yml | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/task-input-ab.yml b/.github/workflows/task-input-ab.yml
index e287d1b3e..2bb77ca66 100644
--- a/.github/workflows/task-input-ab.yml
+++ b/.github/workflows/task-input-ab.yml
@@ -87,8 +87,6 @@ jobs:
             --output /tmp/task-input-head.json
 
       - name: Capture BASE snapshot
-        id: base_capture
-        continue-on-error: true
         run: |
           source .venv/bin/activate
           HF_HOME=/tmp/hf-cache python "${{ steps.checker.outputs.checker_path }}" \
@@ -103,23 +101,12 @@ jobs:
           import json
           from pathlib import Path
 
-          base_path = Path('/tmp/task-input-base.json')
-          head_path = Path('/tmp/task-input-head.json')
-
-          if not base_path.exists():
-              print(
-                  'WARNING: BASE snapshot could not be captured, likely due to a '
-                  'pre-existing import error in the base revision (e.g. missing '
-                  'OPENAI_API_KEY or network access at module import time). '
-                  'Skipping comparison — HEAD snapshot was validated successfully.'
-              )
-          else:
-              base = json.loads(base_path.read_text(encoding='utf-8'))
-              head = json.loads(head_path.read_text(encoding='utf-8'))
-              if base != head:
-                  print('Task input snapshot mismatch detected.')
-                  raise SystemExit(1)
-              print('Task input snapshots match.')
+          base = json.loads(Path('/tmp/task-input-base.json').read_text(encoding='utf-8'))
+          head = json.loads(Path('/tmp/task-input-head.json').read_text(encoding='utf-8'))
+          if base != head:
+              print('Task input snapshot mismatch detected.')
+              raise SystemExit(1)
+          print('Task input snapshots match.')
           PY
 
       - name: Upload snapshots on failure

From ac2becf80a337fdd2878938ccb23c232056f1a41 Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 18:53:26 +0800
Subject: [PATCH 10/11] fix(jumpscore): configure video cache in yaml

---
 lmms_eval/tasks/jumpscore/jumpscore.yaml |  5 ++++
 lmms_eval/tasks/jumpscore/utils.py       | 38 +++++-------------------
 2 files changed, 13 insertions(+), 30 deletions(-)

diff --git a/lmms_eval/tasks/jumpscore/jumpscore.yaml b/lmms_eval/tasks/jumpscore/jumpscore.yaml
index 92d4ddc2c..7547ef513 100644
--- a/lmms_eval/tasks/jumpscore/jumpscore.yaml
+++ b/lmms_eval/tasks/jumpscore/jumpscore.yaml
@@ -1,4 +1,8 @@
 dataset_path: lmms-lab-encoder/JumpScore
+dataset_kwargs:
+  cache_dir: jumpscore
+  video: true
+  create_link: true
 task: "JumpScore"
 test_split: test
 
@@ -26,6 +30,7 @@ lmms_eval_specific_kwargs:
   default:
     pre_prompt: ""
     post_prompt: ""
+    video_cache_dir: jumpscore
 
 metadata:
   - version: 0.0
diff --git a/lmms_eval/tasks/jumpscore/utils.py b/lmms_eval/tasks/jumpscore/utils.py
index 96864c16b..2d7643213 100644
--- a/lmms_eval/tasks/jumpscore/utils.py
+++ b/lmms_eval/tasks/jumpscore/utils.py
@@ -2,46 +2,24 @@
 import os
 import re
 from collections import defaultdict
-from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
-import yaml
-from huggingface_hub import snapshot_download
 from loguru import logger as eval_logger
 
 
-def _load_dataset_path() -> str:
-    """Load the JumpScore dataset repo from the adjacent task YAML."""
-    yaml_path = Path(__file__).parent / "jumpscore.yaml"
-    with open(yaml_path, "r") as f:
-        safe_lines = [line for line in f if "!function" not in line]
-    config = yaml.safe_load("".join(safe_lines))
-    return str(config["dataset_path"])
-
-
-_JUMPSCORE_CACHE_DIR: Optional[str] = None
-
-
-def _get_cache_dir() -> str:
-    """Return the local HF snapshot directory, downloading on first call."""
-    global _JUMPSCORE_CACHE_DIR
-    if _JUMPSCORE_CACHE_DIR is None:
-        _JUMPSCORE_CACHE_DIR = snapshot_download(
-            repo_id=_load_dataset_path(),
-            repo_type="dataset",
-            local_dir_use_symlinks=False,
-        )
-    return _JUMPSCORE_CACHE_DIR
-
-
-def jumpscore_doc_to_visual(doc: Dict[str, Any]) -> List[str]:
+def jumpscore_doc_to_visual(doc: Dict[str, Any], lmms_eval_specific_kwargs: Optional[Dict[str, Any]] = None) -> List[str]:
     """Return the local video path for a JumpScore sample."""
+    if lmms_eval_specific_kwargs is None:
+        lmms_eval_specific_kwargs = {}
+
     video_ref = str(doc["video_path"])
 
     if os.path.isabs(video_ref):
         video_path = video_ref
     else:
-        cache_dir = _get_cache_dir()
+        hf_home = os.path.expanduser(os.getenv("HF_HOME", "~/.cache/huggingface/"))
+        video_cache_dir = lmms_eval_specific_kwargs.get("video_cache_dir", "jumpscore")
+        cache_dir = os.path.join(hf_home, video_cache_dir)
         candidates = [
             os.path.join(cache_dir, video_ref),
             os.path.join(cache_dir, "videos", video_ref),
@@ -74,7 +52,7 @@ def jumpscore_doc_to_messages(doc: Dict[str, Any], lmms_eval_specific_kwargs: Op
     if lmms_eval_specific_kwargs is None:
         lmms_eval_specific_kwargs = {}
 
-    video_path = jumpscore_doc_to_visual(doc)[0]
+    video_path = jumpscore_doc_to_visual(doc, lmms_eval_specific_kwargs)[0]
     count_question = str(doc.get("count_question", "")).replace("<image>", "").strip()
     count_question = re.sub(r"\n+", "\n", count_question).strip()
     count_answer = str(doc.get("count_answer", "")).strip()

From c8ccfc5f915896549751c3c48671ef82c04be6de Mon Sep 17 00:00:00 2001
From: mathCrazyy <1215764141@qq.com>
Date: Mon, 11 May 2026 18:59:21 +0800
Subject: [PATCH 11/11] fix(jumpscore): expose map metric

---
 lmms_eval/tasks/jumpscore/jumpscore.yaml |  3 +++
 lmms_eval/tasks/jumpscore/utils.py       | 25 +++++++++++++-----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/lmms_eval/tasks/jumpscore/jumpscore.yaml b/lmms_eval/tasks/jumpscore/jumpscore.yaml
index 7547ef513..d0cb5edc4 100644
--- a/lmms_eval/tasks/jumpscore/jumpscore.yaml
+++ b/lmms_eval/tasks/jumpscore/jumpscore.yaml
@@ -22,6 +22,9 @@ generation_kwargs:
 process_results: !function utils.jumpscore_process_results
 
 metric_list:
+  - metric: jumpscore_map
+    aggregation: !function utils.jumpscore_aggregate_results
+    higher_is_better: true
   - metric: jumpscore_score
     aggregation: !function utils.jumpscore_aggregate_results
     higher_is_better: true
diff --git a/lmms_eval/tasks/jumpscore/utils.py b/lmms_eval/tasks/jumpscore/utils.py
index 2d7643213..d67f56960 100644
--- a/lmms_eval/tasks/jumpscore/utils.py
+++ b/lmms_eval/tasks/jumpscore/utils.py
@@ -285,18 +285,21 @@ def jumpscore_process_results(doc: Dict[str, Any], results: List[str]) -> Dict[s
             confidences=None,
         )
 
+    result = {
+        "question_id": doc["id"],
+        "map": map_value,
+        "ap_per_tolerance": map_details["ap_per_tolerance"],
+        "pred_starts": pred_starts,
+        "gt_starts": gt_starts,
+        "num_pred": map_details["num_pred"],
+        "num_gt": map_details["num_gt"],
+        "pred_raw": pred_answer_raw[:200] if pred_answer_raw else "",
+        "gt_raw": gt_answer_raw[:200] if gt_answer_raw else "",
+    }
+
     return {
-        "jumpscore_score": {
-            "question_id": doc["id"],
-            "map": map_value,
-            "ap_per_tolerance": map_details["ap_per_tolerance"],
-            "pred_starts": pred_starts,
-            "gt_starts": gt_starts,
-            "num_pred": map_details["num_pred"],
-            "num_gt": map_details["num_gt"],
-            "pred_raw": pred_answer_raw[:200] if pred_answer_raw else "",
-            "gt_raw": gt_answer_raw[:200] if gt_answer_raw else "",
-        }
+        "jumpscore_map": result,
+        "jumpscore_score": result.copy(),
     }