EvolvingLMMs-Lab · kcz358 · May 6, 2026 · May 6, 2026
diff --git a/lmms_eval/tasks/charades_sta/charades.yaml b/lmms_eval/tasks/charades_sta/charades.yaml
@@ -21,10 +21,19 @@ process_results: !function utils.temporal_grounding_process_results_generation
 
 
 metric_list:
-  - metric: submission
-    aggregation: !function utils.temporal_grounding_aggregate_charades
+  - metric: charades_sta_IOU@3
+    aggregation: !function utils.temporal_grounding_aggregate_iou3
+    higher_is_better: true
+  - metric: charades_sta_IOU@5
+    aggregation: !function utils.temporal_grounding_aggregate_iou5
+    higher_is_better: true
+  - metric: charades_sta_IOU@7
+    aggregation: !function utils.temporal_grounding_aggregate_iou7
+    higher_is_better: true
+  - metric: charades_sta_mIOU
+    aggregation: !function utils.temporal_grounding_aggregate_miou
     higher_is_better: true
 lmms_eval_specific_kwargs:
   default:
     pre_prompt: "Please find the visual event described by a sentence in the video, determining its starting and ending times. The format should be: 'The event happens in the start time - end time'. For example, The event 'person turn a light on' happens in the 24.3 - 30.4 seonds. Now I will give you the textual sentence: "
-    post_prompt: "Please return its start time and end time."
+    post_prompt: "Please return its start time and end time."
diff --git a/lmms_eval/tasks/charades_sta/utils.py b/lmms_eval/tasks/charades_sta/utils.py
@@ -1,6 +1,8 @@
+import ast
 import datetime
 import json
 import os
+import re
 import sys
 from pathlib import Path
 
@@ -57,10 +59,8 @@ def temporal_grounding_doc_to_text(doc, lmms_eval_specific_kwargs=None):
     if lmms_eval_specific_kwargs is None:
         lmms_eval_specific_kwargs = {}
 
-    if "pre_prompt" in lmms_eval_specific_kwargs:
-        pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
-    if "post_prompt" in lmms_eval_specific_kwargs:
-        post_prompt = lmms_eval_specific_kwargs["post_prompt"]
+    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
+    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
 
     question = doc["caption"]
 
@@ -74,7 +74,145 @@ def temporal_grounding_doc_to_answer(doc):
 # Process result for mcq answer generation
 def temporal_grounding_process_results_generation(doc, result):
     pred = result[0]
-    return {"submission": {f'{doc["video"]}>>>{doc["caption"]}>>>{doc["timestamp"]}': pred}}
+    data_dict = {f'{doc["video"]}>>>{doc["caption"]}>>>{doc["timestamp"]}': pred}  # key is "video>>>caption>>>timestamp"; the metric computation reads the ground truth back from the text after the last ">>>"
+    return {f"charades_sta_{metric}": data_dict for metric in CHARADES_STA_METRICS}  # the same one-entry dict is returned under every charades_sta_* metric name
+
+
+CHARADES_STA_METRICS = ["IOU@3", "IOU@5", "IOU@7", "mIOU"]  # metric suffixes; prefixed with "charades_sta_" when per-sample results are emitted
+
+
+def extract_time(paragraph):  # Parse at most one [start, end] pair (smaller value first) from free-form model output; returns [] when nothing parses
+    prompt = "A specific example is : 20.8 - 30.0 seconds".lower()
+    paragraph = paragraph.lower().replace(prompt, "").replace("to", "-")  # removes the example sentence above, then turns every "to" substring (even inside words) into "-"
+    sentences = re.split(r"[!?\n]", paragraph)  # splits on "!", "?" and newlines only; "." is not a separator (presumably to keep decimals intact)
+
+    keywords = ["starts", "ends", "happens in", "start time", "end time", "start", "end", "happen"]
+    candidates = [sentence for sentence in sentences if any(keyword in sentence for keyword in keywords)]  # prefer sentences that mention a keyword
+    if not candidates:
+        candidates = sentences  # no keyword anywhere: consider every sentence
+
+    timestamps = []
+    time_format_range_pattern = re.compile(r"\b(\d{1,2}:\d{2}(?::\d{2})?)\s*[–-]\s*(\d{1,2}:\d{2}(?::\d{2})?)\b")  # clock-format range, e.g. "0:24 - 0:30" or "0:00:24 - 0:00:30"
+    main_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)")  # numeric range, e.g. "24.3 - 30.4"
+    time_number_pattern = re.compile(r"\b(\d+(?:\.\d+)?)\b")  # a standalone number, e.g. "24.3"
+    time_format_pattern = re.compile(r"\b(\d{1,2}:\d{2}(?::\d{2})?)\b")  # a standalone clock time, e.g. "0:24"
+    fallback_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*s?\s*[–-]\s*(\d+(?:\.\d+)?)\s*s?")  # numeric range allowing an "s" after each number, e.g. "24s - 30s"
+
+    for sentence in candidates:  # pass 1: first candidate sentence containing a clock-format range
+        time_matches = time_format_range_pattern.findall(sentence)
+        if time_matches:
+            timestamps = [[_time_to_seconds(start), _time_to_seconds(end)] for start, end in time_matches]
+            break
+
+    if not timestamps:  # pass 2: first candidate sentence containing a plain numeric range
+        for sentence in candidates:
+            time_matches = main_pattern.findall(sentence)
+            if time_matches:
+                timestamps = [[float(start), float(end)] for start, end in time_matches]
+                break
+
+    if not timestamps:  # pass 3: collect individual clock times from all candidate sentences and pair them up in order
+        times = []
+        for sentence in candidates:
+            time = time_format_pattern.findall(sentence)
+            if not time:
+                continue
+            times.extend(_time_to_seconds(timestamp) for timestamp in time)
+        times = times[: len(times) // 2 * 2]  # drop a trailing unpaired value
+        timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
+
+    if not timestamps:  # pass 4: pair up standalone numbers gathered across candidate sentences
+        times = []
+        for sentence in candidates:
+            time = time_number_pattern.findall(sentence)
+            if time:
+                times.append(float(time[0]))  # NOTE(review): only the first number of each sentence is kept, so one sentence like "starts at 10 and ends at 20" contributes a single value and cannot form a pair by itself - confirm this is intended
+        times = times[: len(times) // 2 * 2]  # drop a trailing unpaired value
+        timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]
+
+    if not timestamps:  # pass 5: numeric range with optional "s" suffixes
+        for sentence in candidates:
+            fallback_matches = fallback_pattern.findall(sentence)
+            if fallback_matches:
+                timestamps = [[float(start), float(end)] for start, end in fallback_matches]
+                break
+
+    results = []
+    for start, end in timestamps[:1]:  # only the first parsed pair is used
+        results.append([start, end] if end > start else [end, start])  # swap when needed so the smaller value comes first
+    return results
+
+
+def _time_to_seconds(timestamp):  # Convert an "hh:mm:ss" or "mm:ss" string to seconds as a float
+    parts = timestamp.split(":")
+    if len(parts) == 3:
+        hours, minutes, seconds = map(float, parts)
+        return hours * 3600 + minutes * 60 + seconds
+    minutes, seconds = map(float, parts)  # any field count other than two fails this unpacking with ValueError
+    return minutes * 60 + seconds
+
+
+def iou(a, b):  # Temporal IoU of two (start, end) intervals: overlap length divided by the span from the earliest start to the latest end
+    max0 = max(a[0], b[0])
+    min0 = min(a[0], b[0])
+    max1 = max(a[1], b[1])
+    min1 = min(a[1], b[1])
+    denom = max1 - min0
+    return 0.0 if denom <= 0 else max(min1 - max0, 0) / denom  # a non-positive span yields 0.0; a negative overlap (disjoint intervals) is clamped to 0
+
+
+def _parse_ground_truth(raw_gt):  # Return (start, end) as floats from a two-element sequence or its string form
+    if isinstance(raw_gt, str):
+        raw_gt = ast.literal_eval(raw_gt)  # literal_eval parses Python literals only; it does not execute code
+    return float(raw_gt[0]), float(raw_gt[1])
+
+
+def _temporal_grounding_compute_metrics(results):  # Merge per-sample dicts, score each prediction by IoU, and return {"IOU@3", "IOU@5", "IOU@7", "mIOU"} as percentages
+    combined_submission = {}
+    for submission_dict in results:
+        combined_submission.update(submission_dict)  # identical keys overwrite each other, so duplicated samples are counted once
+
+    ious = []
+    bad_pred = 0
+    for key, pred_text in combined_submission.items():
+        try:
+            gt = _parse_ground_truth(key.rsplit(">>>", 1)[-1])  # ground truth is the text after the last ">>>" in the key
+            pred_times = extract_time(pred_text)
+            if len(pred_times) != 1:  # anything other than exactly one parsed pair scores 0 and counts as a bad prediction
+                cur_iou = 0.0
+                bad_pred += 1
+            else:
+                cur_iou = iou(gt, pred_times[0])
+            ious.append(cur_iou)
+        except Exception as e:  # a failure on one sample is logged and scored as 0 instead of aborting the aggregation
+            eval_logger.warning(f"Failed to process Charades-STA result: {e}")
+            ious.append(0.0)
+            bad_pred += 1
+
+    total = len(ious)
+    eval_logger.info(f"Charades-STA bad predictions: {bad_pred}/{total}")
+    metrics = {}
+    for thr in [0.3, 0.5, 0.7]:
+        count = sum(1 for value in ious if value >= thr)
+        metrics[f"IOU@{int(thr * 10)}"] = count * 100 / total if total else 0  # e.g. thr 0.3 -> "IOU@3"; percentage of samples with IoU at or above thr, 0 when there are no samples
+    metrics["mIOU"] = sum(ious) * 100 / total if total else 0  # mean IoU scaled by 100
+    return metrics
+
+
+def temporal_grounding_aggregate_iou3(results, args):  # args is unused here
+    return _temporal_grounding_compute_metrics(results)["IOU@3"]  # computes all four metrics and returns only IOU@3
+
+
+def temporal_grounding_aggregate_iou5(results, args):  # args is unused here
+    return _temporal_grounding_compute_metrics(results)["IOU@5"]  # computes all four metrics and returns only IOU@5
+
+
+def temporal_grounding_aggregate_iou7(results, args):  # args is unused here
+    return _temporal_grounding_compute_metrics(results)["IOU@7"]  # computes all four metrics and returns only IOU@7
+
+
+def temporal_grounding_aggregate_miou(results, args):  # args is unused here
+    return _temporal_grounding_compute_metrics(results)["mIOU"]  # computes all four metrics and returns only mIOU
 
 
 def temporal_grounding_aggregate_charades(results, args):

diff --git a/lmms_eval/tasks/timelens/_default_yaml_template b/lmms_eval/tasks/timelens/_default_yaml_template
@@ -0,0 +1,36 @@
+dataset_path: kcz358/timelens
+dataset_kwargs:
+  cache_dir: timelens
+  video: True
+
+generation_kwargs:
+  max_new_tokens: 50
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+
+output_type: generate_until
+doc_to_visual: !function utils.timelens_doc_to_visual
+doc_to_text: !function utils.timelens_doc_to_text
+doc_to_target: !function utils.timelens_doc_to_target
+process_results: !function utils.timelens_process_results
+
+metric_list:
+  - metric: timelens_IOU@3
+    aggregation: !function utils.timelens_aggregate_iou3
+    higher_is_better: true
+  - metric: timelens_IOU@5
+    aggregation: !function utils.timelens_aggregate_iou5
+    higher_is_better: true
+  - metric: timelens_IOU@7
+    aggregation: !function utils.timelens_aggregate_iou7
+    higher_is_better: true
+  - metric: timelens_mIOU
+    aggregation: !function utils.timelens_aggregate_miou
+    higher_is_better: true
+
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: "Please find the visual event described by the sentence '"
+    post_prompt: "', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
diff --git a/lmms_eval/tasks/timelens/timelens.yaml b/lmms_eval/tasks/timelens/timelens.yaml
@@ -0,0 +1,5 @@
+group: timelens
+task:
+- timelens_activitynet
+- timelens_charades
+- timelens_qvhighlights
diff --git a/lmms_eval/tasks/timelens/timelens_activitynet.yaml b/lmms_eval/tasks/timelens/timelens_activitynet.yaml
@@ -0,0 +1,4 @@
+include: _default_yaml_template
+
+task: timelens_activitynet
+test_split: activitynet
diff --git a/lmms_eval/tasks/timelens/timelens_charades.yaml b/lmms_eval/tasks/timelens/timelens_charades.yaml
@@ -0,0 +1,4 @@
+include: _default_yaml_template
+
+task: timelens_charades
+test_split: charades
diff --git a/lmms_eval/tasks/timelens/timelens_qvhighlights.yaml b/lmms_eval/tasks/timelens/timelens_qvhighlights.yaml
@@ -0,0 +1,4 @@
+include: _default_yaml_template
+
+task: timelens_qvhighlights
+test_split: qvhighlights