Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions lmms_eval/tasks/charades_sta/charades.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,19 @@ process_results: !function utils.temporal_grounding_process_results_generation


metric_list:
- metric: submission
aggregation: !function utils.temporal_grounding_aggregate_charades
- metric: charades_sta_IOU@3
aggregation: !function utils.temporal_grounding_aggregate_iou3
higher_is_better: true
- metric: charades_sta_IOU@5
aggregation: !function utils.temporal_grounding_aggregate_iou5
higher_is_better: true
- metric: charades_sta_IOU@7
aggregation: !function utils.temporal_grounding_aggregate_iou7
higher_is_better: true
- metric: charades_sta_mIOU
aggregation: !function utils.temporal_grounding_aggregate_miou
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: "Please find the visual event described by a sentence in the video, determining its starting and ending times. The format should be: 'The event happens in the start time - end time'. For example, The event 'person turn a light on' happens in the 24.3 - 30.4 seonds. Now I will give you the textual sentence: "
post_prompt: "Please return its start time and end time."
post_prompt: "Please return its start time and end time."
148 changes: 143 additions & 5 deletions lmms_eval/tasks/charades_sta/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import ast
import datetime
import json
import os
import re
import sys
from pathlib import Path

Expand Down Expand Up @@ -57,10 +59,8 @@ def temporal_grounding_doc_to_text(doc, lmms_eval_specific_kwargs=None):
if lmms_eval_specific_kwargs is None:
lmms_eval_specific_kwargs = {}

if "pre_prompt" in lmms_eval_specific_kwargs:
pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
if "post_prompt" in lmms_eval_specific_kwargs:
post_prompt = lmms_eval_specific_kwargs["post_prompt"]
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")

question = doc["caption"]

Expand All @@ -74,7 +74,145 @@ def temporal_grounding_doc_to_answer(doc):
# Process result for mcq answer generation
def temporal_grounding_process_results_generation(doc, result):
pred = result[0]
return {"submission": {f'{doc["video"]}>>>{doc["caption"]}>>>{doc["timestamp"]}': pred}}
data_dict = {f'{doc["video"]}>>>{doc["caption"]}>>>{doc["timestamp"]}': pred}
return {f"charades_sta_{metric}": data_dict for metric in CHARADES_STA_METRICS}


CHARADES_STA_METRICS = ["IOU@3", "IOU@5", "IOU@7", "mIOU"]


def extract_time(paragraph):
prompt = "A specific example is : 20.8 - 30.0 seconds".lower()
paragraph = paragraph.lower().replace(prompt, "").replace("to", "-")
sentences = re.split(r"[!?\n]", paragraph)

keywords = ["starts", "ends", "happens in", "start time", "end time", "start", "end", "happen"]
candidates = [sentence for sentence in sentences if any(keyword in sentence for keyword in keywords)]
if not candidates:
candidates = sentences

timestamps = []
time_format_range_pattern = re.compile(r"\b(\d{1,2}:\d{2}(?::\d{2})?)\s*[–-]\s*(\d{1,2}:\d{2}(?::\d{2})?)\b")
main_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)")
time_number_pattern = re.compile(r"\b(\d+(?:\.\d+)?)\b")
time_format_pattern = re.compile(r"\b(\d{1,2}:\d{2}(?::\d{2})?)\b")
fallback_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*s?\s*[–-]\s*(\d+(?:\.\d+)?)\s*s?")

for sentence in candidates:
time_matches = time_format_range_pattern.findall(sentence)
if time_matches:
timestamps = [[_time_to_seconds(start), _time_to_seconds(end)] for start, end in time_matches]
break

if not timestamps:
for sentence in candidates:
time_matches = main_pattern.findall(sentence)
if time_matches:
timestamps = [[float(start), float(end)] for start, end in time_matches]
break

if not timestamps:
times = []
for sentence in candidates:
time = time_format_pattern.findall(sentence)
if not time:
continue
times.extend(_time_to_seconds(timestamp) for timestamp in time)
times = times[: len(times) // 2 * 2]
timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]

if not timestamps:
times = []
for sentence in candidates:
time = time_number_pattern.findall(sentence)
if time:
times.append(float(time[0]))
times = times[: len(times) // 2 * 2]
timestamps = [(times[i], times[i + 1]) for i in range(0, len(times), 2)]

if not timestamps:
for sentence in candidates:
fallback_matches = fallback_pattern.findall(sentence)
if fallback_matches:
timestamps = [[float(start), float(end)] for start, end in fallback_matches]
break

results = []
for start, end in timestamps[:1]:
results.append([start, end] if end > start else [end, start])
return results


def _time_to_seconds(timestamp):
parts = timestamp.split(":")
if len(parts) == 3:
hours, minutes, seconds = map(float, parts)
return hours * 3600 + minutes * 60 + seconds
minutes, seconds = map(float, parts)
return minutes * 60 + seconds


def iou(a, b):
max0 = max(a[0], b[0])
min0 = min(a[0], b[0])
max1 = max(a[1], b[1])
min1 = min(a[1], b[1])
denom = max1 - min0
return 0.0 if denom <= 0 else max(min1 - max0, 0) / denom


def _parse_ground_truth(raw_gt):
if isinstance(raw_gt, str):
raw_gt = ast.literal_eval(raw_gt)
return float(raw_gt[0]), float(raw_gt[1])


def _temporal_grounding_compute_metrics(results):
combined_submission = {}
for submission_dict in results:
combined_submission.update(submission_dict)

ious = []
bad_pred = 0
for key, pred_text in combined_submission.items():
try:
gt = _parse_ground_truth(key.rsplit(">>>", 1)[-1])
pred_times = extract_time(pred_text)
if len(pred_times) != 1:
cur_iou = 0.0
bad_pred += 1
else:
cur_iou = iou(gt, pred_times[0])
ious.append(cur_iou)
except Exception as e:
eval_logger.warning(f"Failed to process Charades-STA result: {e}")
ious.append(0.0)
bad_pred += 1

total = len(ious)
eval_logger.info(f"Charades-STA bad predictions: {bad_pred}/{total}")
metrics = {}
for thr in [0.3, 0.5, 0.7]:
count = sum(1 for value in ious if value >= thr)
metrics[f"IOU@{int(thr * 10)}"] = count * 100 / total if total else 0
metrics["mIOU"] = sum(ious) * 100 / total if total else 0
return metrics


def temporal_grounding_aggregate_iou3(results, args):
return _temporal_grounding_compute_metrics(results)["IOU@3"]


def temporal_grounding_aggregate_iou5(results, args):
return _temporal_grounding_compute_metrics(results)["IOU@5"]


def temporal_grounding_aggregate_iou7(results, args):
return _temporal_grounding_compute_metrics(results)["IOU@7"]


def temporal_grounding_aggregate_miou(results, args):
return _temporal_grounding_compute_metrics(results)["mIOU"]


def temporal_grounding_aggregate_charades(results, args):
Expand Down
36 changes: 36 additions & 0 deletions lmms_eval/tasks/timelens/_default_yaml_template
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
dataset_path: kcz358/timelens
dataset_kwargs:
cache_dir: timelens
video: True

generation_kwargs:
max_new_tokens: 50
temperature: 0
top_p: 1.0
num_beams: 1
do_sample: false

output_type: generate_until
doc_to_visual: !function utils.timelens_doc_to_visual
doc_to_text: !function utils.timelens_doc_to_text
doc_to_target: !function utils.timelens_doc_to_target
process_results: !function utils.timelens_process_results

metric_list:
- metric: timelens_IOU@3
aggregation: !function utils.timelens_aggregate_iou3
higher_is_better: true
- metric: timelens_IOU@5
aggregation: !function utils.timelens_aggregate_iou5
higher_is_better: true
- metric: timelens_IOU@7
aggregation: !function utils.timelens_aggregate_iou7
higher_is_better: true
- metric: timelens_mIOU
aggregation: !function utils.timelens_aggregate_miou
higher_is_better: true

lmms_eval_specific_kwargs:
default:
pre_prompt: "Please find the visual event described by the sentence '"
post_prompt: "', determining its starting and ending times. The format should be: 'The event happens in <start time> - <end time> seconds'."
5 changes: 5 additions & 0 deletions lmms_eval/tasks/timelens/timelens.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
group: timelens
task:
- timelens_activitynet
- timelens_charades
- timelens_qvhighlights
4 changes: 4 additions & 0 deletions lmms_eval/tasks/timelens/timelens_activitynet.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include: _default_yaml_template

task: timelens_activitynet
test_split: activitynet
4 changes: 4 additions & 0 deletions lmms_eval/tasks/timelens/timelens_charades.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include: _default_yaml_template

task: timelens_charades
test_split: charades
4 changes: 4 additions & 0 deletions lmms_eval/tasks/timelens/timelens_qvhighlights.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
include: _default_yaml_template

task: timelens_qvhighlights
test_split: qvhighlights
Loading
Loading