
Commit 1579b37

committed
Evaluation scripts after study, broken out from sign-language-processing#30
1 parent d3347a1 commit 1579b37

20 files changed, +3831 -12 lines

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -7,4 +7,5 @@ pose_evaluation.egg-info/
 *.npz
 *.code-workspace
 .vscode/
-coverage.lcov
+coverage.lcov
+*.zip

pose_evaluation/evaluation/dataset_parsing/__init__.py

Whitespace-only changes.
pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
import typer
from pathlib import Path

import pandas as pd

from pose_evaluation.evaluation.dataset_parsing.collect_files import parse_id_and_model_name_from_embedding_file
from pose_evaluation.evaluation.dataset_parsing.dataset_utils import file_paths_list_to_df, DatasetDFCol

app = typer.Typer()


def get_embeddings_df(embeddings_folder: Path, split_id_on_dash=False):
    embedding_files = list(embeddings_folder.rglob("*using-model*.npy"))
    prefix = "EMBEDDING"
    files_df = file_paths_list_to_df(embedding_files, prefix=prefix)
    # id, model_name = parse_id_and_model_name_from_embedding_file(path)
    # files_df[DatasetDFCol.VIDEO_ID]
    # files_df[DatasetDFCol.EMBEDDING_MODEL]
    files_df[[DatasetDFCol.VIDEO_ID, DatasetDFCol.EMBEDDING_MODEL]] = files_df[f"{prefix}_FILE_PATH"].apply(
        lambda path: pd.Series(parse_id_and_model_name_from_embedding_file(path))
    )

    return files_df


@app.command()
def process(
    input_csv: Path = typer.Argument(..., exists=True, help="Path to input CSV file"),
    embeddings_folder: Path = typer.Argument(..., exists=True, file_okay=False, help="Path to folder with embeddings"),
    output_csv: Path = typer.Option(Path("output.csv"), help="Path to output CSV file"),
):
    typer.echo(f"Reading input CSV: {input_csv}")
    typer.echo(f"Using embeddings from: {embeddings_folder}")
    typer.echo(f"Will write output to: {output_csv}")

    dataset_df = pd.read_csv(input_csv)
    typer.echo("**** Dataset DF: ****")
    typer.echo(dataset_df.head())
    typer.echo(dataset_df.info())
    typer.echo()

    typer.echo("**** Embedding DF: ****")
    embeddings_df = get_embeddings_df(embeddings_folder)
    typer.echo(embeddings_df.head())
    typer.echo(embeddings_df.info())
    typer.echo()

    typer.echo("**** Merged DF: ****")
    merged_df = dataset_df.merge(embeddings_df, on=DatasetDFCol.VIDEO_ID, how="left")
    if len(merged_df) == len(dataset_df):
        # ASL Citizen has slightly different filenames...
        embeddings_df[DatasetDFCol.VIDEO_ID] = embeddings_df[DatasetDFCol.VIDEO_ID].astype(str).str.split("-").str[0]
        merged_df = dataset_df.merge(embeddings_df, on=DatasetDFCol.VIDEO_ID, how="left")
    typer.echo(merged_df.head())
    typer.echo(merged_df.info())
    typer.echo()

    merged_df.to_csv(output_csv, index=False)


if __name__ == "__main__":
    app()
# python /opt/home/cleong/projects/pose-evaluation/pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py dataset_dfs/semlex.csv /opt/home/cleong/data/Sem-Lex/embeddings/ --output-csv /opt/home/cleong/projects/pose-evaluation/dataset_dfs_with_embed/semlex.csv
# python /opt/home/cleong/projects/pose-evaluation/pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py dataset_dfs/asl-citizen.csv /opt/home/cleong/data/ASL_Citizen/re-embed/ --output-csv /opt/home/cleong/projects/pose-evaluation/dataset_dfs_with_embed/asl-citizen.csv
# python /opt/home/cleong/projects/pose-evaluation/pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py dataset_dfs/popsign_asl.csv /opt/home/cleong/projects/semantic_and_visual_similarity/local_data/PopSignASL/embeddings --output-csv /opt/home/cleong/projects/pose-evaluation/dataset_dfs_with_embed/popsign_asl.csv
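A quick sanity check on the merged CSV that process writes out is to count rows that did not pick up an embedding file. A minimal sketch, assuming the column names follow the EMBEDDING prefix used in get_embeddings_df and the semlex output path from the example commands above:

import pandas as pd

# Sketch: verify the left merge actually attached embeddings (path and columns assumed as above).
merged = pd.read_csv("dataset_dfs_with_embed/semlex.csv")
missing = merged["EMBEDDING_FILE_PATH"].isna()
print(f"{missing.sum()} of {len(merged)} rows have no matching embedding file")
print(merged["EMBEDDING_MODEL"].value_counts(dropna=False))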
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
from pathlib import Path
from typing import Optional

import pandas as pd
import typer

from pose_evaluation.evaluation.dataset_parsing.collect_files import collect_files_main
from pose_evaluation.evaluation.dataset_parsing.dataset_utils import (
    file_paths_list_to_df,
    deduplicate_by_video_id,
    df_to_standardized_df,
    DatasetDFCol,
)

app = typer.Typer()


@app.command()
def collect(
    dataset_path: Path = typer.Argument(..., exists=True, file_okay=False),
    pose_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    metadata_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    video_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    out: Optional[Path] = typer.Option(None, exists=False, file_okay=True),
):
    """Read in Sem-Lex files and metadata, combine into one dataframe, and save out to CSV."""
    # pylint: disable=duplicate-code
    result = collect_files_main(
        dataset_path=dataset_path,
        pose_files_path=pose_files_path,
        metadata_path=metadata_path,
        video_files_path=video_files_path,
        pose_patterns=["*.pose"],
        metadata_patterns=["train.csv", "val.csv", "test.csv"],
        video_patterns=["*.mp4"],
    )
    # pylint: enable=duplicate-code

    for name, paths in result.items():
        typer.echo(f"🎯 Found {len(paths)} {name.replace('_', ' ')}. Samples:")
        for path in paths[:3]:
            typer.echo(f"* {path}")

    # metadata
    meta_dfs = []
    for meta_file in result["METADATA_FILES"]:
        split_name = meta_file.stem

        df = pd.read_csv(meta_file, index_col=0, header=0)

        # 8336197103293617-CHAMP.mp4 becomes 8336197103293617
        df[DatasetDFCol.VIDEO_ID] = df["Video file"].apply(lambda x: Path(x).stem.split("-")[0])

        typer.echo(f"Found metadata file: {meta_file}")

        df["SPLIT"] = split_name
        df = df_to_standardized_df(
            df,
        )

        meta_dfs.append(df)

    df = pd.concat(meta_dfs)
    typer.echo(f"Deduplicating by video ID and split, currently there are {len(df)} rows")
    df = deduplicate_by_video_id(
        df, video_id_col="VIDEO_ID", split_col="SPLIT", priority_order=["train", "val", "test"]
    )
    typer.echo(f"There are now {len(df)} rows")

    for prefix in ["POSE", "VIDEO"]:
        files_df = file_paths_list_to_df(result[f"{prefix}_FILES"], prefix=prefix)
        files_df[DatasetDFCol.VIDEO_ID] = files_df[f"{prefix}_FILE_PATH"].apply(lambda x: Path(x).stem.split("-")[0])
        # typer.echo(files_df.head())
        typer.echo(f"Merging {len(files_df)} {prefix} files into df")
        df = df.merge(files_df, on=DatasetDFCol.VIDEO_ID, how="left")

    df = df_to_standardized_df(
        df,
        video_id_col=DatasetDFCol.VIDEO_ID,
        split_col=DatasetDFCol.SPLIT,
        gloss_col=DatasetDFCol.GLOSS,
    )
    typer.echo(df.info())
    typer.echo(df.head())
    if out is not None:
        if out.name.endswith(".csv"):
            df.to_csv(out, index=False)
        if out.name.endswith(".json"):
            df.to_json(out)


if __name__ == "__main__":
    app()
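The deduplicate_by_video_id helper comes from dataset_utils, which is not part of this diff. Based only on the call site above, a hedged pandas sketch of the intended behavior (keep one row per video ID, preferring train over val over test) might look like:

import pandas as pd

def dedupe_by_video_id_sketch(df: pd.DataFrame, video_id_col="VIDEO_ID", split_col="SPLIT",
                              priority_order=("train", "val", "test")) -> pd.DataFrame:
    # Sketch only; the real deduplicate_by_video_id in dataset_utils may differ.
    rank = {split: i for i, split in enumerate(priority_order)}  # train=0 beats val=1 beats test=2
    ranked = df.assign(_rank=df[split_col].map(rank))
    # Keep the highest-priority (lowest-rank) row for each video id
    deduped = ranked.sort_values("_rank").drop_duplicates(subset=video_id_col, keep="first")
    return deduped.drop(columns="_rank")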
pose_evaluation/evaluation/dataset_parsing/collect_files.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
from pathlib import Path
import fnmatch
from typing import Optional, List, Dict
import typer

app = typer.Typer()


def parse_model_name_from_embedding_file(file_path: Path) -> str:
    """
    Extract the model name from a file name like:
    "some-id-using-model-modelname_checkpoint_best.npy"
    """
    name = file_path.name
    if "-using-model-" not in name:
        raise ValueError(f"No model name found in file: {file_path}")
    return name.split("-using-model-")[-1].removesuffix(".npy")


def collect_files_once(
    base: Path,
    pattern_map: Dict[str, List[str]],
) -> Dict[str, List[Path]]:
    """Walk the directory once and classify files by matching patterns."""
    result = {key: [] for key in pattern_map}
    for f in base.rglob("*"):
        if not f.is_file():
            continue
        for category, patterns in pattern_map.items():
            if any(fnmatch.fnmatch(f.name, pattern) for pattern in patterns):
                result[category].append(f)

    # Sort all lists for consistency
    for files in result.values():
        files.sort()

    for name, paths in result.items():
        typer.echo(f"🎯 Found {len(paths)} {name.replace('_', ' ')}. Samples:")
        for path in paths[:3]:
            if name == "embedding":
                try:
                    model_name = parse_model_name_from_embedding_file(path)
                    typer.echo(f"* {path} → model: {model_name}")
                except ValueError as e:
                    typer.echo(f"* {path} (⚠️ {e})")
            else:
                typer.echo(f"* {path}")
    return result


def collect_files_main(
    dataset_path: Path,
    pose_files_path: Optional[Path] = None,
    metadata_path: Optional[Path] = None,
    video_files_path: Optional[Path] = None,
    embedding_files_path: Optional[Path] = None,
    pose_patterns: Optional[List[str]] = None,
    metadata_patterns: Optional[List[str]] = None,
    video_patterns: Optional[List[str]] = None,
    embedding_patterns: Optional[List[str]] = None,
):
    """Efficiently collect all files by walking each root directory only once."""
    if pose_patterns is None:
        pose_patterns = ["*.pose", "*.pose.zst"]
    if metadata_patterns is None:
        metadata_patterns = ["*.csv"]
    if video_patterns is None:
        video_patterns = ["*.mp4", "*.avi", "*.mov"]
    if embedding_patterns is None:
        embedding_patterns = ["*.npy"]

    result = {}

    search_roots = {
        "pose": (pose_files_path or dataset_path, pose_patterns),
        "metadata": (metadata_path or dataset_path, metadata_patterns),
        "video": (video_files_path or dataset_path, video_patterns),
        "embedding": (embedding_files_path or dataset_path, embedding_patterns),
    }

    # Group by root to avoid repeated walks
    root_to_keys = {}
    for key, (root, patterns) in search_roots.items():
        if patterns is not None:
            root_to_keys.setdefault(root, []).append((key, patterns))

    for root, keys_and_patterns in root_to_keys.items():
        pattern_map = dict(keys_and_patterns)
        root_results = collect_files_once(root, pattern_map)
        result.update({f"{key.upper()}_FILES": root_results[key] for key in pattern_map})

    return result


@app.command()
def collect_files_cli(
    dataset_path: Path = typer.Argument(..., exists=True, file_okay=False),
    pose_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    metadata_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    video_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    embedding_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    pose_patterns: List[str] = typer.Option(["*.pose", "*.pose.zst"]),
    metadata_patterns: List[str] = typer.Option(["*.csv"]),
    video_patterns: List[str] = typer.Option(["*.mp4", "*.avi", "*.mov"]),
    embedding_patterns: List[str] = typer.Option(["*.npy"]),
):
    """CLI wrapper around collect_files_main"""
    result = collect_files_main(
        dataset_path=dataset_path,
        pose_files_path=pose_files_path,
        metadata_path=metadata_path,
        video_files_path=video_files_path,
        embedding_files_path=embedding_files_path,
        pose_patterns=pose_patterns,
        metadata_patterns=metadata_patterns,
        video_patterns=video_patterns,
        embedding_patterns=embedding_patterns,
    )

    for name, paths in result.items():
        typer.echo(f"✅ Found {len(paths)} {name.replace('_', ' ')}")


if __name__ == "__main__":
    app()
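Since collect_files_main groups search roots and walks each one only once, the returned dict is keyed as f"{key.upper()}_FILES". A hedged usage sketch (the dataset root below is hypothetical):

from pathlib import Path

from pose_evaluation.evaluation.dataset_parsing.collect_files import collect_files_main

found = collect_files_main(
    dataset_path=Path("/data/Sem-Lex"),  # hypothetical dataset root
    pose_patterns=["*.pose"],
    metadata_patterns=["train.csv", "val.csv", "test.csv"],
    video_patterns=["*.mp4"],
)
# Keys follow the f"{key.upper()}_FILES" convention used above.
for key in ("POSE_FILES", "METADATA_FILES", "VIDEO_FILES", "EMBEDDING_FILES"):
    print(key, len(found.get(key, [])))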
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
#!/usr/bin/env python3

import argparse
from pathlib import Path
import pandas as pd

from tqdm import tqdm

from pose_evaluation.evaluation.score_dataframe_format import load_score_csv


def convert_csvs_to_parquet(
    folder: Path, out_dir: Path | None = None, remove_original: bool = False, score_csv_format=False
):
    csv_files = list(folder.glob("*.csv"))
    if not csv_files:
        print(f"No CSV files found in {folder}")
        return

    if score_csv_format:
        print("Using score csv format")

    out_dir = out_dir or folder
    out_dir.mkdir(parents=True, exist_ok=True)

    for csv_path in tqdm(csv_files, desc="Converting CSV files to Parquet"):
        if score_csv_format:
            df = load_score_csv(csv_path)
        else:
            df = pd.read_csv(csv_path)
        parquet_path = out_dir / csv_path.with_suffix(".parquet").name
        df.to_parquet(parquet_path, index=False)
        # print(f"Converted: {csv_path.name} → {parquet_path}")
        if remove_original:
            csv_path.unlink()
            print(f"Deleted original: {csv_path.name}")


def main():
    parser = argparse.ArgumentParser(description="Convert CSV files in a folder to Parquet.")
    parser.add_argument("folder", type=Path, help="Path to folder containing CSV files")
    parser.add_argument("-o", "--out", type=Path, help="Optional output folder for Parquet files")
    parser.add_argument("--remove", action="store_true", help="Remove original CSV files after conversion")
    parser.add_argument(
        "--not-score-files",
        action="store_true",
        help="By default, loads with score-CSV-specific columns and dtypes. If this flag is given, just uses pd.read_csv.",
    )
    args = parser.parse_args()

    if not args.folder.is_dir():
        print(f"Error: {args.folder} is not a directory.")
        return
    use_score_files_format = not args.not_score_files
    convert_csvs_to_parquet(
        folder=args.folder, out_dir=args.out, remove_original=args.remove, score_csv_format=use_score_files_format
    )


if __name__ == "__main__":
    main()
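A possible invocation and round-trip check for the converter; the script's filename is not shown in this view, so the name and paths below are hypothetical:

# python convert_csvs_to_parquet.py scores/ -o scores_parquet/ --remove   (hypothetical script name and paths)

import pandas as pd

# Read one converted file back to confirm the Parquet round trip (hypothetical output path).
df = pd.read_parquet("scores_parquet/example.parquet")
print(df.dtypes)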
