
Commit 1579b37

committed
Evaluation scripts after study, broken out from sign-language-processing#30
1 parent d3347a1 commit 1579b37

20 files changed, +3831 -12 lines

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -7,4 +7,5 @@ pose_evaluation.egg-info/
 *.npz
 *.code-workspace
 .vscode/
-coverage.lcov
+coverage.lcov
+*.zip

pose_evaluation/evaluation/dataset_parsing/__init__.py

Whitespace-only changes.
pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py

Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
import typer
from pathlib import Path

import pandas as pd

from pose_evaluation.evaluation.dataset_parsing.collect_files import parse_id_and_model_name_from_embedding_file
from pose_evaluation.evaluation.dataset_parsing.dataset_utils import file_paths_list_to_df, DatasetDFCol

app = typer.Typer()


def get_embeddings_df(embeddings_folder: Path, split_id_on_dash=False):
    embedding_files = list(embeddings_folder.rglob("*using-model*.npy"))
    prefix = "EMBEDDING"
    files_df = file_paths_list_to_df(embedding_files, prefix=prefix)
    # id, model_name = parse_id_and_model_name_from_embedding_file(path)
    # files_df[DatasetDFCol.VIDEO_ID]
    # files_df[DatasetDFCol.EMBEDDING_MODEL]
    files_df[[DatasetDFCol.VIDEO_ID, DatasetDFCol.EMBEDDING_MODEL]] = files_df[f"{prefix}_FILE_PATH"].apply(
        lambda path: pd.Series(parse_id_and_model_name_from_embedding_file(path))
    )

    return files_df


@app.command()
def process(
    input_csv: Path = typer.Argument(..., exists=True, help="Path to input CSV file"),
    embeddings_folder: Path = typer.Argument(..., exists=True, file_okay=False, help="Path to folder with embeddings"),
    output_csv: Path = typer.Option(Path("output.csv"), help="Path to output CSV file"),
):
    typer.echo(f"Reading input CSV: {input_csv}")
    typer.echo(f"Using embeddings from: {embeddings_folder}")
    typer.echo(f"Will write output to: {output_csv}")

    dataset_df = pd.read_csv(input_csv)
    typer.echo("**** Dataset DF: ****")
    typer.echo(dataset_df.head())
    typer.echo(dataset_df.info())
    typer.echo()

    typer.echo("**** Embedding DF: ****")
    embeddings_df = get_embeddings_df(embeddings_folder)
    typer.echo(embeddings_df.head())
    typer.echo(embeddings_df.info())
    typer.echo()

    typer.echo("**** Merged DF: ****")
    merged_df = dataset_df.merge(embeddings_df, on=DatasetDFCol.VIDEO_ID, how="left")
    if len(merged_df) == len(dataset_df):
        # ASL Citizen has slightly different filenames...
        embeddings_df[DatasetDFCol.VIDEO_ID] = embeddings_df[DatasetDFCol.VIDEO_ID].astype(str).str.split("-").str[0]
        merged_df = dataset_df.merge(embeddings_df, on=DatasetDFCol.VIDEO_ID, how="left")
    typer.echo(merged_df.head())
    typer.echo(merged_df.info())
    typer.echo()

    merged_df.to_csv(output_csv, index=False)


if __name__ == "__main__":
    app()
# python /opt/home/cleong/projects/pose-evaluation/pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py dataset_dfs/semlex.csv /opt/home/cleong/data/Sem-Lex/embeddings/ --output-csv /opt/home/cleong/projects/pose-evaluation/dataset_dfs_with_embed/semlex.csv
# python /opt/home/cleong/projects/pose-evaluation/pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py dataset_dfs/asl-citizen.csv /opt/home/cleong/data/ASL_Citizen/re-embed/ --output-csv /opt/home/cleong/projects/pose-evaluation/dataset_dfs_with_embed/asl-citizen.csv
# python /opt/home/cleong/projects/pose-evaluation/pose_evaluation/evaluation/dataset_parsing/add_embeddings_to_dataset_dataframe.py dataset_dfs/popsign_asl.csv /opt/home/cleong/projects/semantic_and_visual_similarity/local_data/PopSignASL/embeddings --output-csv /opt/home/cleong/projects/pose-evaluation/dataset_dfs_with_embed/popsign_asl.csv
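A quick sanity check on the merged CSV that process writes out is to count rows that did not pick up an embedding file. A minimal sketch, assuming the column names follow the EMBEDDING prefix used in get_embeddings_df and the semlex output path from the example commands above:

import pandas as pd

# Sketch: verify the left merge actually attached embeddings (path and columns assumed as above).
merged = pd.read_csv("dataset_dfs_with_embed/semlex.csv")
missing = merged["EMBEDDING_FILE_PATH"].isna()
print(f"{missing.sum()} of {len(merged)} rows have no matching embedding file")
print(merged["EMBEDDING_MODEL"].value_counts(dropna=False))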
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
from pathlib import Path
from typing import Optional

import pandas as pd
import typer

from pose_evaluation.evaluation.dataset_parsing.collect_files import collect_files_main
from pose_evaluation.evaluation.dataset_parsing.dataset_utils import (
    file_paths_list_to_df,
    deduplicate_by_video_id,
    df_to_standardized_df,
    DatasetDFCol,
)

app = typer.Typer()


@app.command()
def collect(
    dataset_path: Path = typer.Argument(..., exists=True, file_okay=False),
    pose_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    metadata_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    video_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    out: Optional[Path] = typer.Option(None, exists=False, file_okay=True),
):
    """Read in Sem-Lex files and metadata, combine into one dataframe, and save out to CSV."""
    # pylint: disable=duplicate-code
    result = collect_files_main(
        dataset_path=dataset_path,
        pose_files_path=pose_files_path,
        metadata_path=metadata_path,
        video_files_path=video_files_path,
        pose_patterns=["*.pose"],
        metadata_patterns=["train.csv", "val.csv", "test.csv"],
        video_patterns=["*.mp4"],
    )
    # pylint: enable=duplicate-code

    for name, paths in result.items():
        typer.echo(f"🎯 Found {len(paths)} {name.replace('_', ' ')}. Samples:")
        for path in paths[:3]:
            typer.echo(f"* {path}")

    # metadata
    meta_dfs = []
    for meta_file in result["METADATA_FILES"]:
        split_name = meta_file.stem

        df = pd.read_csv(meta_file, index_col=0, header=0)

        # 8336197103293617-CHAMP.mp4 becomes 8336197103293617
        df[DatasetDFCol.VIDEO_ID] = df["Video file"].apply(lambda x: Path(x).stem.split("-")[0])

        typer.echo(f"Found metadata file: {meta_file}")

        df["SPLIT"] = split_name
        df = df_to_standardized_df(
            df,
        )

        meta_dfs.append(df)

    df = pd.concat(meta_dfs)
    typer.echo(f"Deduplicating by video ID and split, currently there are {len(df)} rows")
    df = deduplicate_by_video_id(
        df, video_id_col="VIDEO_ID", split_col="SPLIT", priority_order=["train", "val", "test"]
    )
    typer.echo(f"There are now {len(df)} rows")

    for prefix in ["POSE", "VIDEO"]:
        files_df = file_paths_list_to_df(result[f"{prefix}_FILES"], prefix=prefix)
        files_df[DatasetDFCol.VIDEO_ID] = files_df[f"{prefix}_FILE_PATH"].apply(lambda x: Path(x).stem.split("-")[0])
        # typer.echo(files_df.head())
        typer.echo(f"Merging {len(files_df)} {prefix} files into df")
        df = df.merge(files_df, on=DatasetDFCol.VIDEO_ID, how="left")

    df = df_to_standardized_df(
        df,
        video_id_col=DatasetDFCol.VIDEO_ID,
        split_col=DatasetDFCol.SPLIT,
        gloss_col=DatasetDFCol.GLOSS,
    )
    typer.echo(df.info())
    typer.echo(df.head())
    if out is not None:
        if out.name.endswith(".csv"):
            df.to_csv(out, index=False)
        if out.name.endswith(".json"):
            df.to_json(out)


if __name__ == "__main__":
    app()
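The deduplicate_by_video_id helper comes from dataset_utils, which is not part of this diff. Based only on the call site above, a hedged pandas sketch of the intended behavior (keep one row per video ID, preferring train over val over test) might look like:

import pandas as pd

def dedupe_by_video_id_sketch(df: pd.DataFrame, video_id_col="VIDEO_ID", split_col="SPLIT",
                              priority_order=("train", "val", "test")) -> pd.DataFrame:
    # Sketch only; the real deduplicate_by_video_id in dataset_utils may differ.
    rank = {split: i for i, split in enumerate(priority_order)}  # train=0 beats val=1 beats test=2
    ranked = df.assign(_rank=df[split_col].map(rank))
    # Keep the highest-priority (lowest-rank) row for each video id
    deduped = ranked.sort_values("_rank").drop_duplicates(subset=video_id_col, keep="first")
    return deduped.drop(columns="_rank")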
pose_evaluation/evaluation/dataset_parsing/collect_files.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
from pathlib import Path
import fnmatch
from typing import Optional, List, Dict
import typer

app = typer.Typer()


def parse_model_name_from_embedding_file(file_path: Path) -> str:
    """
    Extract the model name from a file name like:
    "some-id-using-model-modelname_checkpoint_best.npy"
    """
    name = file_path.name
    if "-using-model-" not in name:
        raise ValueError(f"No model name found in file: {file_path}")
    return name.split("-using-model-")[-1].removesuffix(".npy")


def collect_files_once(
    base: Path,
    pattern_map: Dict[str, List[str]],
) -> Dict[str, List[Path]]:
    """Walk the directory once and classify files by matching patterns."""
    result = {key: [] for key in pattern_map}
    for f in base.rglob("*"):
        if not f.is_file():
            continue
        for category, patterns in pattern_map.items():
            if any(fnmatch.fnmatch(f.name, pattern) for pattern in patterns):
                result[category].append(f)

    # Sort all lists for consistency
    for files in result.values():
        files.sort()

    for name, paths in result.items():
        typer.echo(f"🎯 Found {len(paths)} {name.replace('_', ' ')}. Samples:")
        for path in paths[:3]:
            if name == "embedding":
                try:
                    model_name = parse_model_name_from_embedding_file(path)
                    typer.echo(f"* {path} → model: {model_name}")
                except ValueError as e:
                    typer.echo(f"* {path} (⚠️ {e})")
            else:
                typer.echo(f"* {path}")
    return result


def collect_files_main(
    dataset_path: Path,
    pose_files_path: Optional[Path] = None,
    metadata_path: Optional[Path] = None,
    video_files_path: Optional[Path] = None,
    embedding_files_path: Optional[Path] = None,
    pose_patterns: Optional[List[str]] = None,
    metadata_patterns: Optional[List[str]] = None,
    video_patterns: Optional[List[str]] = None,
    embedding_patterns: Optional[List[str]] = None,
):
    """Efficiently collect all files by walking each root directory only once."""
    if pose_patterns is None:
        pose_patterns = ["*.pose", "*.pose.zst"]
    if metadata_patterns is None:
        metadata_patterns = ["*.csv"]
    if video_patterns is None:
        video_patterns = ["*.mp4", "*.avi", "*.mov"]
    if embedding_patterns is None:
        embedding_patterns = ["*.npy"]

    result = {}

    search_roots = {
        "pose": (pose_files_path or dataset_path, pose_patterns),
        "metadata": (metadata_path or dataset_path, metadata_patterns),
        "video": (video_files_path or dataset_path, video_patterns),
        "embedding": (embedding_files_path or dataset_path, embedding_patterns),
    }

    # Group by root to avoid repeated walks
    root_to_keys = {}
    for key, (root, patterns) in search_roots.items():
        if patterns is not None:
            root_to_keys.setdefault(root, []).append((key, patterns))

    for root, keys_and_patterns in root_to_keys.items():
        pattern_map = dict(keys_and_patterns)
        root_results = collect_files_once(root, pattern_map)
        result.update({f"{key.upper()}_FILES": root_results[key] for key in pattern_map})

    return result


@app.command()
def collect_files_cli(
    dataset_path: Path = typer.Argument(..., exists=True, file_okay=False),
    pose_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    metadata_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    video_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    embedding_files_path: Optional[Path] = typer.Option(None, exists=True, file_okay=False),
    pose_patterns: List[str] = typer.Option(["*.pose", "*.pose.zst"]),
    metadata_patterns: List[str] = typer.Option(["*.csv"]),
    video_patterns: List[str] = typer.Option(["*.mp4", "*.avi", "*.mov"]),
    embedding_patterns: List[str] = typer.Option(["*.npy"]),
):
    """CLI wrapper around collect_files_main"""
    result = collect_files_main(
        dataset_path=dataset_path,
        pose_files_path=pose_files_path,
        metadata_path=metadata_path,
        video_files_path=video_files_path,
        embedding_files_path=embedding_files_path,
        pose_patterns=pose_patterns,
        metadata_patterns=metadata_patterns,
        video_patterns=video_patterns,
        embedding_patterns=embedding_patterns,
    )

    for name, paths in result.items():
        typer.echo(f"✅ Found {len(paths)} {name.replace('_', ' ')}")


if __name__ == "__main__":
    app()
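Since collect_files_main groups search roots and walks each one only once, the returned dict is keyed as f"{key.upper()}_FILES". A hedged usage sketch (the dataset root below is hypothetical):

from pathlib import Path

from pose_evaluation.evaluation.dataset_parsing.collect_files import collect_files_main

found = collect_files_main(
    dataset_path=Path("/data/Sem-Lex"),  # hypothetical dataset root
    pose_patterns=["*.pose"],
    metadata_patterns=["train.csv", "val.csv", "test.csv"],
    video_patterns=["*.mp4"],
)
# Keys follow the f"{key.upper()}_FILES" convention used above.
for key in ("POSE_FILES", "METADATA_FILES", "VIDEO_FILES", "EMBEDDING_FILES"):
    print(key, len(found.get(key, [])))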
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
#!/usr/bin/env python3

import argparse
from pathlib import Path
import pandas as pd

from tqdm import tqdm

from pose_evaluation.evaluation.score_dataframe_format import load_score_csv


def convert_csvs_to_parquet(
    folder: Path, out_dir: Path | None = None, remove_original: bool = False, score_csv_format=False
):
    csv_files = list(folder.glob("*.csv"))
    if not csv_files:
        print(f"No CSV files found in {folder}")
        return

    if score_csv_format:
        print("Using score csv format")

    out_dir = out_dir or folder
    out_dir.mkdir(parents=True, exist_ok=True)

    for csv_path in tqdm(csv_files, desc="Converting CSV files to Parquet"):
        if score_csv_format:
            df = load_score_csv(csv_path)
        else:
            df = pd.read_csv(csv_path)
        parquet_path = out_dir / csv_path.with_suffix(".parquet").name
        df.to_parquet(parquet_path, index=False)
        # print(f"Converted: {csv_path.name} → {parquet_path}")
        if remove_original:
            csv_path.unlink()
            print(f"Deleted original: {csv_path.name}")


def main():
    parser = argparse.ArgumentParser(description="Convert CSV files in a folder to Parquet.")
    parser.add_argument("folder", type=Path, help="Path to folder containing CSV files")
    parser.add_argument("-o", "--out", type=Path, help="Optional output folder for Parquet files")
    parser.add_argument("--remove", action="store_true", help="Remove original CSV files after conversion")
    parser.add_argument(
        "--not-score-files",
        action="store_true",
        help="By default, loads with score-CSV-specific columns and dtypes. If this flag is given, just uses pd.read_csv.",
    )
    args = parser.parse_args()

    if not args.folder.is_dir():
        print(f"Error: {args.folder} is not a directory.")
        return
    use_score_files_format = not args.not_score_files
    convert_csvs_to_parquet(
        folder=args.folder, out_dir=args.out, remove_original=args.remove, score_csv_format=use_score_files_format
    )


if __name__ == "__main__":
    main()
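A possible invocation and round-trip check for the converter; the script's filename is not shown in this view, so the name and paths below are hypothetical:

# python convert_csvs_to_parquet.py scores/ -o scores_parquet/ --remove   (hypothetical script name and paths)

import pandas as pd

# Read one converted file back to confirm the Parquet round trip (hypothetical output path).
df = pd.read_parquet("scores_parquet/example.parquet")
print(df.dtypes)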
