# ISL_Web/build_mapping_multiuser.py (branch: main) · AnushkaPandit-21/ISL_Web · GitHub
import re
import csv
from pathlib import Path

# 1) Reference folder: one video per word, where the file name is exactly the
#    word (e.g. hello.mp4). Only files directly inside this folder are indexed.
#    NOTE(review): the original note called this "your 500 words videos" while
#    the folder name says 5000 — confirm which count is right.
REF_DIR = Path(r"/home/antpc/Downloads/5000_RTH_Videos (Copy)")  # <-- change if needed

# 2) Collected folders (described by the author as ~22k videos), keyed by a
#    zero-padded user id string. Each user maps to a LIST of root folders that
#    are scanned recursively; user 002 has clips spread over several folders,
#    so all of them are listed.
USER_DIRS = {
    "001": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER001/All_clips_USer001_18-01-2026"),
    ],
    "002": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/download_2026-01-05_15-16-42/user_002_clips/output_clips"),
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/user002_output_18-01-2026/output_folder"),
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/user002_output_18-01-2026/Skipped_output_folder_user002"),
    ],
    "003": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER003/All_user003_clips_18-01-2026"),
    ],
    "004": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER004/output_clips_20-01-2026_user004"),
    ],
    "005": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER005/All_clips_19-01-2026"),
    ],
    "006": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER006/user006_output_5-02-2026/clips"),
    ],
    "007": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER007/user007_output_05-02-2026/chopped"),
    ],
    "008": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER008/user008/user_008_06-02-2026/output_user008_06-02-2026"),
    ],
    "009": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER009/User_009_06-02-2026/Clips"),
    ],
    "010": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER010/User010_06-02-2026/Clips"),
    ],
}

# File extensions treated as videos; compared against the lowercased suffix,
# so matching is case-insensitive.
VIDEO_EXTS = {".mp4", ".mpg", ".mov", ".mkv", ".avi", ".webm"}
# Output CSV. The path is relative, so the file is created in the current
# working directory of the process.
OUT_CSV = Path("mapping.csv")

def normalize_word(s: str) -> str:
    """Return *s* with surrounding whitespace removed and letters lowercased."""
    trimmed = s.strip()
    return trimmed.lower()

def word_from_reference(ref_file: Path) -> str:
    """Return the normalized word a reference video stands for.

    The reference file name is exactly the word, so the word is the name
    without its final extension (``hello.mp4`` -> ``hello``), passed through
    ``normalize_word``.
    """
    stem = ref_file.stem
    return normalize_word(stem)

def word_from_collected_filename(filename: str) -> str:
    """Extract the signed word from a collected clip's file name.

    IMPORTANT: this is the only part you might need to tweak when a new
    file-naming scheme shows up.

    The extension is dropped and the stem is lowercased. A leading
    ``user<digits>`` / ``u<digits>`` prefix (1-3 digits, plus any ``_`` or
    ``-`` right after it) is then removed. After that, the first rule that
    matches wins:

    1. ``word__...``: leading letters followed by a double underscore, e.g.
       ``hello__session123__clip001.mp4`` or ``hello__000001.mp4`` -> ``hello``.
    2. ``..._word``: trailing letters after the last underscore, e.g.
       ``clip12_hello.mp4`` -> ``hello`` (user007 naming).
    3. Fallback: the leading run of letters, e.g. ``hello_001.mp4``,
       ``thankyou-user002-10.mp4`` and ``school12.mp4``.

    Only ASCII ``a-z`` count as letters. A name that starts with digits and
    matches neither rule 1 nor rule 2 (e.g. ``001_hello_0001.mp4``) yields
    ``""``; supporting it would require an additional rule.

    Args:
        filename: File name (or path) of the collected video.

    Returns:
        The extracted lowercase word, or ``""`` if no rule matched.
    """
    stem = Path(filename).stem.lower()

    # Remove common prefixes like user001_, u001_.
    stem = re.sub(r"^(user|u)\d{1,3}[_-]*", "", stem)

    # Rule 1: word__session123__clip001 / word__000001 -> word.
    # (The original code repeated this exact check a second time further
    # down; that copy could never match and has been removed.)
    m = re.match(r"([a-z]+)__", stem)
    if m:
        return m.group(1)

    # Rule 2: something_word -> word. Checked before the fallback on purpose,
    # so "clip_water" resolves to "water" rather than "clip".
    m = re.search(r"_([a-z]+)$", stem)
    if m:
        return m.group(1)

    # Rule 3 (fallback): first letters at the start of the stem.
    m = re.match(r"([a-z]+)", stem)
    return m.group(1) if m else ""

def main():
    """Build the collected-to-reference video mapping and write it to OUT_CSV.

    Steps:
      1. Index every video file directly inside REF_DIR by its normalized
         word (see ``word_from_reference``).
      2. Recursively scan every root folder in USER_DIRS for video files,
         extract the word from each file name
         (see ``word_from_collected_filename``) and pair the clip with the
         reference video for that word.
      3. Write the rows, sorted by (word, user_id, collected_path), to
         OUT_CSV and print a summary of what was scanned and skipped.

    Missing user folders are reported and skipped. Clips whose word cannot
    be extracted, or whose word has no reference video, are only counted.

    Raises:
        RuntimeError: if REF_DIR contains no reference videos.
        OSError: propagated from ``REF_DIR.iterdir()`` if REF_DIR cannot be
            listed (e.g. it does not exist).
    """
    # Build reference lookup: word -> ref_path.
    # iterdir() yields entries in arbitrary order, so iterate in sorted order
    # and keep the first file per word. That way the chosen reference is the
    # same on every run when several files normalize to the same word
    # (e.g. Hello.mp4 / hello.mov).
    ref_lookup = {}
    for f in sorted(REF_DIR.iterdir()):
        if not f.is_file() or f.suffix.lower() not in VIDEO_EXTS:
            continue
        w = word_from_reference(f)
        if w in ref_lookup:
            print(f"⚠️ Duplicate reference for '{w}': keeping {ref_lookup[w]}, ignoring {f}")
            continue
        ref_lookup[w] = str(f)

    if not ref_lookup:
        raise RuntimeError(f"No reference videos found in: {REF_DIR}")

    rows = []
    skipped_no_word = 0
    skipped_no_ref = 0
    total_collected = 0

    for user_id, roots in USER_DIRS.items():
        for root in roots:
            if not root.exists():
                print(f"⚠️ Missing folder for user{user_id}: {root}")
                continue

            # rglob("*") walks the whole tree below this root.
            for vid in root.rglob("*"):
                if not vid.is_file() or vid.suffix.lower() not in VIDEO_EXTS:
                    continue

                total_collected += 1
                w = word_from_collected_filename(vid.name)
                if not w:
                    skipped_no_word += 1
                    continue

                ref_path = ref_lookup.get(w)
                if not ref_path:
                    skipped_no_ref += 1
                    continue

                rows.append([w, ref_path, str(vid), user_id])

    # Sort by word, then user id, then collected path, so the CSV is stable
    # regardless of filesystem traversal order.
    rows.sort(key=lambda r: (r[0], r[3], r[2]))

    # newline="" lets the csv module control line endings itself.
    with OUT_CSV.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["word", "reference_path", "collected_path", "user_id"])
        writer.writerows(rows)

    # Report the actual output path instead of a hard-coded file name, so
    # the messages stay correct if OUT_CSV is changed.
    print("\n====== SUMMARY ======")
    print(f"Reference words found: {len(ref_lookup)}")
    print(f"Collected videos scanned: {total_collected}")
    print(f"Rows written to {OUT_CSV}: {len(rows)}")
    print(f"Skipped (couldn't extract word from filename): {skipped_no_word}")
    print(f"Skipped (word not found in reference list): {skipped_no_ref}")
    print("=====================\n")
    print(f"Next: open {OUT_CSV} and verify a few rows look correct.")

# Build the mapping only when this file is executed as a script; importing it
# as a module does not trigger the scan.
if __name__ == "__main__":
    main()