forked from Mobile-and-Wearable-Sensing-Lab/ISL_Web
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_mapping_multiuser.py
More file actions
149 lines (124 loc) · 5.08 KB
/
build_mapping_multiuser.py
File metadata and controls
149 lines (124 loc) · 5.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import re
import csv
from pathlib import Path
# --- Configuration -----------------------------------------------------------

# Folder holding the reference videos. Each reference file is expected to be
# named after the word it shows (e.g. hello.mp4).
REF_DIR = Path(r"/home/antpc/Downloads/5000_RTH_Videos (Copy)")  # <-- change if needed

# Folders holding the collected clips, keyed by user id. A user may have
# several root folders (user 002 does); every root is scanned recursively.
USER_DIRS = {
    "001": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER001/All_clips_USer001_18-01-2026"),
    ],
    "002": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/download_2026-01-05_15-16-42/user_002_clips/output_clips"),
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/user002_output_18-01-2026/output_folder"),
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER002/user002_output_18-01-2026/Skipped_output_folder_user002"),
    ],
    "003": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER003/All_user003_clips_18-01-2026"),
    ],
    "004": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER004/output_clips_20-01-2026_user004"),
    ],
    "005": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER005/All_clips_19-01-2026"),
    ],
    "006": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER006/user006_output_5-02-2026/clips"),
    ],
    "007": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER007/user007_output_05-02-2026/chopped"),
    ],
    "008": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER008/user008/user_008_06-02-2026/output_user008_06-02-2026"),
    ],
    "009": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER009/User_009_06-02-2026/Clips"),
    ],
    "010": [
        Path(r"/mnt/9a528fe4-4fe8-4dff-9a0c-8b1a3cf3d7ba/ISL_DATA_USER010/User010_06-02-2026/Clips"),
    ],
}

# File suffixes (lowercase, with the dot) that count as videos.
VIDEO_EXTS = {".mp4", ".mpg", ".mov", ".mkv", ".avi", ".webm"}

# Where the resulting mapping is written (relative to the working directory).
OUT_CSV = Path("mapping.csv")
def normalize_word(s: str) -> str:
    """Return *s* without surrounding whitespace and in lowercase."""
    trimmed = s.strip()
    return trimmed.lower()
def word_from_reference(ref_file: Path) -> str:
    """Return the lookup word for a reference video.

    The reference file's stem is taken to be the word itself and is passed
    through ``normalize_word`` (e.g. ``hello.mp4`` -> ``hello``).
    """
    stem = ref_file.stem
    return normalize_word(stem)
# Optional uploader prefix at the very start of the stem: "user001_", "u12-", ...
_USER_PREFIX_RE = re.compile(r"^(user|u)\d{1,3}[_-]*")
# Leading letters immediately followed by a double underscore: "word__...".
_DOUBLE_UNDERSCORE_RE = re.compile(r"([a-z]+)__")
# Trailing letters preceded by an underscore: "..._word".
_TRAILING_WORD_RE = re.compile(r"_([a-z]+)$")
# Plain leading run of letters (fallback).
_LEADING_WORD_RE = re.compile(r"([a-z]+)")


def word_from_collected_filename(filename: str) -> str:
    """Extract the signed word from a collected clip's filename.

    IMPORTANT: This is the only part you might need to tweak.

    The stem (filename without its last suffix) is stripped of surrounding
    whitespace and lowercased, and a leading ``user<digits>`` / ``u<digits>``
    prefix (with any following ``_`` / ``-``) is removed. The first rule that
    matches then wins:

    1. ``word__...``   -> ``word``  (letters at the start followed by ``__``)
    2. ``..._word``    -> ``word``  (letters at the very end after ``_``)
    3. fallback        -> the leading run of letters

    Examples:
        hello_001.mp4                   -> hello
        thankyou-user002-10.mp4         -> thankyou
        school12.mp4                    -> school
        hello__session123__clip001.mp4  -> hello
        clip12_water.mp4                -> water
        user001_hello.mp4               -> hello

    Files such as ``001_hello_0001.mp4`` (digits first, digits last) match no
    rule; they need a different rule to be added here.

    Args:
        filename: File name (or path) of the collected clip.

    Returns:
        The extracted lowercase word, or ``""`` when no rule matches.
    """
    # strip() mirrors normalize_word() on the reference side, so stray
    # whitespace around a name does not defeat the start-anchored rules.
    stem = Path(filename).stem.strip().lower()
    stem = _USER_PREFIX_RE.sub("", stem)

    # Order matters: the "__" rule must be tried before the trailing-word
    # rule, and the leading-letters rule is only a last resort.
    m = (
        _DOUBLE_UNDERSCORE_RE.match(stem)
        or _TRAILING_WORD_RE.search(stem)
        or _LEADING_WORD_RE.match(stem)
    )
    return m.group(1) if m else ""
def main():
    """Write ``OUT_CSV``, pairing every collected clip with its reference video.

    Steps:
      1. Index the video files directly inside ``REF_DIR`` by word.
      2. Recursively scan every folder in ``USER_DIRS`` for video files and
         derive each clip's word from its filename.
      3. Write one CSV row per clip whose word has a reference video, with
         columns ``word, reference_path, collected_path, user_id``, sorted by
         (word, user_id, collected_path).
      4. Print a summary of scanned, written and skipped clips.

    Raises:
        RuntimeError: if ``REF_DIR`` contains no video file.
    """
    # Build reference lookup: word -> ref_path.
    # iterdir() yields entries in arbitrary order, so iterate in sorted order:
    # when two reference files normalize to the same word, the one that wins
    # is then the same on every run (and the collision is reported).
    ref_lookup = {}
    for f in sorted(REF_DIR.iterdir()):
        if not f.is_file() or f.suffix.lower() not in VIDEO_EXTS:
            continue
        w = word_from_reference(f)
        if w in ref_lookup:
            print(f"⚠️ Duplicate reference for '{w}': {f} replaces {ref_lookup[w]}")
        ref_lookup[w] = str(f)

    if not ref_lookup:
        raise RuntimeError(f"No reference videos found in: {REF_DIR}")

    rows = []
    skipped_no_word = 0
    skipped_no_ref = 0
    total_collected = 0

    for user_id, roots in USER_DIRS.items():
        for root in roots:
            if not root.exists():
                # A missing folder is reported but does not abort the run.
                print(f"⚠️ Missing folder for user{user_id}: {root}")
                continue
            for vid in root.rglob("*"):
                if not vid.is_file() or vid.suffix.lower() not in VIDEO_EXTS:
                    continue
                total_collected += 1

                w = word_from_collected_filename(vid.name)
                if not w:
                    skipped_no_word += 1
                    continue

                ref_path = ref_lookup.get(w)
                if not ref_path:
                    skipped_no_ref += 1
                    continue

                rows.append([w, ref_path, str(vid), user_id])

    # Sort by word, then user id, then clip path for a stable, diffable CSV.
    rows.sort(key=lambda r: (r[0], r[3], r[2]))

    # newline="" lets the csv module control line endings itself.
    with OUT_CSV.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["word", "reference_path", "collected_path", "user_id"])
        writer.writerows(rows)

    print("\n====== SUMMARY ======")
    print(f"Reference words found: {len(ref_lookup)}")
    print(f"Collected videos scanned: {total_collected}")
    print(f"Rows written to {OUT_CSV}: {len(rows)}")
    print(f"Skipped (couldn't extract word from filename): {skipped_no_word}")
    print(f"Skipped (word not found in reference list): {skipped_no_ref}")
    print("=====================\n")
    print(f"Next: open {OUT_CSV} and verify a few rows look correct.")
# Run the mapping build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()