-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: get_transcript.py
More file actions
159 lines (128 loc) · 5.51 KB
/
get_transcript.py
File metadata and controls
159 lines (128 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import json
import os
import argparse
import re
def extract_video_id(url):
    """Return the 11-character YouTube video ID contained in *url*.

    Lookup order:

    1. a ``v=`` query parameter (``watch?v=<id>``);
    2. an 11-character path segment after the host
       (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``).

    Args:
        url: A YouTube URL, or a bare video ID.

    Returns:
        The extracted ID, or *url* unchanged when nothing ID-shaped is
        found (the caller may already have passed a bare ID).
    """
    # An explicit v= parameter is the most reliable signal, so try it first.
    # The look-behind keeps parameters such as "rev=" from matching and the
    # look-ahead rejects tokens longer than 11 characters.
    match = re.search(
        r"(?<![0-9A-Za-z_-])v=([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", url
    )
    if match:
        return match.group(1)

    # Drop "scheme://host" before scanning path segments so that a host
    # label made of 11+ ID characters (e.g. "youtubekids") is never
    # mistaken for a video ID.
    path = re.sub(r"^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//[^/?#]*", "", url)
    match = re.search(r"/([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", path)
    if match:
        return match.group(1)

    return url  # No match: might already be just the ID
import json
import os
import argparse
import re
import subprocess
import sys
def extract_video_id(url):
    """Return the 11-character YouTube video ID contained in *url*.

    Lookup order:

    1. a ``v=`` query parameter (``watch?v=<id>``);
    2. an 11-character path segment after the host
       (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``).

    Args:
        url: A YouTube URL, or a bare video ID.

    Returns:
        The extracted ID, or *url* unchanged when nothing ID-shaped is
        found (the caller may already have passed a bare ID).
    """
    # An explicit v= parameter is the most reliable signal, so try it first.
    # The look-behind keeps parameters such as "rev=" from matching and the
    # look-ahead rejects tokens longer than 11 characters.
    match = re.search(
        r"(?<![0-9A-Za-z_-])v=([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", url
    )
    if match:
        return match.group(1)

    # Drop "scheme://host" before scanning path segments so that a host
    # label made of 11+ ID characters (e.g. "youtubekids") is never
    # mistaken for a video ID.
    path = re.sub(r"^(?:[A-Za-z][A-Za-z0-9+.-]*:)?//[^/?#]*", "", url)
    match = re.search(r"/([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", path)
    if match:
        return match.group(1)

    return url  # No match: might already be just the ID
def _clean_cue_text(text_lines):
    """Join a cue's payload lines into a single plain-text string."""
    import html

    text = " ".join(text_lines)
    # Drop inline markup such as <c>...</c> and <00:00:00.000> timing tags.
    text = re.sub(r'<[^>]+>', '', text)
    # Decode character references (&amp;, &gt;, ...) only AFTER removing
    # tags, so an escaped "&lt;b&gt;" survives as the literal text "<b>".
    return html.unescape(text).strip()


def parse_vtt(vtt_path):
    """Parse a WebVTT file into a list of (timestamp, text) tuples.

    Args:
        vtt_path: Path of the .vtt file to read (decoded as UTF-8).

    Returns:
        One ``(timestamp, text)`` tuple per non-empty cue, in file order.
        ``timestamp`` is the cue start time with the milliseconds removed
        ("HH:MM:SS" or "MM:SS"); ``text`` is the cue payload joined with
        spaces, with markup tags removed and character references decoded.
        If the file cannot be read or decoded, an error is printed and an
        empty list is returned.
    """
    # Cue start time: "HH:MM:SS.mmm" (two or more hour digits) or "MM:SS.mmm".
    timestamp_pattern = re.compile(r'((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\s-->')

    try:
        with open(vtt_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError for non-UTF-8 input.
        print(f"Error parsing VTT: {e}")
        return []

    entries = []
    current_timestamp = None
    current_text = []

    # The extra "" acts as a final blank line so a last cue that is not
    # followed by one is flushed through the same code path.
    for raw_line in [*lines, ""]:
        line = raw_line.strip()

        if not line:
            # A blank line ends the current cue.
            if current_timestamp and current_text:
                text = _clean_cue_text(current_text)
                if text:
                    entries.append((current_timestamp, text))
            current_timestamp = None
            current_text = []
            continue

        ts_match = timestamp_pattern.match(line)
        if ts_match:
            # Keep only the part before the milliseconds.
            current_timestamp = ts_match.group(1).split('.')[0]
            continue

        # Lines seen before any timestamp (the WEBVTT / Kind: / Language:
        # header, NOTE blocks, cue identifiers) are ignored. Inside a cue
        # every line is payload, even one that begins with "Language:".
        if current_timestamp:
            current_text.append(line)

    return entries
def get_transcript(video_url, output_format="text"):
    """Download a video's English subtitles with yt-dlp and print them.

    Runs yt-dlp in subtitles-only mode and expects it to leave
    ``transcript_<video_id>.en.vtt`` in the current working directory.

    Args:
        video_url: YouTube URL or bare video ID; it is normalised with
            extract_video_id().
        output_format: "text" (default) parses the VTT file, prints one
            "[timestamp] text" line per cue and writes the same lines to
            ``transcript_<video_id>.txt``. Any other value prints the
            first 10 lines of the raw VTT file instead.

    Returns:
        None. Progress and failures are reported on stdout; exceptions
        are caught and printed rather than propagated.
    """
    from itertools import islice

    video_id = extract_video_id(video_url)
    print(f"Video ID: {video_id}")
    url = f"https://www.youtube.com/watch?v={video_id}"

    # Prefer the yt-dlp executable that sits next to the running
    # interpreter (e.g. the virtualenv's bin directory); otherwise rely
    # on whatever "yt-dlp" resolves to on PATH.
    yt_dlp_path = os.path.join(os.path.dirname(sys.executable), 'yt-dlp')
    if not os.path.exists(yt_dlp_path):
        yt_dlp_path = "yt-dlp"

    # Equivalent shell command:
    #   yt-dlp --write-sub --write-auto-sub --skip-download \
    #          --sub-lang en --output "transcript_<id>" <url>
    cmd = [
        yt_dlp_path,
        '--write-sub',       # uploaded subtitles
        '--write-auto-sub',  # auto-generated subtitles
        '--skip-download',   # subtitles only, no media
        '--sub-lang', 'en',
        '--output', f'transcript_{video_id}',
        url,
    ]
    print(f"Running command: {' '.join(cmd)}")

    try:
        subprocess.run(cmd, check=True)

        # With --output "transcript_<id>" the subtitle file is expected to
        # get the language and format appended: transcript_<id>.en.vtt
        expected_filename = f"transcript_{video_id}.en.vtt"

        if not os.path.exists(expected_filename):
            print("Transcript file not found after download.")
            # Debug listing, done in-process so it also works where no
            # `ls` executable is available.
            print("Files in directory:")
            for name in sorted(os.listdir('.')):
                print(name)
            return

        print(f"\nTranscript downloaded to: {expected_filename}")

        if output_format == "text":
            entries = parse_vtt(expected_filename)
            output_txt_file = f"transcript_{video_id}.txt"
            with open(output_txt_file, "w", encoding='utf-8') as f:
                for timestamp, text in entries:
                    line = f"[{timestamp}] {text}"
                    print(line)
                    f.write(line + "\n")
            print(f"\nFull text transcript saved to: {output_txt_file}")
        else:
            # Raw VTT output: preview only, stop reading after 10 lines.
            print("\nFirst 10 lines of content:")
            try:
                with open(expected_filename, 'r', encoding='utf-8') as f:
                    for line in islice(f, 10):
                        print(line.rstrip())
            except Exception as e:
                print(f"Could not read file: {e}")

    except subprocess.CalledProcessError as e:
        print(f"Error running yt-dlp: {e}")
    except Exception as e:
        # Top-level boundary of the script: report the problem instead of
        # letting a traceback escape.
        print(f"An error occurred: {e}")
if __name__ == "__main__":
    # Command-line entry point: one positional URL/ID plus an optional
    # --format switch, handed straight to get_transcript().
    cli = argparse.ArgumentParser(description='Fetch YouTube transcript.')
    cli.add_argument('url', help='YouTube video URL or ID')
    cli.add_argument(
        '--format',
        choices=['vtt', 'text'],
        default='text',
        help='Output format: vtt (raw) or text (timestamped lines)',
    )
    options = cli.parse_args()
    get_transcript(options.url, output_format=options.format)