-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranscribe.py
More file actions
116 lines (98 loc) · 4.28 KB
/
transcribe.py
File metadata and controls
116 lines (98 loc) · 4.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import os
import sys
import whisperx
# Prepend the bundled ffmpeg directory (located next to this script) to PATH
# so executables in it are found ahead of anything else on PATH.
# os.pathsep is used instead of a hard-coded ":" because PATH entries are
# separated by ";" on Windows.
os.environ["PATH"] = (
    os.path.join(os.path.dirname(__file__), "whisperx-bundle", "ffmpeg")
    + os.pathsep
    + os.environ.get("PATH", "")
)
def format_time(seconds):
    """Convert an offset in seconds to an SRT timestamp (``HH:MM:SS,mmm``).

    The value is rounded to whole milliseconds *before* it is split into
    hour/minute/second fields, so a value such as 59.9996 carries into the
    minutes field ("00:01:00,000") instead of yielding the invalid
    "00:00:60,000".

    Args:
        seconds: Offset in seconds (int or float). Negative values are
            clamped to zero, since SRT has no negative timestamps.

    Returns:
        The formatted timestamp, e.g. ``"01:01:01,500"``.
    """
    # int(...) guarantees a plain int so the ``d`` format specs below work
    # even if a float subclass is passed in.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_secs, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
# Settings passed to whisperx.load_model / load_align_model below.
device = "cpu"
model_name = "medium"
compute_type = "int8"


def _iter_words(segments):
    """Yield every word dict contained in a list of aligned segments."""
    for segment in segments:
        yield from segment.get("words", [])


def _has_timing(word):
    """Return True when a word dict carries both 'start' and 'end'."""
    return "start" in word and "end" in word


def _format_word_line(word, with_speaker=False):
    """Format one word as 'START - END: WORD', optionally speaker-prefixed.

    Words the aligner could not time have no 'start'/'end' keys; they are
    reported with 'n/a' instead of raising KeyError.
    """
    if _has_timing(word):
        span = f"{word['start']:.2f} - {word['end']:.2f}"
    else:
        span = "n/a"
    prefix = f"[{word.get('speaker', 'UNKNOWN')}] " if with_speaker else ""
    return f"{prefix}{span}: {word['word']}"


def _write_srt(path, cues):
    """Write an iterable of (start, end, text) cues to *path* as SRT."""
    with open(path, 'w', encoding='utf-8') as f:
        for number, (start, end, text) in enumerate(cues, start=1):
            f.write(f"{number}\n")
            f.write(f"{format_time(start)} --> {format_time(end)}\n")
            f.write(f"{text}\n\n")


# --- Command-line handling -------------------------------------------------
# The first argument must be the input file, not a flag.
if len(sys.argv) < 2 or sys.argv[1].startswith('--'):
    print("❌ Usage: python transcribe.py <input_file> [output_dir] [--diarize] [--words-only]")
    sys.exit(1)

audio_file = sys.argv[1]

# The second argument is the output directory unless it is a flag.
output_dir = None
if len(sys.argv) > 2 and not sys.argv[2].startswith('--'):
    output_dir = sys.argv[2]

enable_diarization = "--diarize" in sys.argv
words_only = "--words-only" in sys.argv

# --- Output filenames ------------------------------------------------------
base_name = os.path.splitext(os.path.basename(audio_file))[0]
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
    txt_file = os.path.join(output_dir, f"{base_name}_transcript.txt")
    srt_file = os.path.join(output_dir, f"{base_name}_subtitles.srt")
else:
    txt_file = f"{base_name}_transcript.txt"
    srt_file = f"{base_name}_subtitles.srt"

# --- Transcription and word alignment --------------------------------------
print(f"🔊 Transcribing: {audio_file}")
model = whisperx.load_model(model_name, device=device, compute_type=compute_type)
result = model.transcribe(audio_file)

print("🧠 Aligning words...")
align_model, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
aligned = whisperx.align(result["segments"], align_model, metadata, audio_file, device=device)

# --- Mode 1: word-level timestamps only (TXT, no SRT) ----------------------
if words_only:
    print("📌 Word-level timestamps only:")
    with open(txt_file, 'w', encoding='utf-8') as f:
        for word in aligned["word_segments"]:
            line = _format_word_line(word)
            print(line)
            f.write(line + '\n')
    print(f"✅ Saved to: {txt_file}")
    sys.exit(0)

if enable_diarization:
    # --- Mode 2: per-word speaker labels (TXT + word-level SRT) ------------
    print("🧍 Performing speaker diarization...")
    # NOTE(review): no Hugging Face token is passed here; this presumably
    # relies on ambient credentials for the diarization model - confirm.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=None, device=device)
    diarize_segments = diarize_model(audio_file)
    # assign_word_speakers takes the whole alignment result (a dict with a
    # "segments" key) and returns it with speaker labels added, so the words
    # are collected from the returned segments rather than iterating the
    # return value directly.
    result_with_speakers = whisperx.assign_word_speakers(diarize_segments, aligned)
    speaker_words = list(_iter_words(result_with_speakers["segments"]))

    print("🗣️ Speaker-separated transcript:")
    with open(txt_file, 'w', encoding='utf-8') as f:
        for word in speaker_words:
            line = _format_word_line(word, with_speaker=True)
            print(line)
            f.write(line + '\n')

    # One SRT cue per word; words without timing cannot form a cue and are
    # skipped.
    _write_srt(
        srt_file,
        (
            (word['start'], word['end'], f"[{word.get('speaker', 'UNKNOWN')}] {word['word']}")
            for word in speaker_words
            if _has_timing(word)
        ),
    )
    print(f"✅ Saved to: {txt_file} and {srt_file}")
else:
    # --- Mode 3: segment transcript plus word timestamps (TXT + SRT) -------
    print("📝 Full transcript (with word-level alignment):")
    with open(txt_file, 'w', encoding='utf-8') as f:
        f.write("=== SEGMENT-LEVEL TRANSCRIPT ===\n")
        for segment in result["segments"]:
            line = f"[{segment['start']:.2f} - {segment['end']:.2f}] {segment['text']}"
            print(line)
            f.write(line + '\n')
        f.write("\n=== WORD-LEVEL TIMESTAMPS ===\n")
        for word in aligned["word_segments"]:
            line = _format_word_line(word)
            print(line)
            f.write(line + '\n')

    # The SRT is built from the transcription segments, one cue per segment.
    _write_srt(
        srt_file,
        (
            (segment['start'], segment['end'], segment['text'].strip())
            for segment in result["segments"]
        ),
    )
    print(f"✅ Saved to: {txt_file} and {srt_file}")