1+ {
2+ "nbformat" : 4 ,
3+ "nbformat_minor" : 0 ,
4+ "metadata" : {
5+ "colab" : {
6+ "provenance" : []
7+ },
8+ "kernelspec" : {
9+ "name" : " python3" ,
10+ "display_name" : " Python 3"
11+ },
12+ "language_info" : {
13+ "name" : " python"
14+ }
15+ },
16+ "cells" : [
17+ {
18+ "cell_type" : " markdown" ,
19+ "source" : [
20+ " This notebook removes timestamps from the podcast episode transcripts and converts the .vtt files to .txt."
21+ ],
22+ "metadata" : {
23+ "id" : " W5mSianHrUlf"
24+ }
25+ },
26+ {
27+ "cell_type" : " code" ,
28+ "source" : [
29+ " import re\n " ,
30+ " import os\n " ,
31+ " from google.colab import drive\n " ,
32+ " \n " ,
33+ " # Mount Google Drive\n " ,
34+ " drive.mount('/content/drive')\n "
35+ ],
36+ "metadata" : {
37+ "id" : " cXrOyycOnoZE"
38+ },
39+ "execution_count" : null ,
40+ "outputs" : []
41+ },
42+ {
43+ "cell_type" : " code" ,
44+ "source" : [
# Folder locations on the mounted Drive -- change as needed.
# Both folders sit under the same show directory, so it is spelled out once.
SHOW_DIR = "/content/drive/MyDrive/IRS Paper/DATA/The Tucker Carlson Show"

input_folder = SHOW_DIR + "/vtt"            # raw .vtt caption files
output_folder = SHOW_DIR + "/Cleaned_Text"  # cleaned .txt transcripts
50+ ],
51+ "metadata" : {
52+ "id" : " eQNi5IQ7nwOY"
53+ },
54+ "execution_count" : null ,
55+ "outputs" : []
56+ },
57+ {
58+ "cell_type" : " code" ,
59+ "source" : [
# Cue-timing line: "start --> end", where each timestamp is mm:ss.ttt or
# hh:mm:ss.ttt (hours may have more than two digits). WebVTT allows cue
# settings after the end timestamp (e.g. "align:start position:0%"), so an
# optional trailing part is accepted; without it those timing lines would not
# match and would end up in the transcript.
timecode_pattern = re.compile(
    r'^(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}[ \t]+-->[ \t]+'
    r'(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}(?:[ \t].*)?$'
)

# First token of the WebVTT blocks that are not caption text.
VTT_BLOCK_KEYWORDS = ('WEBVTT', 'NOTE', 'STYLE', 'REGION')


def clean_vtt_lines(lines, pattern):
    """Return the caption text lines of a WebVTT document.

    Args:
        lines: iterable of raw lines (an open text file works).
        pattern: compiled regex that matches a cue-timing line.

    Returns:
        list[str]: stripped, non-empty lines with timing lines removed and
        the header / NOTE / STYLE / REGION blocks skipped in full (a block
        runs until the next blank line or timing line).

    Text that follows a timing line is always kept, so a caption that happens
    to begin with "NOTE" or "WEBVTT" is not dropped.
    """
    kept = []
    in_cue = False            # between a timing line and the next blank line
    in_skipped_block = False  # inside a header / NOTE / STYLE / REGION block
    for raw_line in lines:
        text = raw_line.strip()
        if not text:
            # A blank line terminates whatever block we were in.
            in_cue = in_skipped_block = False
            continue
        if pattern.match(text):
            in_cue, in_skipped_block = True, False
            continue
        if in_cue:
            kept.append(text)
        elif in_skipped_block:
            continue
        elif text.split(None, 1)[0] in VTT_BLOCK_KEYWORDS:
            in_skipped_block = True
        else:
            # Anything else outside a cue (e.g. a cue identifier) is kept.
            kept.append(text)
    return kept


# Prepare the output folder. Only stale .txt files are removed: deleting every
# entry would fail on a sub-directory and would wipe the .vtt inputs if the
# input and output folders were ever set to the same path.
os.makedirs(output_folder, exist_ok=True)
for stale_name in os.listdir(output_folder):
    stale_path = os.path.join(output_folder, stale_name)
    if stale_name.endswith('.txt') and os.path.isfile(stale_path):
        os.remove(stale_path)

# Process each VTT file (sorted so the log order is reproducible).
for filename in sorted(os.listdir(input_folder)):
    # splitext swaps only the extension; str.replace would also rewrite a
    # ".vtt" occurring in the middle of the name.
    stem, extension = os.path.splitext(filename)
    if extension.lower() != '.vtt':
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, stem + '.txt')

    # utf-8-sig drops an optional BOM so the "WEBVTT" header is recognised.
    with open(input_path, 'r', encoding='utf-8-sig') as vtt_file:
        cleaned_lines = clean_vtt_lines(vtt_file, timecode_pattern)

    with open(output_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write("\n".join(cleaned_lines))

    print(f"Cleaned: {filename} → {os.path.basename(output_path)}")

print("All files processed and saved to:", output_folder)
94+ ],
95+ "metadata" : {
96+ "id" : " oiu7zzFKnyie"
97+ },
98+ "execution_count" : null ,
99+ "outputs" : []
100+ },
101+ {
102+ "cell_type" : " code" ,
103+ "source" : [
def clean_vtt_file_with_stats(input_path, output_path, pattern):
    """Clean one .vtt file into a .txt file and report what was removed.

    Args:
        input_path: path of the WebVTT file to read.
        output_path: path of the text file to write (overwritten).
        pattern: compiled regex that matches a cue-timing line.

    Returns:
        tuple[int, int]: (caption lines kept, timing lines removed).

    Blank lines, timing lines and the header / NOTE / STYLE / REGION blocks
    are dropped; a block runs until the next blank line or timing line. Text
    that follows a timing line is always kept.
    """
    block_keywords = ('WEBVTT', 'NOTE', 'STYLE', 'REGION')
    kept = []
    timestamps_removed = 0
    in_cue = False            # between a timing line and the next blank line
    in_skipped_block = False  # inside a header / NOTE / STYLE / REGION block

    # utf-8-sig drops an optional BOM so the "WEBVTT" header is recognised.
    with open(input_path, 'r', encoding='utf-8-sig') as vtt_file:
        for raw_line in vtt_file:
            text = raw_line.strip()
            if not text:
                in_cue = in_skipped_block = False
                continue
            if pattern.match(text):  # evaluated once per line
                timestamps_removed += 1
                in_cue, in_skipped_block = True, False
                continue
            if in_cue:
                kept.append(text)
            elif in_skipped_block:
                continue
            elif text.split(None, 1)[0] in block_keywords:
                in_skipped_block = True
            else:
                kept.append(text)

    with open(output_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write("\n".join(kept))

    return len(kept), timestamps_removed


# Tracking stats
total_files = 0
total_lines_kept = 0
total_timestamps_removed = 0

# The outputs are (re)written here, so make sure the folder exists.
os.makedirs(output_folder, exist_ok=True)

# Re-process every VTT file, overwriting its .txt output and collecting counts.
for filename in sorted(os.listdir(input_folder)):
    # splitext swaps only the extension; str.replace would also rewrite a
    # ".vtt" occurring in the middle of the name.
    stem, extension = os.path.splitext(filename)
    if extension.lower() != '.vtt':
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, stem + '.txt')

    lines_kept, timestamps_removed = clean_vtt_file_with_stats(
        input_path, output_path, timecode_pattern
    )
    total_files += 1
    total_lines_kept += lines_kept
    total_timestamps_removed += timestamps_removed

    print(f"Cleaned: {filename} → {os.path.basename(output_path)}")

# Summary
print("\nCleaning Summary:")
print(f"Total files processed: {total_files}")
print(f"Timestamp lines removed: {total_timestamps_removed}")
print(f"Total lines kept: {total_lines_kept}")
142+ ],
143+ "metadata" : {
144+ "id" : " lphHNDMnoWQw"
145+ },
146+ "execution_count" : null ,
147+ "outputs" : []
148+ },
149+ {
150+ "cell_type" : " code" ,
151+ "source" : [],
152+ "metadata" : {
153+ "id" : " hdNrD0mPrBMF"
154+ },
155+ "execution_count" : null ,
156+ "outputs" : []
157+ }
158+ ]
159+ }
0 commit comments