Skip to content

Commit d858603

Browse files
authored
Add files via upload
1 parent d2be5ed commit d858603

1 file changed

Lines changed: 159 additions & 0 deletions

File tree

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "markdown",
19+
"source": [
20+
"This code removes timestamps from the podcast episode transcripts and converts the .vtt files to .txt."
21+
],
22+
"metadata": {
23+
"id": "W5mSianHrUlf"
24+
}
25+
},
26+
{
27+
"cell_type": "code",
28+
"source": [
29+
"import re\n",
30+
"import os\n",
31+
"from google.colab import drive\n",
32+
"\n",
33+
"# Mount Google Drive\n",
34+
"drive.mount('/content/drive')\n"
35+
],
36+
"metadata": {
37+
"id": "cXrOyycOnoZE"
38+
},
39+
"execution_count": null,
40+
"outputs": []
41+
},
42+
{
43+
"cell_type": "code",
44+
"source": [
45+
"\n",
46+
"# Set your input/output folder paths\n",
47+
"input_folder = \"/content/drive/MyDrive/IRS Paper/DATA/The Tucker Carlson Show/vtt\" # change as needed\n",
48+
"output_folder = \"/content/drive/MyDrive/IRS Paper/DATA/The Tucker Carlson Show/Cleaned_Text\" # change as needed\n",
49+
"\n"
50+
],
51+
"metadata": {
52+
"id": "eQNi5IQ7nwOY"
53+
},
54+
"execution_count": null,
55+
"outputs": []
56+
},
57+
{
58+
"cell_type": "code",
59+
"source": [
60+
"# Create output folder (reset if needed)\n",
61+
"if os.path.exists(output_folder):\n",
62+
" # Optional: clear out old files\n",
63+
" for f in os.listdir(output_folder):\n",
64+
" os.remove(os.path.join(output_folder, f))\n",
65+
"else:\n",
66+
" os.makedirs(output_folder)\n",
67+
"\n",
68+
"# Flexible timestamp pattern (handles mm:ss.sss and hh:mm:ss.sss)\n",
69+
"timecode_pattern = re.compile(\n",
70+
" r'^\\d{2}:\\d{2}(?::\\d{2})?\\.\\d{3} --> \\d{2}:\\d{2}(?::\\d{2})?\\.\\d{3}$'\n",
71+
")\n",
72+
"\n",
73+
"# Process each VTT file\n",
74+
"for filename in os.listdir(input_folder):\n",
75+
" if filename.endswith('.vtt'):\n",
76+
" input_path = os.path.join(input_folder, filename)\n",
77+
" output_path = os.path.join(output_folder, filename.replace('.vtt', '.txt'))\n",
78+
"\n",
79+
" with open(input_path, 'r', encoding='utf-8') as file:\n",
80+
" lines = file.readlines()\n",
81+
"\n",
82+
" cleaned_lines = []\n",
83+
" for line in lines:\n",
84+
" line = line.strip()\n",
85+
" if line and not line.startswith(('WEBVTT', 'NOTE')) and not timecode_pattern.match(line):\n",
86+
" cleaned_lines.append(line)\n",
87+
"\n",
88+
" with open(output_path, 'w', encoding='utf-8') as file:\n",
89+
" file.write(\"\\n\".join(cleaned_lines))\n",
90+
"\n",
91+
" print(f\"Cleaned: {filename} → {os.path.basename(output_path)}\")\n",
92+
"\n",
93+
"print(\"All files processed and saved to:\", output_folder)\n"
94+
],
95+
"metadata": {
96+
"id": "oiu7zzFKnyie"
97+
},
98+
"execution_count": null,
99+
"outputs": []
100+
},
101+
{
102+
"cell_type": "code",
103+
"source": [
104+
"# Tracking stats\n",
105+
"total_files = 0\n",
106+
"total_lines_kept = 0\n",
107+
"total_timestamps_removed = 0\n",
108+
"\n",
109+
"# Step 5 (Modified): Process each VTT file\n",
110+
"for filename in os.listdir(input_folder):\n",
111+
" if filename.endswith('.vtt'):\n",
112+
" total_files += 1\n",
113+
" input_path = os.path.join(input_folder, filename)\n",
114+
" output_path = os.path.join(output_folder, filename.replace('.vtt', '.txt'))\n",
115+
"\n",
116+
" with open(input_path, 'r', encoding='utf-8') as file:\n",
117+
" lines = file.readlines()\n",
118+
"\n",
119+
" cleaned_lines = []\n",
120+
" for line in lines:\n",
121+
" line = line.strip()\n",
122+
" if not line:\n",
123+
" continue\n",
124+
" elif line.startswith(('WEBVTT', 'NOTE')) or timecode_pattern.match(line):\n",
125+
" if timecode_pattern.match(line):\n",
126+
" total_timestamps_removed += 1\n",
127+
" continue\n",
128+
" else:\n",
129+
" cleaned_lines.append(line)\n",
130+
" total_lines_kept += 1\n",
131+
"\n",
132+
" with open(output_path, 'w', encoding='utf-8') as file:\n",
133+
" file.write(\"\\n\".join(cleaned_lines))\n",
134+
"\n",
135+
" print(f\" Cleaned: {filename} → {os.path.basename(output_path)}\")\n",
136+
"\n",
137+
"# Summary\n",
138+
"print(\"\\n Cleaning Summary:\")\n",
139+
"print(f\" Total files processed: {total_files}\")\n",
140+
"print(f\" Timestamp lines removed: {total_timestamps_removed}\")\n",
141+
"print(f\" Total lines kept: {total_lines_kept}\")\n"
142+
],
143+
"metadata": {
144+
"id": "lphHNDMnoWQw"
145+
},
146+
"execution_count": null,
147+
"outputs": []
148+
},
149+
{
150+
"cell_type": "code",
151+
"source": [],
152+
"metadata": {
153+
"id": "hdNrD0mPrBMF"
154+
},
155+
"execution_count": null,
156+
"outputs": []
157+
}
158+
]
159+
}

0 commit comments

Comments
 (0)