Add files via upload

karmvirpadda · web-flow · commit d8586034a757 · 2025-04-26T04:07:54.000-04:00
diff --git a/podcasting-patriarchy/ManoWhisper_VTT_txt_and_timestamp_04_06_25.ipynb b/podcasting-patriarchy/ManoWhisper_VTT_txt_and_timestamp_04_06_25.ipynb
@@ -0,0 +1,159 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        "This code is used to remove time-stampes from the podcast episodes and also convert the .vtt to .txt."
+      ],
+      "metadata": {
+        "id": "W5mSianHrUlf"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import re\n",
+        "import os\n",
+        "from google.colab import drive\n",
+        "\n",
+        "# Mount Google Drive\n",
+        "drive.mount('/content/drive')\n"
+      ],
+      "metadata": {
+        "id": "cXrOyycOnoZE"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "\n",
+        "# Set your input/output folder paths\n",
+        "input_folder = \"/content/drive/MyDrive/IRS Paper/DATA/The Tucker Carlson Show/vtt\"       # change as needed\n",
+        "output_folder = \"/content/drive/MyDrive/IRS Paper/DATA/The Tucker Carlson Show/Cleaned_Text\"  # change as needed\n",
+        "\n"
+      ],
+      "metadata": {
+        "id": "eQNi5IQ7nwOY"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Create output folder (reset if needed)\n",
+        "if os.path.exists(output_folder):\n",
+        "    # Optional: clear out old files\n",
+        "    for f in os.listdir(output_folder):\n",
+        "        os.remove(os.path.join(output_folder, f))\n",
+        "else:\n",
+        "    os.makedirs(output_folder)\n",
+        "\n",
+        "# Flexible timestamp pattern (handles mm:ss.sss and hh:mm:ss.sss)\n",
+        "timecode_pattern = re.compile(\n",
+        "    r'^\\d{2}:\\d{2}(?::\\d{2})?\\.\\d{3} --> \\d{2}:\\d{2}(?::\\d{2})?\\.\\d{3}$'\n",
+        ")\n",
+        "\n",
+        "# Process each VTT file\n",
+        "for filename in os.listdir(input_folder):\n",
+        "    if filename.endswith('.vtt'):\n",
+        "        input_path = os.path.join(input_folder, filename)\n",
+        "        output_path = os.path.join(output_folder, filename.replace('.vtt', '.txt'))\n",
+        "\n",
+        "        with open(input_path, 'r', encoding='utf-8') as file:\n",
+        "            lines = file.readlines()\n",
+        "\n",
+        "        cleaned_lines = []\n",
+        "        for line in lines:\n",
+        "            line = line.strip()\n",
+        "            if line and not line.startswith(('WEBVTT', 'NOTE')) and not timecode_pattern.match(line):\n",
+        "                cleaned_lines.append(line)\n",
+        "\n",
+        "        with open(output_path, 'w', encoding='utf-8') as file:\n",
+        "            file.write(\"\\n\".join(cleaned_lines))\n",
+        "\n",
+        "        print(f\"Cleaned: {filename} → {os.path.basename(output_path)}\")\n",
+        "\n",
+        "print(\"All files processed and saved to:\", output_folder)\n"
+      ],
+      "metadata": {
+        "id": "oiu7zzFKnyie"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Tracking stats\n",
+        "total_files = 0\n",
+        "total_lines_kept = 0\n",
+        "total_timestamps_removed = 0\n",
+        "\n",
+        "# Step 5 (Modified): Process each VTT file\n",
+        "for filename in os.listdir(input_folder):\n",
+        "    if filename.endswith('.vtt'):\n",
+        "        total_files += 1\n",
+        "        input_path = os.path.join(input_folder, filename)\n",
+        "        output_path = os.path.join(output_folder, filename.replace('.vtt', '.txt'))\n",
+        "\n",
+        "        with open(input_path, 'r', encoding='utf-8') as file:\n",
+        "            lines = file.readlines()\n",
+        "\n",
+        "        cleaned_lines = []\n",
+        "        for line in lines:\n",
+        "            line = line.strip()\n",
+        "            if not line:\n",
+        "                continue\n",
+        "            elif line.startswith(('WEBVTT', 'NOTE')) or timecode_pattern.match(line):\n",
+        "                if timecode_pattern.match(line):\n",
+        "                    total_timestamps_removed += 1\n",
+        "                continue\n",
+        "            else:\n",
+        "                cleaned_lines.append(line)\n",
+        "                total_lines_kept += 1\n",
+        "\n",
+        "        with open(output_path, 'w', encoding='utf-8') as file:\n",
+        "            file.write(\"\\n\".join(cleaned_lines))\n",
+        "\n",
+        "        print(f\" Cleaned: {filename} → {os.path.basename(output_path)}\")\n",
+        "\n",
+        "# Summary\n",
+        "print(\"\\n Cleaning Summary:\")\n",
+        "print(f\" Total files processed: {total_files}\")\n",
+        "print(f\" Timestamp lines removed: {total_timestamps_removed}\")\n",
+        "print(f\" Total lines kept: {total_lines_kept}\")\n"
+      ],
+      "metadata": {
+        "id": "lphHNDMnoWQw"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "hdNrD0mPrBMF"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}