generative-ai-for-beginners/08-building-search-applications/scripts/transcript_download.py at d3ba7d315fb6041b1dc89a5af1ef16871e3ee1ae · vepretski/generative-ai-for-beginners · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
""" This script downloads the transcripts for all the videos in a YouTube playlist. """

import os
import json
import logging
import time
import threading
import argparse
import queue
import googleapiclient.discovery
import googleapiclient.errors
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import WebVTTFormatter


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Google developer API key. Read with .get() so that a missing variable is
# reported through the same "log an error and exit" path as the other
# configuration problems below, instead of a bare KeyError traceback raised
# before the command line (including --help) has even been parsed.
GOOGLE_DEVELOPER_API_KEY = os.environ.get("GOOGLE_DEVELOPER_API_KEY")

# Service name and version handed to googleapiclient.discovery.build()
GOOGLE_API_SERVICE_NAME = "youtube"
GOOGLE_API_VERSION = "v3"

MAX_RESULTS = 50  # maxResults value sent with every playlistItems().list() call
PROCESSING_THREADS = 40  # number of worker threads started to drain the queue

formatter = WebVTTFormatter()
q = queue.Queue()  # playlist items waiting to be processed by the workers

parser = argparse.ArgumentParser()
parser.add_argument("-f", "--folder", help="folder the transcripts are written to")
parser.add_argument("-p", "--playlist", help="ID of the YouTube playlist to download")
parser.add_argument("--verbose", action="store_true", help="enable debug logging")
args = parser.parse_args()
if args.verbose:
    logger.setLevel(logging.DEBUG)

# Both options are mandatory; an empty string counts as "not provided".
TRANSCRIPT_FOLDER = args.folder or None
PLAYLIST_ID = args.playlist or None

# SystemExit is raised directly because the exit() helper is injected by the
# site module and is not guaranteed to exist (e.g. when running with -S).
if not GOOGLE_DEVELOPER_API_KEY:
    logger.error("GOOGLE_DEVELOPER_API_KEY environment variable not set")
    raise SystemExit(1)

if not TRANSCRIPT_FOLDER:
    logger.error("Transcript folder not provided")
    raise SystemExit(1)

if not PLAYLIST_ID:
    logger.error("Playlist ID not provided")
    raise SystemExit(1)

# Create the output folder up front. Without it every open() in the worker
# functions fails, and those failures are only logged at DEBUG level, so the
# script would appear to succeed while writing nothing.
try:
    os.makedirs(TRANSCRIPT_FOLDER, exist_ok=True)
except OSError as error:
    logger.error("Cannot create transcript folder %s: %s", TRANSCRIPT_FOLDER, error)
    raise SystemExit(1) from error


class Counter:
    """Thread-safe counter.

    ``value`` holds the current count and ``lock`` guards every update made
    through :meth:`increment`.
    """

    def __init__(self):
        """Initialize the counter at zero with its own lock."""
        self.value = 0
        self.lock = threading.Lock()

    def increment(self):
        """Increment the counter and return the new value.

        The value is returned while the lock is still held, so each caller
        gets the number produced by its own increment. Reading ``value``
        after the call instead may already include increments made by other
        threads.
        """
        with self.lock:
            self.value += 1
            return self.value


# Shared by all worker threads to number the videos they process.
counter = Counter()


def gen_metadata(playlist_item):
    """Write the metadata of one playlist video to ``<videoId>.json``.

    ``playlist_item`` is a playlist entry whose ``snippet`` provides the
    ``title``, ``description`` and ``resourceId.videoId`` fields. The file is
    created inside ``TRANSCRIPT_FOLDER``; ``speaker`` is always written as an
    empty string.
    """
    snippet = playlist_item["snippet"]
    video_id = snippet["resourceId"]["videoId"]

    metadata = {
        "speaker": "",
        "title": snippet["title"],
        "videoId": video_id,
        "description": snippet["description"],
    }

    # one JSON document per video, named after the video id
    target = os.path.join(TRANSCRIPT_FOLDER, video_id + ".json")
    with open(target, "w", encoding="utf-8") as out:
        json.dump(metadata, out)


def get_transcript(playlist_item, counter_id):
    """Download the transcript of one playlist video.

    The transcript is stored in ``TRANSCRIPT_FOLDER`` as
    ``<videoId>.json.vtt``; despite the suffix, the content is the transcript
    serialized as JSON. ``counter_id`` is only used in log messages.

    Returns ``True`` when a new transcript file was written, and ``False``
    when the file already existed or when downloading/saving failed.
    """
    video_id = playlist_item["snippet"]["resourceId"]["videoId"]
    target = os.path.join(TRANSCRIPT_FOLDER, video_id + ".json.vtt")

    # an existing file means the video was handled by an earlier run
    already_downloaded = os.path.exists(target)
    if already_downloaded:
        logger.debug("Skipping video %d, %s", counter_id, video_id)
        return False

    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id)

        # flatten each caption onto a single line
        for entry in entries:
            entry["text"] = entry["text"].replace("\n", " ")

        logger.debug("Transcription download completed: %d, %s", counter_id, video_id)

        with open(target, "w", encoding="utf-8") as out:
            json.dump(entries, out, indent=4, ensure_ascii=False)
    except Exception as error:
        # best effort: any failure is logged and the video is skipped
        logger.debug(error)
        logger.debug("Transcription not found for video: %s", video_id)
        return False
    else:
        return True


def process_queue():
    """Worker loop: process playlist items from ``q`` until it is empty.

    For every item the shared ``counter`` is incremented, the transcript is
    downloaded, and, only when a new transcript was written, the matching
    metadata file is generated. The function returns once the queue has no
    more items.
    """
    while True:
        try:
            # get_nowait() instead of an empty()/get() pair: with several
            # workers another thread can take the last item between the two
            # calls, leaving this thread blocked in get() forever and the
            # main thread stuck in join().
            video = q.get_nowait()
        except queue.Empty:
            return

        try:
            counter.increment()

            # counter.value is only used to number log messages
            if get_transcript(video, counter.value):
                gen_metadata(video)
        finally:
            # always balance the get(), even if processing raised
            q.task_done()


logger.debug("Transcription folder: %s", TRANSCRIPT_FOLDER)

youtube = googleapiclient.discovery.build(
    GOOGLE_API_SERVICE_NAME, GOOGLE_API_VERSION, developerKey=GOOGLE_DEVELOPER_API_KEY
)

# Request the first page of the playlist
playlist_items = youtube.playlistItems()
request = playlist_items.list(
    part="snippet", playlistId=PLAYLIST_ID, maxResults=MAX_RESULTS
)

# Walk through every page of results. list_next() builds the request for the
# following page from the previous request/response pair and returns None
# once there is no further page, which ends the loop.
while request is not None:
    response = request.execute()

    # Queue every playlist item of this page for the worker threads
    for item in response.get("items", []):
        q.put(item)

    request = playlist_items.list_next(request, response)

# Logged once, after all pages were read, so the number really is the total
logger.info("Total transcriptions to be download: %s", q.qsize())

start_time = time.time()

# start the worker pool; every worker pulls items from the shared queue
threads = []
for _ in range(PROCESSING_THREADS):
    worker = threading.Thread(target=process_queue, args=())
    worker.start()
    threads.append(worker)

# block until every worker has returned
for worker in threads:
    worker.join()


finish_time = time.time()
elapsed = finish_time - start_time
logger.debug("Total time taken: %s", elapsed)