yttr-search/yttrsch.py at main · 4ngel2769/yttr-search · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
################################################
# YTTS - YouTube Transcript Search Tool
# Copyright (c) 2025 angeldev0
# Code written by angeldev0
# License: MIT
################################################

import argparse
import os
import re
import sys
# import time
import isodate
from urllib.parse import urlparse
from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
from googleapiclient.discovery import build

BLUE      = '\033[94m'
YELLOW    = '\033[93m'
BG_LIME   = '\033[102m'
BG_CYAN   = '\033[46m'
BG_RED    = '\033[41m'
BG_GREEN  = '\033[42m'
BG_PURPLE = '\033[45m'
RED       = '\033[31m'
MAGENTA   = '\033[35m'
CYAN      = '\033[36m'
GREEN     = '\033[32m'
BLACK     = '\033[30m'
WHITE     = '\033[37m'
RESET     = '\033[0m'

KEY_COLORS = [BG_CYAN, BG_LIME, BG_RED, BG_GREEN]

load_dotenv()

def clear_line():
    """Erase the current line."""
    sys.stdout.write('\r\033[K')
    sys.stdout.flush()

def update_progress(current, total, width=40, prefix="Processing"):
    """Prints an in-place progress bar that stays at the bottom."""
    pct = current / total
    filled = int(pct * width)
    bar = f"[{'=' * filled}{' ' * (width - filled)}]"

    sys.stdout.write(f"\r{BG_PURPLE}{BLACK}{prefix} {bar} "
                     f"{current}/{total} ({pct:.0%}){RESET}")
    sys.stdout.flush()
    if current == total:
        sys.stdout.write(f"  {BG_GREEN}{BLACK} Done!{RESET}\n")
        sys.stdout.flush()

def extract_video_id(url):
    m = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11})(?:\&|$)', url)
    return m.group(1) if m else None

def parse_channel_input(youtube, raw):
    """
    Accepts:
      • raw channel ID (UC…)
      • bare handle (@name)
      • full channel URL or @handle URL
    and returns the proper UC… ID.
    """
    # bare handle
    if raw.startswith("@"):
        return resolve_handle(youtube, raw[1:])
    if raw.startswith("http"):
        p = urlparse(raw)
        parts = p.path.strip("/").split("/")
        # /@handle
        if parts[0].startswith("@"):
            return resolve_handle(youtube, parts[0][1:])
        # /channel/UC
        if parts[0] == "channel" and len(parts) > 1:
            return parts[1]
        # fallback path
        return resolve_handle(youtube, parts[-1])
    # assume it's an ID
    return raw

def resolve_handle(youtube, name):
    """
    Try channels().list(forUsername=…) then search().list(type=channel, q=…)
    """
    # old username
    res = youtube.channels().list(part="id", forUsername=name).execute()
    items = res.get("items", [])
    if items:
        return items[0]["id"]
    # fallback search
    res = youtube.search().list(
        part="id", type="channel", q=name, maxResults=1
    ).execute()
    items = res.get("items", [])
    if items:
        return items[0]["id"]["channelId"]
    raise ValueError(f"Cannot resolve channel identifier: {name}")

def parse_max_videos(val):
    """Parse human-friendly video count (e.g. 1.3k, 2m) into integer."""
    v = val.lower().replace(',', '').strip()
    if v.endswith('k'):
        return int(float(v[:-1]) * 1_000)
    if v.endswith('m'):
        return int(float(v[:-1]) * 1_000_000)
    return int(v)

def get_video_ids_from_channel(youtube, channel_id, sort_order='newest', max_videos=None):
    """
    Fetch video IDs from a channel with optional sort and limit.
    sort_order: 'newest', 'oldest', or 'popular'
    max_videos: int or None
    """
    ids = []
    # API order param
    order_param = 'viewCount' if sort_order == 'popular' else 'date'
    req = youtube.search().list(
        part="id", channelId=channel_id,
        maxResults=50, type="video",
        order=order_param
    )
    while True:
        res = req.execute()
        batch = [item['id']['videoId'] for item in res.get('items', [])]
        ids.extend(batch)
        if max_videos and len(ids) >= max_videos:
            ids = ids[:max_videos]
            break
        token = res.get('nextPageToken')
        if not token:
            break
        req = youtube.search().list(
            part="id", channelId=channel_id,
            maxResults=50, pageToken=token,
            type="video", order=order_param
        )
    if sort_order == 'oldest':
        ids.reverse()
    return ids

def fetch_transcription_segments(video_id):
    try:
        return YouTubeTranscriptApi.get_transcript(video_id), None
    except (TranscriptsDisabled, NoTranscriptFound) as e:
        return None, f"[{video_id}] no transcript: {e}"
    except Exception as e:
        return None, f"[{video_id}] error: {e}"

def format_timestamp(sec):
    m, s = divmod(int(sec), 60)
    return f"{m:02d}:{s:02d}"

def highlight(text, keywords):
    """
    Highlight each keyword with a lime background and its own color.
    """
    for idx, kw in enumerate(keywords):
        color = KEY_COLORS[idx % len(KEY_COLORS)]
        pattern = re.compile(f"(?i)({re.escape(kw)})")
        text = pattern.sub(f"{color}{BLACK}\\1{RESET}", text)
    return text

def get_video_title(youtube, video_id):
    try:
        resp = youtube.videos().list(part="snippet", id=video_id).execute()
        items = resp.get("items", [])
        if items:
            return items[0]["snippet"]["title"]
    except:
        pass
    return "Unknown Title"

class VersionAction(argparse.Action):
    def __init__(self, option_strings, dest=argparse.SUPPRESS, default=argparse.SUPPRESS, version="1.0.0", help="Show version and exit"):
        super().__init__(option_strings=option_strings, dest=dest, default=default, nargs=0, help=help)
        self.version = version

    def __call__(self, parser, namespace, values, option_string=None):
        msg = f"""
YTTS - YouTube Transcript Search Tool
Version: {self.version}
Author: angeldev0
License: MIT
"""
        print(msg)
        parser.exit()

def parse_length_expr(expr: str):
    """Parse '+5m' or '-2h' (or '30') into (op, seconds)."""
    m = re.match(r'([+-])(\d+(\.\d+)?)([smh])?$', expr.strip())
    if not m:
        raise argparse.ArgumentTypeError(f"Bad length filter: {expr!r}")
    sign, num, _, unit = m.groups()
    op = '>' if sign == '+' else '<'
    num = float(num)
    seconds = num * {'s':1, 'm':60, 'h':3600}[unit or 's']
    return op, seconds

def fetch_video_durations(youtube, vids:list[str]) -> dict[str,int]:
    """Batch-fetch each video’s duration in seconds via contentDetails."""
    durations = {}
    for i in range(0, len(vids), 50):
        batch = vids[i:i+50]
        resp = youtube.videos().list(
            part="contentDetails",
            id=",".join(batch)
        ).execute()
        for item in resp.get("items", []):
            vid = item["id"]
            iso = item["contentDetails"]["duration"]
            durations[vid] = int(isodate.parse_duration(iso).total_seconds())
    return durations

def main():
    p = argparse.ArgumentParser(description='Search YouTube transcripts by keyword(s).')
    p.add_argument('-V','--version', action=VersionAction, help='Show version and exit')
    p.add_argument('-k','--keyword', required=True, help='Comma-separated keywords or a phrase')
    p.add_argument('-c','--channel', help='Channel ID, URL or handle')
    p.add_argument('-s','--sort',
                   choices=['newest','oldest','popular'],
                   default='newest',
                   help='Sort channel videos by newest, oldest, or popular (view count)')
    p.add_argument('-m','--maximum',
                   type=parse_max_videos,
                   help='Maximum number of channel videos to process (e.g. 1.3k, 2m)')
    p.add_argument('-v','--video', help='Single YouTube URL')
    p.add_argument('-f','--file', help='Path to file of YouTube URLs')
    p.add_argument('-x','--length', nargs='+', metavar='\"EXPR\"',
                   help="Filter by video duration with + or - prefixes, e.g. -x \"+5m -2h\" (seconds)")
    args = p.parse_args()

    # parse keywords
    keywords = [k.strip() for k in args.keyword.split(',') if k.strip()]

    api_key = os.getenv('YOUTUBE_API_KEY')
    if not api_key:
        print("Error: set YOUTUBE_API_KEY in .env")
        return

    yt = build('youtube','v3', developerKey=api_key)
    vids = []

    if args.channel:
        try:
            cid = parse_channel_input(yt, args.channel)
            vids += get_video_ids_from_channel(
                yt, cid,
                sort_order=args.sort,
                max_videos=args.maximum
            )
        except Exception as e:
            print(f"Channel fetch error: {e}")

    if args.video:
        vid = extract_video_id(args.video)
        if vid: vids.append(vid)
        else: print(f"Bad video URL: {args.video}")

    if args.file:
        if os.path.isfile(args.file):
            with open(args.file, encoding='utf-8') as f:
                for line in f:
                    link = line.strip()
                    vid = extract_video_id(link)
                    if vid: vids.append(vid)
                    else: print(f"Bad URL in {args.file}: {link}")
        else:
            print(f"File not found: {args.file}")

    vids = list(dict.fromkeys(vids))
    if not vids:
        print("No videos to process."); return

    # apply length filters (allow space/comma‐separated in one string or multiple args)
    if args.length:
        raw_tokens = []
        for chunk in args.length:
            raw_tokens += re.split(r'[,\s]+', chunk.strip())
        raw_tokens = [t for t in raw_tokens if t]
        try:
            exprs = [parse_length_expr(tok) for tok in raw_tokens]
        except argparse.ArgumentTypeError as e:
            print(e)
            return

        durations = fetch_video_durations(yt, vids)
        filtered = []
        for vid in vids:
            d = durations.get(vid)
            if d is None:
                continue  # skip if we can't get duration
            ok = True
            for op, secs in exprs:
                if op == '<'  and not (d <  secs): ok = False
                if op == '<=' and not (d <= secs): ok = False
                if op == '>'  and not (d >  secs): ok = False
                if op == '>=' and not (d >= secs): ok = False
            if ok:
                filtered.append(vid)
        vids = filtered

    total = len(vids)
    if total == 0:
        print("No videos match length filters."); return

    errors = []
    found_keywords = set()

    for idx, vid in enumerate(vids, start=1):
        segments, err = fetch_transcription_segments(vid)
        if err:
            errors.append(err)
        else:
            matches = []
            CONTEXT = 1
            for i, seg in enumerate(segments):
                text = seg.get('text','')
                low_text = text.lower()
                # record any keyword found in this segment
                for kw in keywords:
                    if kw.lower() in low_text:
                        found_keywords.add(kw.lower())
                if any(kw.lower() in low_text for kw in keywords):
                    start_idx = max(0, i - CONTEXT)
                    end_idx   = min(len(segments), i + CONTEXT + 1)
                    ctx = " ".join(s.get('text','') for s in segments[start_idx:end_idx])
                    ts = seg.get('start', 0)
                    link = f"https://www.youtube.com/watch?v={vid}&t={int(ts)}s"
                    tstr = format_timestamp(ts)
                    snippet = highlight(ctx, keywords)
                    matches.append((link, tstr, snippet))

            if matches:
                clear_line()
                title = get_video_title(yt, vid)
                print(f"\n{BLUE}{title}{RESET}\n")
                for link, tstr, snippet in matches:
                    print(f"{link}  ({YELLOW}{tstr}{RESET})\n  …{snippet}…\n")

        update_progress(idx, total)

    # report keywords not found
    missing = [kw for kw in keywords if kw.lower() not in found_keywords]
    if missing:
        print("\nKeywords not found:")
        for kw in missing:
            print(f"  - {kw}")

    if errors:
        print("\nErrors:")
        for e in errors:
            print(" ", e)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nstopping..")
        sys.exit(0)