-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfast_download.py
More file actions
129 lines (104 loc) · 4.93 KB
/
fast_download.py
File metadata and controls
129 lines (104 loc) · 4.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python
# fast_download.py - Quickly download YouTube videos and queue them for later processing
import os
import json
import argparse
import time
from typing import List, Dict
from dotenv import load_dotenv
# Import the Config class from the main application
from youtube_qa_app import Config
def main():
    """Parse the command line, download the requested video(s), and report the outcome.

    The positional ``url`` argument is either a single video URL or, with
    ``--batch``, the path to a text file containing one URL per line (blank
    lines are ignored). Every URL is handed to ``download_video`` together
    with the parsed arguments and the cleaned-up tag list.

    Returns:
        None. A missing batch file is reported on stdout and ends the run
        early; individual download failures are counted and summarised
        instead of being silently ignored.
    """
    parser = argparse.ArgumentParser(description="Fast YouTube downloader")
    parser.add_argument("url", help="YouTube video URL or file with URLs")
    parser.add_argument("--batch", action="store_true", help="URL is a file containing multiple URLs")
    parser.add_argument("--tags", help="Comma-separated list of tags to associate with the video")
    parser.add_argument("--whisper-model", choices=["tiny", "base", "small", "medium", "large"],
                        help="Whisper model to use for transcription")
    parser.add_argument("--chunk-size", type=int, help="Override the default chunk size")
    parser.add_argument("--chunk-overlap", type=int, help="Override the default chunk overlap")
    parser.add_argument("--no-update", action="store_true",
                        help="Don't update existing entries (add new ones instead)")
    args = parser.parse_args()

    # Load environment variables
    load_dotenv()

    # Initialize config
    config = Config()

    # Create the audio directory and its "pending" queue directory
    os.makedirs(config.AUDIO_DIR, exist_ok=True)
    pending_dir = os.path.join(config.AUDIO_DIR, "pending")
    os.makedirs(pending_dir, exist_ok=True)

    # Process tags: strip whitespace and drop empty entries such as those
    # produced by "a,,b" or a trailing comma; no usable tag means None.
    tags = None
    if args.tags:
        tags = [tag.strip() for tag in args.tags.split(',') if tag.strip()] or None

    # Collect the URLs to process
    if args.batch:
        # isfile (not exists) so that a directory is reported cleanly
        # instead of crashing inside open().
        if not os.path.isfile(args.url):
            print(f"Batch file not found: {args.url}")
            return

        with open(args.url, 'r') as f:
            urls = [line.strip() for line in f if line.strip()]

        print(f"Found {len(urls)} URLs to process")
    else:
        urls = [args.url]

    # Download each URL and remember whether it succeeded
    results = []
    for position, url in enumerate(urls, start=1):
        if args.batch:
            print(f"\nProcessing URL {position}/{len(urls)}: {url}")
        results.append(download_video(url, config, args, tags))

    failed = sum(1 for ok in results if not ok)
    succeeded = len(results) - failed

    if failed:
        print(f"\n{succeeded} of {len(results)} downloads succeeded, {failed} failed.")
        if not succeeded:
            # Nothing was queued, so there is nothing to process later.
            return
        print("To process the downloaded videos, run:")
    else:
        print("\nAll downloads complete. To process these videos, run:")
    print("python process_later.py --all")
def _extract_video_id(video_url: str) -> str:
    """Return the video ID contained in a YouTube-style URL.

    Recognises the ``v`` query parameter (in any position), ``youtu.be/<id>``
    short links, and ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and
    ``/v/<id>`` paths. Fragments and extra query parameters are ignored.

    Raises:
        ValueError: if no ID can be found, or if the candidate contains
            characters other than letters, digits, ``-`` and ``_``. The ID is
            later used to build file names, so anything else is rejected.
    """
    import re
    from urllib.parse import parse_qs, urlparse

    url = video_url.strip()
    if "://" not in url:
        # urlparse only fills in the host when a scheme is present.
        url = f"https://{url}"

    parsed = urlparse(url)
    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    candidate = None
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        candidate = query_ids[0]
    elif host in ("youtu.be", "www.youtu.be") and segments:
        candidate = segments[0]
    elif len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        candidate = segments[1]

    if not candidate or not re.fullmatch(r"[A-Za-z0-9_-]+", candidate):
        raise ValueError(f"Could not extract a video ID from URL: {video_url}")
    return candidate


def _print_download_progress(status: dict) -> None:
    """Progress hook for yt-dlp: rewrite the current console line with the progress."""
    # .get() on both fields: a status dict without the formatted strings
    # must not abort the download with a KeyError.
    percent = status.get('_percent_str', '?')
    total = status.get('_total_bytes_str', 'Unknown size')
    print(f"\rDownloading: {percent} of {total} ", end='')


def download_video(video_url: str, config: Config, args, tags: List[str] = None) -> bool:
    """Download a YouTube video's audio and queue it for later processing.

    If ``<AUDIO_DIR>/<video_id>.mp3`` already exists the download is skipped
    and only the title is looked up; otherwise the audio is fetched with
    yt-dlp and converted to MP3 by its ``FFmpegExtractAudio`` post-processor.
    In both cases a JSON metadata file is written to
    ``<AUDIO_DIR>/pending/<video_id>.json``.

    Args:
        video_url: URL of the video.
        config: Application config; ``AUDIO_DIR``, ``WHISPER_MODEL``,
            ``CHUNK_SIZE`` and ``CHUNK_OVERLAP`` are read from it.
        args: Parsed command-line arguments; ``whisper_model``, ``chunk_size``
            and ``chunk_overlap`` override the config values, and ``no_update``
            (optional attribute) controls the ``update`` flag.
        tags: Tags stored with the metadata, or None.

    Returns:
        True when the metadata file was written, False if anything failed
        (the error is printed, not raised).
    """
    try:
        import yt_dlp

        # Extract video ID from URL
        video_id = _extract_video_id(video_url)

        audio_path = os.path.join(config.AUDIO_DIR, f"{video_id}.mp3")
        pending_dir = os.path.join(config.AUDIO_DIR, "pending")
        pending_file = os.path.join(pending_dir, f"{video_id}.json")

        # Check if audio file already exists
        if os.path.exists(audio_path):
            print(f"Audio file already exists: {audio_path}")
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                info = ydl.extract_info(video_url, download=False)
                title = info.get('title', 'Unknown Title')
            print(f"Using existing audio file for: {title}")
        else:
            # Set up yt-dlp options
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': f'{config.AUDIO_DIR}/{video_id}',
                'progress_hooks': [_print_download_progress],
            }

            # Download the audio
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(video_url, download=True)
                title = info.get('title', 'Unknown Title')
            print(f"\nDownloaded audio for: {title}")

        # An explicit "--chunk-overlap 0" is a real request and must not fall
        # back to the config default, hence the "is not None" test. A chunk
        # size of 0 is still treated as "not given".
        if args.chunk_overlap is not None:
            chunk_overlap = args.chunk_overlap
        else:
            chunk_overlap = config.CHUNK_OVERLAP

        # Create or update pending file with metadata
        metadata = {
            'video_id': video_id,
            'title': title,
            'url': video_url,
            'date_downloaded': time.strftime('%Y-%m-%d %H:%M:%S'),
            'whisper_model': args.whisper_model or config.WHISPER_MODEL,
            'chunk_size': args.chunk_size or config.CHUNK_SIZE,
            'chunk_overlap': chunk_overlap,
            'update': not getattr(args, 'no_update', False),
            'tags': tags
        }

        # Ensure the queue directory exists in case nothing created it beforehand.
        os.makedirs(pending_dir, exist_ok=True)
        with open(pending_file, 'w') as f:
            json.dump(metadata, f, indent=2)

        print(f"Video queued for processing: {title}")
        return True

    except Exception as e:
        # Per-video boundary: report the failure and let the caller continue
        # with the next URL.
        print(f"Error downloading video {video_url}: {str(e)}")
        return False
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()