Skip to content

Commit 5c67b82

Browse files
committed
Refactor subset
1 parent 9ab1754 commit 5c67b82

3 files changed

Lines changed: 58 additions & 51 deletions

File tree

youtube2zim/entrypoint.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def main():
2626
parser.add_argument(
2727
"--subset-by", help="Subset of collection to download",
2828
choices = ["recent", "views", "views-per-year"],
29-
default="views-per-year",
29+
default="recent",
3030
dest="subset_by",
3131
)
3232
parser.add_argument(
@@ -39,6 +39,7 @@ def main():
3939
"--subset-gb",
4040
help="Cumulative size of videos to download (in GB)",
4141
type=float,
42+
default = 0,
4243
dest="subset_gb",
4344
)
4445
parser.add_argument(

youtube2zim/scraper.py

Lines changed: 8 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import re
1818
import shutil
1919
import subprocess
20-
import pandas as pd
2120
import tempfile
2221
from gettext import gettext as _
2322
from pathlib import Path
@@ -483,25 +482,6 @@ def extract_videos_list(self):
483482
for playlist in self.playlists:
484483
videos_json = get_videos_json(playlist.playlist_id)
485484

486-
# we filter out videos if subset is requested
487-
if self.subset_videos or self.subset_by or self.subset_gb:
488-
videos_json = subset_videos_json(videos_json, self.subset_by, self.subset_videos)
489-
# print a table of videos to be downloaded using pandas with the columns
490-
# video_id, title, view_count, published_at
491-
if self.subset_gb:
492-
# print a table
493-
df = pd.DataFrame(videos_json)
494-
df = df[['contentDetails', 'statistics', 'snippet']]
495-
df['video_id'] = df['contentDetails'].apply(lambda x: x['videoId'])
496-
df['title'] = df['snippet'].apply(lambda x: x['title'])
497-
df['view_count'] = df['statistics'].apply(lambda x: x['viewCount'])
498-
df['published_at'] = df['snippet'].apply(lambda x: x['publishedAt'])
499-
df = df[['video_id', 'title', 'view_count', 'published_at']]
500-
df.to_csv(self.output_dir / "table.csv", index=False)
501-
print(df)
502-
#
503-
# exit(0)
504-
505485
# we replace videos titles if --custom-titles is used
506486
if self.custom_titles:
507487
replace_titles(videos_json, self.custom_titles)
@@ -515,6 +495,14 @@ def extract_videos_list(self):
515495
{v["contentDetails"]["videoId"]: v for v in filter_videos}
516496
)
517497
save_json(self.cache_dir, "videos", all_videos)
498+
499+
if self.subset_by or self.subset_videos or self.subset_gb:
500+
all_videos = subset_videos_json(
501+
all_videos, self.subset_by, self.subset_videos, self.subset_gb
502+
)
503+
# we save the subsetted videos json
504+
save_json(self.cache_dir, "videos", all_videos)
505+
518506
self.videos_ids = [*all_videos.keys()] # unpacking so it's subscriptable
519507

520508
def download_video_files(self, max_concurrency):
@@ -547,24 +535,6 @@ def download_video_files(self, max_concurrency):
547535
if self.all_subtitles:
548536
options.update({"writeautomaticsub": True})
549537

550-
# trim the list of videos to download if we have a subset size
551-
if self.subset_gb:
552-
total_size = 0
553-
videos_ids_subset = []
554-
for video_id in self.videos_ids:
555-
video_size = yt_dlp.YoutubeDL(options).extract_info(
556-
video_id, download=False
557-
)["filesize_approx"] / 1024 / 1024 / 1024
558-
if total_size + video_size <= self.subset_gb:
559-
total_size += video_size
560-
videos_ids_subset.append(video_id)
561-
if video_id == self.videos_ids[-1]:
562-
self.videos_ids = videos_ids_subset
563-
break
564-
else:
565-
self.videos_ids = videos_ids_subset
566-
break
567-
568538
# find number of actual parallel workers
569539
nb_videos = len(self.videos_ids)
570540
concurrency = nb_videos if nb_videos < max_concurrency else max_concurrency

youtube2zim/youtube.py

Lines changed: 48 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
# -*- coding: utf-8 -*-
33
# vim: ai ts=4 sts=4 et sw=4 nu
44

5+
import json
56
import requests
7+
import yt_dlp
68

79
from contextlib import ExitStack
810
from dateutil import parser as dt_parser
@@ -195,11 +197,14 @@ def get_videos_json(playlist_id):
195197
save_json(YOUTUBE.cache_dir, fname, items)
196198
return items
197199

198-
def subset_videos_json(videos, subset_by, subset_videos):
200+
def subset_videos_json(videos, subset_by, subset_videos, subset_gb):
199201
"""make a list of popular or recent videos"""
200-
playlist_id = videos[0]["snippet"]["playlistId"]
202+
options = {
203+
"ignoreerrors": True,
204+
}
205+
# we need to query the API for the statistics of each video
206+
videos = json.loads(videos)
201207
video_ids = [video["contentDetails"]["videoId"] for video in videos]
202-
# we get the video statistics via Youtube API
203208
video_stats = {}
204209
for i in range(0, len(video_ids), 50):
205210
video_ids_chunk = video_ids[i : i + 50]
@@ -215,13 +220,24 @@ def subset_videos_json(videos, subset_by, subset_videos):
215220
logger.error(f"HTTP {req.status_code} Error response: {req.text}")
216221
req.raise_for_status()
217222
video_stats_json = req.json()
218-
for video in video_stats_json["items"]:
223+
for video in video_stats_json.get("items", []):
219224
video_stats[video["id"]] = video["statistics"]
220-
for video_id in video_ids_chunk:
221-
if video_id not in video_stats:
222-
video_stats[video_id] = {"viewCount": 0, "likeCount": 0, "dislikeCount": 0}
225+
# we add the statistics to the videos if they are in the video_stats dict
223226
for video in videos:
224-
video["statistics"] = video_stats[video["contentDetails"]["videoId"]]
227+
video_id = video["contentDetails"]["videoId"]
228+
if video_id in video_stats:
229+
video["statistics"] = video_stats[video_id]
230+
else:
231+
logger.error(f"video {video_id} not found in video_stats")
232+
# we add a dummy statistics dict with "0" values
233+
video["statistics"] = {
234+
"viewCount": "0",
235+
"likeCount": "0",
236+
"dislikeCount": "0",
237+
"favoriteCount": "0",
238+
"commentCount": "0",
239+
}
240+
# we sort the videos by views or recent or views-per-year
225241
if subset_by == "views":
226242
videos = sorted(videos, key=lambda video: video["statistics"]["viewCount"], reverse=True)
227243
elif subset_by == "recent":
@@ -235,12 +251,32 @@ def subset_videos_json(videos, subset_by, subset_videos):
235251
years = now.year - published_at.year
236252
video["statistics"]["views_per_year"] = int(views) / (years + 1)
237253
videos = sorted(videos, key=lambda video: video["statistics"]["views_per_year"], reverse=True)
238-
# we limit the number of videos if needed
239-
if subset_videos is not None:
240-
videos = videos[:subset_videos]
241-
save_json(YOUTUBE.cache_dir, f"playlist_{playlist_id}_videos", videos)
254+
if subset_videos != 0:
255+
videos_ids = [video["contentDetails"]["videoId"] for video in videos]
256+
videos_ids_subset = videos_ids[:subset_videos]
257+
videos = [video for video in videos if video["contentDetails"]["videoId"] in videos_ids_subset]
258+
if subset_gb != 0:
259+
total_size = 0
260+
videos_ids_subset = []
261+
for video in videos:
262+
video_id = video["contentDetails"]["videoId"]
263+
video_size = yt_dlp.YoutubeDL(options).extract_info(
264+
video_id, download=False
265+
)["filesize_approx"] / 1024 / 1024 / 1024
266+
if total_size + video_size <= subset_gb:
267+
total_size += video_size
268+
videos_ids_subset.append(video_id)
269+
if video_id == videos[-1]["contentDetails"]["videoId"]:
270+
videos_ids = videos_ids_subset
271+
videos = [video for video in videos if video["contentDetails"]["videoId"] in videos_ids]
272+
break
273+
else:
274+
videos_ids = videos_ids_subset
275+
videos = [video for video in videos if video["contentDetails"]["videoId"] in videos_ids]
276+
break
242277
return videos
243278

279+
244280
# Replace some video titles reading 2 text files, one for the video id and one for the title (called with --custom-titles)
245281
def replace_titles(items, custom_titles):
246282
"""replace video titles with custom titles from file"""

0 commit comments

Comments
 (0)