Refactor subset

deldesir · deldesir · commit 5c67b827a290 · 2023-03-27T13:39:38.000-04:00
diff --git a/youtube2zim/entrypoint.py b/youtube2zim/entrypoint.py
@@ -26,7 +26,7 @@ def main():
     parser.add_argument(
         "--subset-by", help="Subset of collection to download",
         choices = ["recent", "views", "views-per-year"],
-        default="views-per-year",
+        default="recent",
         dest="subset_by",
     )
     parser.add_argument(
@@ -39,6 +39,7 @@ def main():
         "--subset-gb",
         help="Cumulative size of videos to download (in GB)",
         type=float,
+        default = 0,
         dest="subset_gb",
     )
     parser.add_argument(
diff --git a/youtube2zim/scraper.py b/youtube2zim/scraper.py
@@ -17,7 +17,6 @@
 import re
 import shutil
 import subprocess
-import pandas as pd
 import tempfile
 from gettext import gettext as _
 from pathlib import Path
@@ -483,25 +482,6 @@ def extract_videos_list(self):
             for playlist in self.playlists:
                 videos_json = get_videos_json(playlist.playlist_id)
 
-                # we filter out videos if subset is requested
-                if self.subset_videos or self.subset_by or self.subset_gb:
-                    videos_json = subset_videos_json(videos_json, self.subset_by, self.subset_videos)
-                    # print a table of videos to be downloaded using pandas with the columns
-                    # video_id, title, view_count, published_at
-                    if self.subset_gb:
-                        # print a table
-                        df = pd.DataFrame(videos_json)
-                        df = df[['contentDetails', 'statistics', 'snippet']]    
-                        df['video_id'] = df['contentDetails'].apply(lambda x: x['videoId'])
-                        df['title'] = df['snippet'].apply(lambda x: x['title'])
-                        df['view_count'] = df['statistics'].apply(lambda x: x['viewCount'])
-                        df['published_at'] = df['snippet'].apply(lambda x: x['publishedAt'])
-                        df = df[['video_id', 'title', 'view_count', 'published_at']]
-                        df.to_csv(self.output_dir / "table.csv", index=False)
-                        print(df)
-                        # 
-                        # exit(0)
-
                 # we replace videos titles if --custom-titles is used
                 if self.custom_titles:
                     replace_titles(videos_json, self.custom_titles)
@@ -515,6 +495,14 @@ def extract_videos_list(self):
                     {v["contentDetails"]["videoId"]: v for v in filter_videos}
                 )
             save_json(self.cache_dir, "videos", all_videos)
+
+        if self.subset_by or self.subset_videos or self.subset_gb:
+            all_videos = subset_videos_json(
+                all_videos, self.subset_by, self.subset_videos, self.subset_gb
+            )
+            # we save the videos json again, now reduced to the requested subset
+            save_json(self.cache_dir, "videos", all_videos)
+
         self.videos_ids = [*all_videos.keys()]  # unpacking so it's subscriptable
 
     def download_video_files(self, max_concurrency):
@@ -547,24 +535,6 @@ def download_video_files(self, max_concurrency):
         if self.all_subtitles:
             options.update({"writeautomaticsub": True})
 
-        # trim the list of videos to download if we have a subset size
-        if self.subset_gb:
-            total_size = 0
-            videos_ids_subset = []
-            for video_id in self.videos_ids:
-                video_size = yt_dlp.YoutubeDL(options).extract_info(
-                    video_id, download=False
-                )["filesize_approx"] / 1024 / 1024 / 1024
-                if total_size + video_size <= self.subset_gb:
-                    total_size += video_size
-                    videos_ids_subset.append(video_id)
-                    if video_id == self.videos_ids[-1]:
-                        self.videos_ids = videos_ids_subset
-                        break
-                else:
-                    self.videos_ids = videos_ids_subset
-                    break
-
         # find number of actuall parallel workers
         nb_videos = len(self.videos_ids)
         concurrency = nb_videos if nb_videos < max_concurrency else max_concurrency
diff --git a/youtube2zim/youtube.py b/youtube2zim/youtube.py
@@ -2,7 +2,9 @@
 # -*- coding: utf-8 -*-
 # vim: ai ts=4 sts=4 et sw=4 nu
 
+import json
 import requests
+import yt_dlp
 
 from contextlib import ExitStack
 from dateutil import parser as dt_parser
@@ -195,11 +197,14 @@ def get_videos_json(playlist_id):
     save_json(YOUTUBE.cache_dir, fname, items)
     return items
 
-def subset_videos_json(videos, subset_by, subset_videos):
+def subset_videos_json(videos, subset_by, subset_videos, subset_gb):
     """make a list of popular or recent videos"""
-    playlist_id = videos[0]["snippet"]["playlistId"]
+    options = {
+            "ignoreerrors": True,
+        }
+    # we need to query the API for the statistics of each video
+    videos = json.loads(videos)
     video_ids = [video["contentDetails"]["videoId"] for video in videos]
-    # we get the video statistics via Youtube API 
     video_stats = {}
     for i in range(0, len(video_ids), 50):
         video_ids_chunk = video_ids[i : i + 50]
@@ -215,13 +220,24 @@ def subset_videos_json(videos, subset_by, subset_videos):
             logger.error(f"HTTP {req.status_code} Error response: {req.text}")
         req.raise_for_status()
         video_stats_json = req.json()
-        for video in video_stats_json["items"]:
+        for video in video_stats_json.get("items", []):
             video_stats[video["id"]] = video["statistics"]
-        for video_id in video_ids_chunk:
-            if video_id not in video_stats:
-                video_stats[video_id] = {"viewCount": 0, "likeCount": 0, "dislikeCount": 0}
+    # we add the statistics to the videos if they are in the video_stats dict
     for video in videos:
-        video["statistics"] = video_stats[video["contentDetails"]["videoId"]]
+        video_id = video["contentDetails"]["videoId"]
+        if video_id in video_stats:
+            video["statistics"] = video_stats[video_id]
+        else:
+            logger.error(f"video {video_id} not found in video_stats")
+            # we add a dummy statistics dict with "0" values
+            video["statistics"] = {
+                "viewCount": "0",
+                "likeCount": "0",
+                "dislikeCount": "0",
+                "favoriteCount": "0",
+                "commentCount": "0",
+            }
+    # we sort the videos by views, recency or views-per-year
     if subset_by == "views":
         videos = sorted(videos, key=lambda video: video["statistics"]["viewCount"], reverse=True)
     elif subset_by == "recent":
@@ -235,12 +251,32 @@ def subset_videos_json(videos, subset_by, subset_videos):
             years = now.year - published_at.year
             video["statistics"]["views_per_year"] = int(views) / (years + 1)
         videos = sorted(videos, key=lambda video: video["statistics"]["views_per_year"], reverse=True)
-    # we limit the number of videos if needed
-    if subset_videos is not None:
-        videos = videos[:subset_videos]
-    save_json(YOUTUBE.cache_dir, f"playlist_{playlist_id}_videos", videos)
+    if subset_videos != 0:
+        videos_ids = [video["contentDetails"]["videoId"] for video in videos]
+        videos_ids_subset = videos_ids[:subset_videos]
+        videos = [video for video in videos if video["contentDetails"]["videoId"] in videos_ids_subset]
+    if subset_gb != 0:
+        total_size = 0
+        videos_ids_subset = []
+        for video in videos:
+            video_id = video["contentDetails"]["videoId"]
+            video_size = yt_dlp.YoutubeDL(options).extract_info(
+                video_id, download=False
+            )["filesize_approx"] / 1024 / 1024 / 1024
+            if total_size + video_size <= subset_gb:
+                total_size += video_size
+                videos_ids_subset.append(video_id)
+                if video_id == videos[-1]["contentDetails"]["videoId"]:
+                    videos_ids = videos_ids_subset
+                    videos = [video for video in videos if video["contentDetails"]["videoId"] in videos_ids]
+                    break
+            else:
+                videos_ids = videos_ids_subset
+                videos = [video for video in videos if video["contentDetails"]["videoId"] in videos_ids]
+                break
     return videos
 
+
 # Replace some video titles reading 2 text files, one for the video id and one for the title (called with --custom-titles)
 def replace_titles(items, custom_titles):
     """replace video titles with custom titles from file"""