Fix private video error and improve subset-gb control

deldesir · deldesir · commit 9ab1754b6cf3 · 2023-03-24T20:57:21.000-04:00
diff --git a/youtube2zim/scraper.py b/youtube2zim/scraper.py
@@ -17,6 +17,7 @@
 import re
 import shutil
 import subprocess
+import pandas as pd
 import tempfile
 from gettext import gettext as _
 from pathlib import Path
@@ -481,8 +482,25 @@ def extract_videos_list(self):
             # we only return video_ids that we'll use later on. per-playlist JSON stored
             for playlist in self.playlists:
                 videos_json = get_videos_json(playlist.playlist_id)
-                if self.subset_videos:
+
+                # we filter out videos if subset is requested
+                if self.subset_videos or self.subset_by or self.subset_gb:
                     videos_json = subset_videos_json(videos_json, self.subset_by, self.subset_videos)
+                    # when subset_gb is set, dump the selected videos as a pandas table with the
+                    # columns video_id, title, view_count, published_at (saved to table.csv, printed)
+                    if self.subset_gb:
+                        # flatten the nested contentDetails/statistics/snippet fields into one row per video
+                        df = pd.DataFrame(videos_json)
+                        df = df[['contentDetails', 'statistics', 'snippet']]    
+                        df['video_id'] = df['contentDetails'].apply(lambda x: x['videoId'])
+                        df['title'] = df['snippet'].apply(lambda x: x['title'])
+                        df['view_count'] = df['statistics'].apply(lambda x: x['viewCount'])
+                        df['published_at'] = df['snippet'].apply(lambda x: x['publishedAt'])
+                        df = df[['video_id', 'title', 'view_count', 'published_at']]
+                        df.to_csv(self.output_dir / "table.csv", index=False)
+                        print(df)
+                        # 
+                        # exit(0)
 
                 # we replace videos titles if --custom-titles is used
                 if self.custom_titles:
diff --git a/youtube2zim/youtube.py b/youtube2zim/youtube.py
@@ -199,7 +199,7 @@ def subset_videos_json(videos, subset_by, subset_videos):
     """make a list of popular or recent videos"""
     playlist_id = videos[0]["snippet"]["playlistId"]
     video_ids = [video["contentDetails"]["videoId"] for video in videos]
-    # we get the video statistics via Youtube API
+    # we get the video statistics via Youtube API 
     video_stats = {}
     for i in range(0, len(video_ids), 50):
         video_ids_chunk = video_ids[i : i + 50]
@@ -217,10 +217,11 @@ def subset_videos_json(videos, subset_by, subset_videos):
         video_stats_json = req.json()
         for video in video_stats_json["items"]:
             video_stats[video["id"]] = video["statistics"]
-    # we add the statistics to the videos
+        for video_id in video_ids_chunk:
+            if video_id not in video_stats:
+                video_stats[video_id] = {"viewCount": 0, "likeCount": 0, "dislikeCount": 0}
     for video in videos:
         video["statistics"] = video_stats[video["contentDetails"]["videoId"]]
-    # we sort the videos
     if subset_by == "views":
         videos = sorted(videos, key=lambda video: video["statistics"]["viewCount"], reverse=True)
     elif subset_by == "recent":
@@ -234,9 +235,9 @@ def subset_videos_json(videos, subset_by, subset_videos):
             years = now.year - published_at.year
             video["statistics"]["views_per_year"] = int(views) / (years + 1)
         videos = sorted(videos, key=lambda video: video["statistics"]["views_per_year"], reverse=True)
-    # we make a subset of the videos
+    # we limit the number of videos if needed
     if subset_videos is not None:
-        videos = videos[:max_videos]
+        videos = videos[:subset_videos]
     save_json(YOUTUBE.cache_dir, f"playlist_{playlist_id}_videos", videos)
     return videos