Skip to content

Commit 9ab1754

Browse files
committed
Fix private video error and improve subset-gb control
1 parent f88c252 commit 9ab1754

2 files changed

Lines changed: 25 additions & 6 deletions

File tree

youtube2zim/scraper.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import re
1818
import shutil
1919
import subprocess
20+
import pandas as pd
2021
import tempfile
2122
from gettext import gettext as _
2223
from pathlib import Path
@@ -481,8 +482,25 @@ def extract_videos_list(self):
481482
# we only return video_ids that we'll use later on. per-playlist JSON stored
482483
for playlist in self.playlists:
483484
videos_json = get_videos_json(playlist.playlist_id)
484-
if self.subset_videos:
485+
486+
# we filter out videos if subset is requested
487+
if self.subset_videos or self.subset_by or self.subset_gb:
485488
videos_json = subset_videos_json(videos_json, self.subset_by, self.subset_videos)
489+
# print a table of videos to be downloaded using pandas with the columns
490+
# video_id, title, view_count, published_at
491+
if self.subset_gb:
492+
# build the table, save it as table.csv in the output dir, then print it
493+
df = pd.DataFrame(videos_json)
494+
df = df[['contentDetails', 'statistics', 'snippet']]
495+
df['video_id'] = df['contentDetails'].apply(lambda x: x['videoId'])
496+
df['title'] = df['snippet'].apply(lambda x: x['title'])
497+
df['view_count'] = df['statistics'].apply(lambda x: x['viewCount'])
498+
df['published_at'] = df['snippet'].apply(lambda x: x['publishedAt'])
499+
df = df[['video_id', 'title', 'view_count', 'published_at']]
500+
df.to_csv(self.output_dir / "table.csv", index=False)
501+
print(df)
502+
#
503+
# exit(0)
486504

487505
# we replace videos titles if --custom-titles is used
488506
if self.custom_titles:

youtube2zim/youtube.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def subset_videos_json(videos, subset_by, subset_videos):
199199
"""make a list of popular or recent videos"""
200200
playlist_id = videos[0]["snippet"]["playlistId"]
201201
video_ids = [video["contentDetails"]["videoId"] for video in videos]
202-
# we get the video statistics via Youtube API
202+
# we get the video statistics via the YouTube API
203203
video_stats = {}
204204
for i in range(0, len(video_ids), 50):
205205
video_ids_chunk = video_ids[i : i + 50]
@@ -217,10 +217,11 @@ def subset_videos_json(videos, subset_by, subset_videos):
217217
video_stats_json = req.json()
218218
for video in video_stats_json["items"]:
219219
video_stats[video["id"]] = video["statistics"]
220-
# we add the statistics to the videos
220+
for video_id in video_ids_chunk:
221+
if video_id not in video_stats:
222+
video_stats[video_id] = {"viewCount": 0, "likeCount": 0, "dislikeCount": 0}
221223
for video in videos:
222224
video["statistics"] = video_stats[video["contentDetails"]["videoId"]]
223-
# we sort the videos
224225
if subset_by == "views":
225226
videos = sorted(videos, key=lambda video: video["statistics"]["viewCount"], reverse=True)
226227
elif subset_by == "recent":
@@ -234,9 +235,9 @@ def subset_videos_json(videos, subset_by, subset_videos):
234235
years = now.year - published_at.year
235236
video["statistics"]["views_per_year"] = int(views) / (years + 1)
236237
videos = sorted(videos, key=lambda video: video["statistics"]["views_per_year"], reverse=True)
237-
# we make a subset of the videos
238+
# we limit the number of videos if needed
238239
if subset_videos is not None:
239-
videos = videos[:max_videos]
240+
videos = videos[:subset_videos]
240241
save_json(YOUTUBE.cache_dir, f"playlist_{playlist_id}_videos", videos)
241242
return videos
242243

0 commit comments

Comments
 (0)