-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path12_Infinite_Scrolling.py
More file actions
147 lines (108 loc) · 4.79 KB
/
Copy path12_Infinite_Scrolling.py
File metadata and controls
147 lines (108 loc) · 4.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time
import re
import pandas as pd
from datetime import datetime
# ------- BROWSER SETUP ------------ #
options = Options()
# options.add_argument('--headless')
# Chrome parses this switch as "--window-size=<width>,<height>"; the
# "1920x1080" spelling is not understood by a regular (non-headless) window.
options.add_argument('--window-size=1920,1080')
# NOTE: macOS-specific Chrome location; adjust when running on another OS.
options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"

# Maximum time to wait for a menu element to show up before giving up.
ELEMENT_TIMEOUT_SECONDS = 10

# ------- XPATHS USED DURING NAVIGATION ------------ #
COOKIE_CONSENT_XPATH = '//*[@id="content"]/div[2]/div[6]/div[1]/ytd-button-renderer[1]/yt-button-shape/button/yt-touch-feedback-shape/div/div[2]'
SETTINGS_MENU_XPATH = '//yt-icon[@class = "style-scope ytd-topbar-menu-button-renderer"]'
REGION_ENTRY_XPATH = '//div[@class="menu-container style-scope ytd-multi-page-menu-renderer"]//yt-multi-page-menu-section-renderer[2]/div[2]/ytd-compact-link-renderer[3]'
LANGUAGE_ENTRY_XPATH = '//div[@class="menu-container style-scope ytd-multi-page-menu-renderer"]//yt-multi-page-menu-section-renderer[2]/div[2]/ytd-compact-link-renderer[1]'
# Positional entries in the region / language lists. The indexes are taken
# from the original script (labelled "United States" and "English USA" there)
# and will break if YouTube reorders those lists.
UNITED_STATES_XPATH = '//*[@id="items"]/ytd-compact-link-renderer[104]'
ENGLISH_US_XPATH = '//*[@id="items"]/ytd-compact-link-renderer[13]'

# Instantiate the Chrome driver with the specified options
driver = webdriver.Chrome(options=options)


def click_when_present(xpath, pause_seconds):
    """Wait for the element at *xpath*, click it, then pause.

    An explicit wait is used for the lookup so that a slow-rendering menu
    does not abort the run; the fixed pause afterwards gives the page time
    to react to the click.

    Args:
        xpath: XPath expression locating the element to click.
        pause_seconds: Seconds to sleep after the click.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element does not
            appear within ELEMENT_TIMEOUT_SECONDS.
    """
    element = WebDriverWait(driver, ELEMENT_TIMEOUT_SECONDS).until(
        lambda d: d.find_element(By.XPATH, xpath)
    )
    element.click()
    time.sleep(pause_seconds)


# ---- URL ----- #
driver.get('https://www.youtube.com/')
time.sleep(3)

# ----- Cookie consent ----- #
# The consent dialog is not guaranteed to be shown, so look it up with
# find_elements (returns an empty list instead of raising) and only click
# when it is actually there.
consent_buttons = driver.find_elements(By.XPATH, COOKIE_CONSENT_XPATH)
if consent_buttons:
    consent_buttons[0].click()
    time.sleep(3)

# ----- Switch region to United States ----- #
click_when_present(SETTINGS_MENU_XPATH, 1)
click_when_present(REGION_ENTRY_XPATH, 1)
click_when_present(UNITED_STATES_XPATH, 5)

# ----- Switch language to English (US) ----- #
click_when_present(SETTINGS_MENU_XPATH, 3)
click_when_present(LANGUAGE_ENTRY_XPATH, 1)
click_when_present(ENGLISH_US_XPATH, 3)
# ------ SCROLLING ----------- #
# Define a function to scroll to the bottom of the page
def scroll_to_bottom(pause_seconds=5, max_scrolls=None):
    """Scroll the page down repeatedly until its height stops growing.

    Each round measures the document height, scrolls to it, waits for the
    document to report ``readyState == "complete"``, sleeps so lazily loaded
    content can be appended, and measures again. The loop ends when two
    consecutive measurements are equal, or when ``max_scrolls`` rounds have
    been performed.

    Uses the module-level ``driver``.

    Args:
        pause_seconds: Seconds to sleep after each scroll before measuring
            the height again. Defaults to 5.
        max_scrolls: Upper bound on the number of scroll rounds. ``None``
            (the default) means no bound, so on a feed that never stops
            growing the loop never terminates.

    Raises:
        selenium.common.exceptions.TimeoutException: If the document does
            not reach ``readyState == "complete"`` within 10 seconds.
    """
    page_height_js = (
        "return Math.max( document.body.scrollHeight, document.body.offsetHeight, "
        "document.documentElement.clientHeight, document.documentElement.scrollHeight, "
        "document.documentElement.offsetHeight );"
    )

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Get the current height of the page
        current_height = driver.execute_script(page_height_js)

        # Scroll to the bottom of the page
        driver.execute_script(f"window.scrollTo(0, {current_height});")
        scrolls_done += 1

        # Wait for the document to settle, then give new content time to load
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        time.sleep(pause_seconds)

        # An unchanged height means nothing new was appended: bottom reached
        new_height = driver.execute_script(page_height_js)
        if new_height == current_height:
            break
# -------- ELEMENTS ----------- #
# Main Path: one "details" container per video tile in the home grid
video_row_path = '//ytd-rich-grid-row[@class= "style-scope ytd-rich-grid-renderer"]//div[@id = "details"]'
# Video Title
video_title_path = './/a[contains(@id,"video-title-link")]/yt-formatted-string'
# Channel name
channel_name_path = './/div[contains(@class,"ytd-channel-name")]/div/yt-formatted-string/a'
# Views: first span of the metadata line (original note: "Excluding shorts")
views_path = './/*[@id="metadata-line"]/span[1]'
# Uploaded time is deliberately not scraped: per the original note, the
# second metadata span ('.//*[@id="metadata-line"]/span[2]') cannot handle
# live streams.


def first_text(parent, xpath):
    """Return the text of the first element under *parent* matching *xpath*.

    ``find_elements`` is used so that a tile lacking the element yields an
    empty string instead of raising and aborting the whole extraction.

    Args:
        parent: WebElement to search within.
        xpath: XPath expression, relative to *parent*.

    Returns:
        The matched element's text, or ``''`` when nothing matches.
    """
    matches = parent.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else ''


# ---- Extraction ------- #
video_title = []
channel = []
video_views = []

try:
    # Call the function to scroll to the bottom of the page
    scroll_to_bottom()
    time.sleep(3)

    # Extraction timestamp, shared by every row of this run
    extraction_timestamp_var = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    videos = driver.find_elements(By.XPATH, video_row_path)
    for video in videos:
        video_title.append(first_text(video, video_title_path))
        channel.append(first_text(video, channel_name_path))
        video_views.append(first_text(video, views_path))
finally:
    # Always close the browser and stop the driver, even if scraping failed
    driver.quit()

extraction_timestamp = [extraction_timestamp_var] * len(video_title)

print(len(video_title))
print(len(channel))
print(len(video_views))
print(len(extraction_timestamp))

# Create a dictionary with the variables
data = {
    'Video_Title': video_title,
    'Channel_Name': channel,
    'Views': video_views,
    'Extraction_Timestamp': extraction_timestamp,
}

# Create a dataframe
df = pd.DataFrame(data)

# Export Data as CSV
df.to_csv('youtube_homepage_videos.csv', index=False)