-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathscraper.py
More file actions
104 lines (85 loc) · 3.86 KB
/
scraper.py
File metadata and controls
104 lines (85 loc) · 3.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import random
import string
def get_game_list(callback, driver_path, url):
    """Scrape the game list found at *url* and pass it to *callback*.

    A headless Chrome session is started through the chromedriver binary at
    *driver_path*. After the list element is present in the DOM, the page
    source is parsed with BeautifulSoup and every ``<li>`` that contains a
    link is turned into a ``(title, href)`` tuple. The resulting list is
    handed to ``update_game_list`` and then to *callback*.

    Args:
        callback: Callable invoked with the list of ``(title, href)`` tuples.
            It is not invoked when the list element is missing or when an
            error occurs while loading or parsing the page.
        driver_path: Path to the chromedriver executable.
        url: Address of the page that holds the game list.

    Returns:
        None. Errors raised after the browser has started are printed, not
        propagated; errors while starting the browser are propagated.
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--disable-gpu")  # Disable GPU
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
    # Chrome expects "width,height" here; the "1920x1080" spelling is not the
    # documented format for this switch.
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    # Set up the WebDriver
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)

        # Wait for the specific element to be present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="main-wrapper"]/div/div/div/div[1]/div/div[3]/ul'))
        )

        # Process the page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        game_list_element = soup.select_one('#main-wrapper > div > div > div > div:nth-child(1) > div > div:nth-child(3) > ul')

        if game_list_element:
            games = []
            for li in game_list_element.find_all('li'):
                a_tag = li.find('a')
                # A list item without a link (or a link without href) must
                # not abort the whole scrape; skip just that entry.
                if a_tag is None:
                    continue
                href = a_tag.get('href')
                if href is None:
                    continue
                # "href" rather than "url" so the parameter is not shadowed.
                games.append((a_tag.get_text(strip=True), href))

            # Optionally update cached games
            update_game_list(games)
            callback(games)
        else:
            print("Game list element not found in page source.")
    except Exception as e:
        # Best-effort boundary: report the problem instead of crashing.
        print(f"Error fetching the page: {e}")
    finally:
        # Always shut the browser down, even after a failure.
        driver.quit()
def generate_random_filename():
    """Return a file name of the form ``list_XXXXX.json``.

    ``XXXXX`` is five characters drawn at random (with replacement) from the
    ASCII letters and digits.
    """
    alphabet = string.ascii_letters + string.digits
    suffix = "".join(random.choices(alphabet, k=5))
    return "list_" + suffix + ".json"
def compare_and_delete_files(new_file, cached_file):
    """Report titles present in *new_file* but not in *cached_file*.

    Both files are read as JSON; each is iterated and the first item of
    every entry is taken as the title. Titles that appear only in *new_file*
    are printed. *new_file* is treated as a temporary file and is removed
    before returning, including when reading or parsing fails.

    Args:
        new_file: Path to the temporary JSON file with the fresh game list.
        cached_file: Path to the JSON file with the previously cached list.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file does not contain valid JSON.
    """
    try:
        with open(new_file, "r", encoding="utf-8") as new_f:
            new_games = json.load(new_f)
        with open(cached_file, "r", encoding="utf-8") as cached_f:
            cached_games = json.load(cached_f)

        new_titles = {game[0] for game in new_games}
        cached_titles = {game[0] for game in cached_games}

        added_games = new_titles - cached_titles
        if added_games:
            print(f"New games added: {added_games}")
    finally:
        # Delete the temporary file even if the comparison failed, so a
        # corrupt cache does not leave stray files behind. Both handles are
        # closed by this point.
        if os.path.exists(new_file):
            os.remove(new_file)
def update_game_list(new_games):
    """Compare *new_games* with the cached list, then overwrite the cache.

    The fresh list is first written to a randomly named temporary JSON file
    in the current working directory. If ``cached_games.json`` exists, the
    two files are compared via ``compare_and_delete_files``. Finally the
    list, sorted by the first item of each entry, is written to
    ``cached_games.json``.

    Args:
        new_games: Sequence of entries whose first item is the title;
            entries must be JSON-serialisable.
    """
    cached_file = "cached_games.json"
    new_file = generate_random_filename()

    with open(new_file, "w", encoding="utf-8") as file:
        json.dump(new_games, file, indent=4)

    try:
        if os.path.exists(cached_file):
            compare_and_delete_files(new_file, cached_file)
    finally:
        # Make sure the temporary snapshot never outlives this call: nothing
        # else removes it when no cache exists yet (first run), and a failed
        # comparison may leave it behind.
        if os.path.exists(new_file):
            os.remove(new_file)

    # Sort games alphabetically by title and save to cache
    sorted_games = sorted(new_games, key=lambda x: x[0])
    with open(cached_file, "w", encoding="utf-8") as file:
        json.dump(sorted_games, file, indent=4)
def load_cached_games():
    """Return the contents of ``cached_games.json``, or ``[]`` if absent.

    The file is looked up relative to the current working directory and
    decoded as UTF-8 JSON.
    """
    cache_path = "cached_games.json"
    if not os.path.exists(cache_path):
        return []
    with open(cache_path, "r", encoding="utf-8") as handle:
        cached = json.load(handle)
    return cached
if __name__ == "__main__":
    # Example usage: scrape the list and print every entry to stdout.
    def print_games(games):
        """Print one ``Title: ..., URL: ...`` line per game."""
        for title, url in games:
            print(f"Title: {title}, URL: {url}")

    get_game_list(
        callback=print_games,
        driver_path="chromedriver.exe",
        url="https://steamunlocked.net/all-games-2/",
    )