-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
127 lines (94 loc) · 4.01 KB
/
scraper.py
File metadata and controls
127 lines (94 loc) · 4.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import datetime
import json
import re
import requests
from pathlib import Path
def read_existing_puzzles():
    """Load previously saved puzzle data from ``src/puzzles.js``.

    The file is expected to contain a JavaScript object literal assigned as
    ``export const puzzleData = {...};`` (the layout that
    ``generate_puzzles_js`` writes). The literal is rewritten into JSON with
    three regex substitutions and then decoded.

    This function is best-effort: every failure is reported on stdout and
    results in an empty dict instead of an exception.

    Returns:
        dict: The decoded ``puzzleData`` object, or ``{}`` when the file is
        missing, unreadable, contains no ``puzzleData`` assignment, or the
        converted text is not valid JSON.
    """
    puzzles_file = Path("src/puzzles.js")

    # EAFP: attempt the read and classify the failure, rather than checking
    # exists() first and racing against the filesystem.
    try:
        content = puzzles_file.read_text(encoding="utf-8")
    except FileNotFoundError:
        print("📝 No existing puzzles.js found, creating new file")
        return {}
    except (OSError, UnicodeError) as e:
        print(f"⚠️ Error reading existing puzzles: {e}")
        return {}

    # Non-greedy capture: the object literal ends at the first "};" that
    # follows the assignment.
    pattern = r"export const puzzleData = ({.*?});"
    match = re.search(pattern, content, re.DOTALL)
    if match is None:
        print("⚠️ No puzzle data found in existing file")
        return {}

    obj_str = match.group(1)
    # Convert the JavaScript literal to JSON. Order matters: bare keys are
    # quoted first, then single-quoted strings become double-quoted.
    obj_str = re.sub(r"(\w+):", r'"\1":', obj_str)
    obj_str = re.sub(r"'([^']*)'", r'"\1"', obj_str)
    # JSON does not allow trailing commas before a closing brace/bracket.
    obj_str = re.sub(r",(\s*[}\]])", r"\1", obj_str)

    try:
        puzzle_data = json.loads(obj_str)
    except json.JSONDecodeError as e:
        print(f"⚠️ Failed to parse existing puzzle data: {e}")
        return {}

    print(f"📚 Loaded {len(puzzle_data)} existing puzzle dates")
    return puzzle_data
def generate_puzzles_js(puzzle_data):
    """Write ``puzzle_data`` to ``src/puzzles.js`` as a JavaScript module.

    The output is an ``export const puzzleData = {...};`` object literal.
    Dates are emitted in sorted order; within each date only the ``easy``,
    ``medium`` and ``hard`` entries are written, in that order, and any
    that are absent are skipped.

    Args:
        puzzle_data: Mapping of date string to a mapping of difficulty name
            to puzzle string.

    Raises:
        OSError: If the output directory or file cannot be created/written.
    """
    # Collect the pieces in a list and write them in one go instead of
    # growing a string with repeated "+=".
    lines = ["export const puzzleData = {\n"]
    for date in sorted(puzzle_data):
        puzzles = puzzle_data[date]
        lines.append(f" '{date}': {{\n")
        lines.extend(
            f" {difficulty}: '{puzzles[difficulty]}',\n"
            for difficulty in ("easy", "medium", "hard")
            if difficulty in puzzles
        )
        lines.append(" },\n")
    lines.append("};\n")

    output_path = Path("src/puzzles.js")
    # On a first run the "src" directory may not exist yet; create it so the
    # write below does not fail with FileNotFoundError.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("".join(lines))

    total_puzzles = sum(len(puzzles) for puzzles in puzzle_data.values())
    print(f"✅ Updated {output_path}")
    print(f"📈 Total: {len(puzzle_data)} dates, {total_puzzles} puzzles")
def main():
    """Scrape today's sudoku puzzles and merge them into ``src/puzzles.js``.

    Loads the existing puzzle data, downloads the NY Times sudoku page,
    extracts the embedded ``window.gameData`` JSON, stores the easy, medium
    and hard puzzles under today's local date (``YYYY-MM-DD``), and rewrites
    the puzzles file. An existing entry for today is replaced.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        RuntimeError: If the page does not contain the expected
            ``window.gameData`` script block.
        json.JSONDecodeError, KeyError: If the embedded data cannot be
            decoded or lacks the expected difficulty/puzzle entries.
    """
    print("🔍 Scraping today's puzzles...")

    puzzle_data = read_existing_puzzles()

    date_str = datetime.date.today().strftime("%Y-%m-%d")
    if date_str in puzzle_data:
        print(f"📅 Puzzles for {date_str} already exist, updating...")
    else:
        print(f"📅 Adding new puzzles for {date_str}")

    url = "https://www.nytimes.com/puzzles/sudoku/hard"
    # Without a timeout requests may block forever on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors directly instead of a misleading "game data not
    # found" failure further down.
    response.raise_for_status()

    pattern = r'<script type="text\/javascript">window\.gameData = (.+)<\/script><\/div><div id="portal-editorial-content">'
    match = re.search(pattern, response.text)
    if match is None:
        raise RuntimeError("Failed to find game data.")

    data = json.loads(match.group(1))

    # Each puzzle is a sequence of cell values; flatten it into one string.
    today_puzzles = {}
    for difficulty in ("easy", "medium", "hard"):
        puzzle = data[difficulty]["puzzle_data"]["puzzle"]
        puzzle_string = "".join(str(p) for p in puzzle)
        today_puzzles[difficulty] = puzzle_string
        print(f"🧩 Scraped {difficulty}: {puzzle_string}")

    puzzle_data[date_str] = today_puzzles
    generate_puzzles_js(puzzle_data)
    print(f"🎉 Successfully updated puzzles for {date_str}")
# Run the scraper only when this file is executed as a script, so importing
# the module does not trigger a network request or rewrite the puzzles file.
if __name__ == "__main__":
    main()