
Commit 1d7f8eb

demozoo: parse YYYY and YYYY-MM dates as well
1 parent 3dd6683 commit 1d7f8eb

1 file changed: +122 −60 lines changed


scrapers/py_importers/demozoo.py

@@ -15,30 +15,32 @@
 ########################
 ### GLOBAL VARIABLES ###
 ########################
-globalgameslist = utils.gimme_global_games_list() # slug in entries folder
-logger = Logger(utils.PREFERRED_OUTPUT) # logger will print in file or on console depending on params in utils.PREFERRED_OUTPUT --> LOG or CONSOLE
+globalgameslist = utils.gimme_global_games_list()  # slug in entries folder
+logger = Logger(
+    utils.PREFERRED_OUTPUT
+)  # logger will print in file or on console depending on params in utils.PREFERRED_OUTPUT --> LOG or CONSOLE
 
 baseurl = "https://demozoo.org"
 blacklist = [
-    #"missing-colors", # file in a folder...must solve this ASAP
-    "pdroms-com-relaunch" # duplicate file (and it doesn't have devs specified)
+    # "missing-colors", # file in a folder...must solve this ASAP
+    "pdroms-com-relaunch"  # duplicate file (and it doesn't have devs specified)
 ]
 
 #############
 ### DEBUG ###
 #############
-added = [] # debug
-#as a friendly reminder, remember to change utils.DEBUG flag!
+added = []  # debug
+# as a friendly reminder, remember to change utils.DEBUG flag!
 
 #################
 ### CONSTANTS ###
 #################
 
-#TODO: GBA placeholder intentionally left here for future development.
+# TODO: GBA placeholder intentionally left here for future development.
 ##
-# dict containing demozoo's categories,
-# with a mapped "simplified" category according to CONTRIBUTING.MD
-# "game", "homebrew", "demo" or "hackrom"
+# dict containing demozoo's categories,
+# with a mapped "simplified" category according to CONTRIBUTING.MD
+# "game", "homebrew", "demo" or "hackrom"
 ##
 PLATFORMS = {
     "Gameboy": [38, "GB"],
@@ -50,39 +52,50 @@
 # Default: "../../entries
 entrypath = "py_common/" + utils.BETA_FOLDER + "/" if utils.DEBUG else "../../entries"
 
+
 #################
 ### FUNCTIONS ###
 #################
 def scrape(platform):
-    '''
-    scrape Demozoo prods page and fetches all links
-    - each link will be processed (scraped) and a Production object will be built
-    - this object will be used to build JSON, files and folders
-    '''
+    """
+    scrape Demozoo prods page and fetches all links
+    - each link will be processed (scraped) and a Production object will be built
+    - this object will be used to build JSON, files and folders
+    """
     logger.write("[INFO]", "Scraping platform " + platform)
-    page = requests.get(baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=1", timeout=None)
-    soup = BeautifulSoup(page.content, 'html.parser')
+    page = requests.get(
+        baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=1",
+        timeout=None,
+    )
+    soup = BeautifulSoup(page.content, "html.parser")
 
     # parsing every page
    enough_page = True
     i = 0
     while enough_page:
-        if soup.find('a', {"title": "Next_page"}):
+        if soup.find("a", {"title": "Next_page"}):
             enough_page = True
         else:
             enough_page = False
 
-        logger.write("[INFO]", "Parsing page: " + str(i+1) )
-        #TODO: dont call twice this page, as it is called before
-
-        page = requests.get(baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=" + str(i+1), timeout=None)
-        soup = BeautifulSoup(page.content, 'html.parser')
+        logger.write("[INFO]", "Parsing page: " + str(i + 1))
+        # TODO: dont call twice this page, as it is called before
+
+        page = requests.get(
+            baseurl
+            + "/productions/?platform="
+            + str(PLATFORMS[platform][0])
+            + "&page="
+            + str(i + 1),
+            timeout=None,
+        )
+        soup = BeautifulSoup(page.content, "html.parser")
 
         # get the big prods table
-        prodTable = soup.findAll('tbody')[0].findAll('a')
+        prodTable = soup.findAll("tbody")[0].findAll("a")
 
         # get links "worth to parse" (those ones that links to a production page)
-        links = [ link for link in prodTable if "productions" in link.get("href") ]
+        links = [link for link in prodTable if "productions" in link.get("href")]
 
         # get rows; for each rows, get the name of the prod and the internal link
         for link in links:
@@ -94,19 +107,21 @@ def scrape(platform):
             if slug not in globalgameslist and slug not in blacklist:
                 # scrape demozoo's page: the returned object will be used to build the file hierarchy
                 prod = scrape_page(slug, demozoo_internal_link, PLATFORMS[platform][1])
-
+
                 if prod != -1:
-                    #DBGPRINT slugprint
-                    #print(prod.slug)
+                    # DBGPRINT slugprint
+                    # print(prod.slug)
 
                     # check if it could be added to database or not
                     # building files
-                    ret = utils.build(prod, entrypath, ["gb", "gbc"]) # TODO: GBA, add GBA to this list
-
+                    ret = utils.build(
+                        prod, entrypath, ["gb", "gbc"]
+                    )  # TODO: GBA, add GBA to this list
+
                     # make required JSON file
                     if ret != 1:
                         ret = utils.makeJSON(prod, entrypath)
-
+
                     # useful to print all added entries (to spot duplicates for example)
                     if utils.DEBUG:
                         added.append(prod.slug)
@@ -116,59 +131,89 @@ def scrape(platform):
             elif slug in globalgameslist:
                 logger.write("[WARN]", " " + slug + " already in entries folder!")
 
+
 def parse_date(date_string):
-    date_part = re.search(r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string)
-
+    date_string = date_string.replace("Released ", "")
+
+    date_part = re.search(
+        r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string
+    )
+
     if not date_part:
         raise ValueError(f"No recognizable date found in: {date_string}")
-
+
     date_part = date_part.group(0)  # Extract the matched part
-
-    parsed_date = datetime.strptime(date_part, "%d %B %Y")
 
-    # Convert to desired format
-    return parsed_date.strftime("%Y-%m-%d")
+    # Determine the format based on the matched part
+    try:
+        if re.match(
+            r"\d{1,2} [A-Za-z]+ \d{4}", date_part
+        ):  # Full date like "1 January 2024"
+            parsed_date = datetime.strptime(date_part, "%d %B %Y")
+            return parsed_date.strftime("%Y-%m-%d")
+        elif re.match(r"[A-Za-z]+ \d{4}", date_part):  # Month and year like "June 2009"
+            parsed_date = datetime.strptime(date_part, "%B %Y")
+            return parsed_date.strftime("%Y-%m")
+        elif re.match(r"\d{4}", date_part):  # Year only like "2009"
+            parsed_date = datetime.strptime(date_part, "%Y")
+            return parsed_date.strftime("%Y")
+    except ValueError as e:
+        raise ValueError(f"Error parsing date: {e}")
+
 
 def scrape_page(slug, url, platform):
     demozoo_url = url
-    '''
+    """
     given a slug and demozoo production url, it returns an object containing everything useful
     to build a file hierarchy
-    '''
+    """
     # init variables
     screenshots = []
     files = []
     typetag = ""
 
     page = requests.get(url, timeout=None)
-    soup = BeautifulSoup(page.content, 'html.parser')
+    soup = BeautifulSoup(page.content, "html.parser")
 
     # getting title
-    title = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h2")[0].text)
+    title = str.strip(
+        soup.find("div", {"class": "production_title focus_title"})
+        .findChildren("h2")[0]
+        .text
+    )
 
-    date_string = str.strip(soup.find('ul', {"class": "attributes"}).findChildren("li")[0].text)
+    date_string = str.strip(
+        soup.find("ul", {"class": "attributes"}).findChildren("li")[0].text
+    )
 
     release_date = None
 
     try:
         release_date = parse_date(date_string)
         print(date_string, "->", parse_date(date_string))
     except:
-        print("nodate")
-
+        print("COULDN'T PARSE DATE:", date_string)
 
     logger.write("[INFO]", " Adding: " + title + " ...")
 
     # getting developer
-    developer = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h3")[0].findChildren("a")[0].text)
-
+    developer = str.strip(
+        soup.find("div", {"class": "production_title focus_title"})
+        .findChildren("h3")[0]
+        .findChildren("a")[0]
+        .text
+    )
+
     # fetching tag
-    list_typetag = soup.find('li', {"class": "signpost"})
+    list_typetag = soup.find("li", {"class": "signpost"})
     if list_typetag == None:
         typetag = ""
     else:
-        typetag = str.strip(list_typetag.text if not isinstance(list_typetag, list) else list_typetag[0].text)
-
+        typetag = str.strip(
+            list_typetag.text
+            if not isinstance(list_typetag, list)
+            else list_typetag[0].text
+        )
 
     if "TRO" in typetag.upper() or "DEMO" in typetag.upper():
         typetag = "demo"
@@ -181,9 +226,9 @@ def scrape_page(slug, url, platform):
     else:
         logger.write("[WARN]", " We don't care about this category: " + typetag)
         return -1
-
+
     # fetching screenshot
-    screen_obj = soup.find('a', {"class": "screenshot"})
+    screen_obj = soup.find("a", {"class": "screenshot"})
     if screen_obj is not None:
         screenshot = screen_obj.get("href")
     else:
@@ -196,7 +241,7 @@ def scrape_page(slug, url, platform):
     source = source.get("href") if source else ""
 
     # fetching url (if present)
-    url = soup.find('ul', {"class": "download_links"})
+    url = soup.find("ul", {"class": "download_links"})
     if url is not None:
         url = url.findChildren("a")
     else:
@@ -210,7 +255,10 @@ def scrape_page(slug, url, platform):
     elif len(url) == 1:
         url = url[0].get("href")
         if "modermodemet.se" in url:
-            logger.write("[ERR]", " modermodemet.se is not available, and no other valid link has been found")
+            logger.write(
+                "[ERR]",
+                " modermodemet.se is not available, and no other valid link has been found",
+            )
             return -1
     elif len(url) >= 2:
         # because almost always the prod will have the secondary mirror as scene.org or smth like that
@@ -221,19 +269,33 @@ def scrape_page(slug, url, platform):
     # fetching video
     video = soup.find(lambda tag: tag.name == "a" and "youtube" in tag.text.lower())
     video = video.get("href") if video else ""
-
+
     files = [f"{slug}.{platform.lower()}"]
 
-    return Production(title, slug, developer, platform, typetag, screenshots, files, video, date=release_date, repository=source, url=demozoo_url)
+    return Production(
+        title,
+        slug,
+        developer,
+        platform,
+        typetag,
+        screenshots,
+        files,
+        video,
+        date=release_date,
+        repository=source,
+        url=demozoo_url,
+    )
+
 
 def main():
     for platform in PLATFORMS.keys():
-        logger.write("[INFO]","Parsing platform: " + platform)
+        logger.write("[INFO]", "Parsing platform: " + platform)
         scrape(platform)
-
+
+
 main()
 
 if utils.DEBUG:
-    [ logger.write("[TITLE]", f) for f in added ]
+    [logger.write("[TITLE]", f) for f in added]
 
-logger.write("[INFO]", "demozoo importer ended!")
+logger.write("[INFO]", "demozoo importer ended!")
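Below is a minimal standalone sketch of the date handling this commit introduces. The parse_date body mirrors the committed logic so the three accepted shapes (full date, month and year, bare year) can be checked in isolation; the sample strings and the expected outputs in the comments are illustrative, not taken from real Demozoo pages.

# Quick, standalone sanity check of the date shapes handled after this commit.
import re
from datetime import datetime


def parse_date(date_string):
    date_string = date_string.replace("Released ", "")
    date_part = re.search(
        r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string
    )
    if not date_part:
        raise ValueError(f"No recognizable date found in: {date_string}")
    date_part = date_part.group(0)

    if re.match(r"\d{1,2} [A-Za-z]+ \d{4}", date_part):  # e.g. "1 January 2024"
        return datetime.strptime(date_part, "%d %B %Y").strftime("%Y-%m-%d")
    elif re.match(r"[A-Za-z]+ \d{4}", date_part):  # e.g. "June 2009"
        return datetime.strptime(date_part, "%B %Y").strftime("%Y-%m")
    return datetime.strptime(date_part, "%Y").strftime("%Y")  # bare year, e.g. "2009"


print(parse_date("Released 1 January 2024"))  # 2024-01-01
print(parse_date("Released June 2009"))       # 2009-06
print(parse_date("Released 2009"))            # 2009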
