Skip to content

Commit 1e417f8

Browse files
committed
demozoo scraper: add original link as website, add release date
1 parent 1a48b52 commit 1e417f8

File tree

2 files changed

+32
-3
lines changed

2 files changed

+32
-3
lines changed

scrapers/py_importers/demozoo.py

+29-3
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66

77
import requests
88
from bs4 import BeautifulSoup
9-
9+
import re
1010
from py_common.Logger import Logger
1111
from py_common.Production import Production
1212
import py_common.utils as utils
13+
from datetime import datetime
1314

1415
########################
1516
### GLOBAL VARIABLES ###
@@ -86,7 +87,7 @@ def scrape(platform):
8687
# get rows; for each rows, get the name of the prod and the internal link
8788
for link in links:
8889
demozoo_internal_link = baseurl + "/" + link.get("href")
89-
90+
print(demozoo_internal_link)
9091
# building slug: all lowercase, each word separated by hyphen, no special character
9192
slug = utils.build_slug(link.text)
9293

@@ -115,7 +116,21 @@ def scrape(platform):
115116
elif slug in globalgameslist:
116117
logger.write("[WARN]", " " + slug + " already in entries folder!")
117118

119+
def parse_date(date_string):
    """Extract a release date from free-form Demozoo text and normalize it.

    Demozoo shows release dates in three shapes: "12 March 2024",
    "March 2024", or just "2024".  The first recognizable occurrence is
    returned in a matching ISO-style form:

      - "YYYY-MM-DD" for full day-month-year dates
      - "YYYY-MM"    for month-year dates
      - "YYYY"       for bare years

    Raises:
        ValueError: if no recognizable date appears in ``date_string``.
    """
    match = re.search(r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string)

    if not match:
        raise ValueError(f"No recognizable date found in: {date_string}")

    date_part = match.group(0)  # the matched date fragment only

    # Try the strptime pattern for each regex alternative in turn.  The
    # original code parsed only "%d %B %Y", so month-year and year-only
    # strings matched the regex but then always raised.  Abbreviated month
    # names (%b) are accepted as a fallback alongside full names (%B).
    formats = (
        ("%d %B %Y", "%Y-%m-%d"),
        ("%d %b %Y", "%Y-%m-%d"),
        ("%B %Y", "%Y-%m"),
        ("%b %Y", "%Y-%m"),
        ("%Y", "%Y"),
    )
    for in_fmt, out_fmt in formats:
        try:
            return datetime.strptime(date_part, in_fmt).strftime(out_fmt)
        except ValueError:
            continue

    # Regex matched but no strptime pattern fit (e.g. a misspelled month).
    raise ValueError(f"No recognizable date found in: {date_string}")
131+
118132
def scrape_page(slug, url, platform):
133+
demozoo_url = url
119134
'''
120135
given a slug and demozoo production url, it returns an object containing everything useful
121136
to build a file hierarchy
@@ -131,6 +146,17 @@ def scrape_page(slug, url, platform):
131146
# getting title
132147
title = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h2")[0].text)
133148

149+
date_string = str.strip(soup.find('ul', {"class": "attributes"}).findChildren("li")[0].text)
150+
151+
release_date = None
152+
153+
try:
154+
release_date = parse_date(date_string)
155+
print(date_string, "->", parse_date(date_string))
156+
except:
157+
print("nodate")
158+
159+
134160
logger.write("[INFO]", " Adding: " + title + " ...")
135161

136162
# getting developer
@@ -198,7 +224,7 @@ def scrape_page(slug, url, platform):
198224

199225
files = [f"{slug}.{platform.lower()}"]
200226

201-
return Production(title, slug, developer, platform, typetag, screenshots, files, video, repository=source, url=url)
227+
return Production(title, slug, developer, platform, typetag, screenshots, files, video, date=release_date, repository=source, url=demozoo_url)
202228

203229
def main():
204230
for platform in PLATFORMS.keys():

scrapers/py_importers/py_common/utils.py

+3
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ def build(prod, entrypath: str, desired_extensions: list):
198198

199199
# Handle screenshots
200200
if prod.screenshots and prod.screenshots[0] != "None":
201+
print(prod.screenshots)
201202
try:
202203
r = requests.get(prod.screenshots[0], allow_redirects=True, timeout=None)
203204
screen_ext = prod.screenshots[0].split(".")[-1].lower()
@@ -270,6 +271,8 @@ def makeJSON(prod, entrypath):
270271
"screenshots": [screen for screen in prod.screenshots] if len(prod.screenshots) != 0 else [],
271272
"slug": prod.slug,
272273
"title": prod.title,
274+
"website": [ prod.url ],
275+
"date": prod.date
273276
}
274277

275278
# adding optional fields

0 commit comments

Comments
 (0)