6
6
7
7
import requests
8
8
from bs4 import BeautifulSoup
9
-
9
+ import re
10
10
from py_common .Logger import Logger
11
11
from py_common .Production import Production
12
12
import py_common .utils as utils
13
+ from datetime import datetime
13
14
14
15
########################
15
16
### GLOBAL VARIABLES ###
@@ -86,7 +87,7 @@ def scrape(platform):
86
87
# get rows; for each rows, get the name of the prod and the internal link
87
88
for link in links :
88
89
demozoo_internal_link = baseurl + "/" + link .get ("href" )
89
-
90
+ print ( demozoo_internal_link )
90
91
# building slug: all lowercase, each word separated by hyphen, no special character
91
92
slug = utils .build_slug (link .text )
92
93
@@ -115,7 +116,21 @@ def scrape(platform):
115
116
elif slug in globalgameslist :
116
117
logger .write ("[WARN]" , " " + slug + " already in entries folder!" )
117
118
119
def parse_date(date_string):
    """Extract a date from free-form text and normalize it to YYYY-MM-DD.

    Recognizes three shapes inside *date_string*:
      - "15 June 2020"  -> "2020-06-15"
      - "June 2020"     -> "2020-06-01"  (day defaults to 1)
      - "2020"          -> "2020-01-01"  (month/day default to January 1st)

    :param date_string: arbitrary text containing a date (e.g. a Demozoo
        "Released ..." attribute line)
    :returns: the date formatted as "%Y-%m-%d"
    :raises ValueError: if no recognizable date is found, or the matched
        text cannot be parsed (e.g. abbreviated month names)
    """
    date_part = re.search(
        r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string
    )

    if not date_part:
        raise ValueError(f"No recognizable date found in: {date_string}")

    date_part = date_part.group(0)  # extract the matched substring

    # BUG FIX: the regex accepts three formats, but the original code only
    # ever tried "%d %B %Y", so "June 2020" / "2020" matched the regex and
    # then crashed in strptime. Try each supported format in order.
    for fmt in ("%d %B %Y", "%B %Y", "%Y"):
        try:
            parsed_date = datetime.strptime(date_part, fmt)
            # Convert to desired format
            return parsed_date.strftime("%Y-%m-%d")
        except ValueError:
            continue

    raise ValueError(f"Unsupported date format: {date_part}")
118
132
def scrape_page (slug , url , platform ):
133
+ demozoo_url = url
119
134
'''
120
135
given a slug and demozoo production url, it returns an object containing everything useful
121
136
to build a file hierarchy
@@ -131,6 +146,17 @@ def scrape_page(slug, url, platform):
131
146
# getting title
132
147
title = str .strip (soup .find ('div' , {"class" : "production_title focus_title" }).findChildren ("h2" )[0 ].text )
133
148
149
+ date_string = str .strip (soup .find ('ul' , {"class" : "attributes" }).findChildren ("li" )[0 ].text )
150
+
151
+ release_date = None
152
+
153
+ try :
154
+ release_date = parse_date (date_string )
155
+ print (date_string , "->" , parse_date (date_string ))
156
+ except :
157
+ print ("nodate" )
158
+
159
+
134
160
logger .write ("[INFO]" , " Adding: " + title + " ..." )
135
161
136
162
# getting developer
@@ -198,7 +224,7 @@ def scrape_page(slug, url, platform):
198
224
199
225
files = [f"{ slug } .{ platform .lower ()} " ]
200
226
201
- return Production (title , slug , developer , platform , typetag , screenshots , files , video , repository = source , url = url )
227
+ return Production (title , slug , developer , platform , typetag , screenshots , files , video , date = release_date , repository = source , url = demozoo_url )
202
228
203
229
def main ():
204
230
for platform in PLATFORMS .keys ():
0 commit comments