########################
### GLOBAL VARIABLES ###
########################
- globalgameslist = utils.gimme_global_games_list() # slug in entries folder
- logger = Logger(utils.PREFERRED_OUTPUT) # logger will print in file or on console depending on params in utils.PREFERRED_OUTPUT --> LOG or CONSOLE
+ globalgameslist = utils.gimme_global_games_list()  # slug in entries folder
+ logger = Logger(
+     utils.PREFERRED_OUTPUT
+ )  # logger will print in file or on console depending on params in utils.PREFERRED_OUTPUT --> LOG or CONSOLE

baseurl = "https://demozoo.org"
blacklist = [
-     #"missing-colors", # file in a folder...must solve this ASAP
-     "pdroms-com-relaunch" # duplicate file (and it doesn't have devs specified)
+     # "missing-colors", # file in a folder...must solve this ASAP
+     "pdroms-com-relaunch"  # duplicate file (and it doesn't have devs specified)
]

#############
### DEBUG ###
#############
- added = [] # debug
- #as a friendly reminder, remember to change utils.DEBUG flag!
+ added = []  # debug
+ # as a friendly reminder, remember to change utils.DEBUG flag!

#################
### CONSTANTS ###
#################

- #TODO: GBA placeholder intentionally left here for future development.
+ # TODO: GBA placeholder intentionally left here for future development.
##
- # dict containing demozoo's categories,
- # with a mapped "simplified" category according to CONTRIBUTING.MD
- # "game", "homebrew", "demo" or "hackrom"
+ # dict containing demozoo's categories,
+ # with a mapped "simplified" category according to CONTRIBUTING.MD
+ # "game", "homebrew", "demo" or "hackrom"
##
PLATFORMS = {
    "Gameboy": [38, "GB"],

# Default: "../../entries
entrypath = "py_common/" + utils.BETA_FOLDER + "/" if utils.DEBUG else "../../entries"

+
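For orientation: each PLATFORMS entry pairs Demozoo's numeric platform id with the short code later used as the entry's file extension; the id goes into the productions query string, while the code is lowercased into the ROM filename. A minimal sketch of that usage (the slug below is made up):

    platform_id, short_code = PLATFORMS["Gameboy"]  # 38, "GB"
    listing_url = baseurl + "/productions/?platform=" + str(platform_id) + "&page=1"
    rom_filename = "some-slug." + short_code.lower()  # -> "some-slug.gb"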

#################
### FUNCTIONS ###
#################
def scrape(platform):
-     '''
-     scrape Demozoo prods page and fetches all links
-     - each link will be processed (scraped) and a Production object will be built
-     - this object will be used to build JSON, files and folders
-     '''
+     """
+     scrape Demozoo prods page and fetches all links
+     - each link will be processed (scraped) and a Production object will be built
+     - this object will be used to build JSON, files and folders
+     """
    logger.write("[INFO]", "Scraping platform " + platform)
-     page = requests.get(baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=1", timeout=None)
-     soup = BeautifulSoup(page.content, 'html.parser')
+     page = requests.get(
+         baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=1",
+         timeout=None,
+     )
+     soup = BeautifulSoup(page.content, "html.parser")

    # parsing every page
    enough_page = True
    i = 0
    while enough_page:
-         if soup.find('a', {"title": "Next_page"}):
+         if soup.find("a", {"title": "Next_page"}):
            enough_page = True
        else:
            enough_page = False

-         logger.write("[INFO]", "Parsing page: " + str(i + 1) )
-         #TODO: dont call twice this page, as it is called before
-
-         page = requests.get(baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=" + str(i + 1), timeout=None)
-         soup = BeautifulSoup(page.content, 'html.parser')
+         logger.write("[INFO]", "Parsing page: " + str(i + 1))
+         # TODO: dont call twice this page, as it is called before
+
+         page = requests.get(
+             baseurl
+             + "/productions/?platform="
+             + str(PLATFORMS[platform][0])
+             + "&page="
+             + str(i + 1),
+             timeout=None,
+         )
+         soup = BeautifulSoup(page.content, "html.parser")

        # get the big prods table
-         prodTable = soup.findAll('tbody')[0].findAll('a')
+         prodTable = soup.findAll("tbody")[0].findAll("a")

        # get links "worth to parse" (those ones that links to a production page)
-         links = [ link for link in prodTable if "productions" in link.get("href") ]
+         links = [link for link in prodTable if "productions" in link.get("href")]

        # get rows; for each rows, get the name of the prod and the internal link
        for link in links:
@@ -94,19 +107,21 @@ def scrape(platform):
            if slug not in globalgameslist and slug not in blacklist:
                # scrape demozoo's page: the returned object will be used to build the file hierarchy
                prod = scrape_page(slug, demozoo_internal_link, PLATFORMS[platform][1])
-
+
                if prod != -1:
-                     #DBGPRINT slugprint
-                     #print(prod.slug)
+                     # DBGPRINT slugprint
+                     # print(prod.slug)

                    # check if it could be added to database or not
                    # building files
-                     ret = utils.build(prod, entrypath, ["gb", "gbc"]) # TODO: GBA, add GBA to this list
-
+                     ret = utils.build(
+                         prod, entrypath, ["gb", "gbc"]
+                     )  # TODO: GBA, add GBA to this list
+
                    # make required JSON file
                    if ret != 1:
                        ret = utils.makeJSON(prod, entrypath)
-
+
                    # useful to print all added entries (to spot duplicates for example)
                    if utils.DEBUG:
                        added.append(prod.slug)
@@ -116,59 +131,89 @@ def scrape(platform):
            elif slug in globalgameslist:
                logger.write("[WARN]", " " + slug + " already in entries folder!")

+
def parse_date(date_string):
-     date_part = re.search(r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string)
-
+     date_string = date_string.replace("Released ", "")
+
+     date_part = re.search(
+         r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string
+     )
+
    if not date_part:
        raise ValueError(f"No recognizable date found in: {date_string}")
-
+
    date_part = date_part.group(0)  # Extract the matched part
-
-     parsed_date = datetime.strptime(date_part, "%d %B %Y")

-     # Convert to desired format
-     return parsed_date.strftime("%Y-%m-%d")
+     # Determine the format based on the matched part
+     try:
+         if re.match(
+             r"\d{1,2} [A-Za-z]+ \d{4}", date_part
+         ):  # Full date like "1 January 2024"
+             parsed_date = datetime.strptime(date_part, "%d %B %Y")
+             return parsed_date.strftime("%Y-%m-%d")
+         elif re.match(r"[A-Za-z]+ \d{4}", date_part):  # Month and year like "June 2009"
+             parsed_date = datetime.strptime(date_part, "%B %Y")
+             return parsed_date.strftime("%Y-%m")
+         elif re.match(r"\d{4}", date_part):  # Year only like "2009"
+             parsed_date = datetime.strptime(date_part, "%Y")
+             return parsed_date.strftime("%Y")
+     except ValueError as e:
+         raise ValueError(f"Error parsing date: {e}")
+

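For reference, a rough sketch of what the reworked parse_date returns for the three granularities its branches handle (the sample strings are made up):

    parse_date("Released 17 June 2009")  # -> "2009-06-17"
    parse_date("June 2009")              # -> "2009-06"
    parse_date("2009")                   # -> "2009"
    parse_date("no date at all")         # raises ValueError
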
def scrape_page(slug, url, platform):
    demozoo_url = url
-     '''
+     """
    given a slug and demozoo production url, it returns an object containing everything useful
    to build a file hierarchy
-     '''
+     """
    # init variables
    screenshots = []
    files = []
    typetag = ""

    page = requests.get(url, timeout=None)
-     soup = BeautifulSoup(page.content, 'html.parser')
+     soup = BeautifulSoup(page.content, "html.parser")

    # getting title
-     title = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h2")[0].text)
+     title = str.strip(
+         soup.find("div", {"class": "production_title focus_title"})
+         .findChildren("h2")[0]
+         .text
+     )

-     date_string = str.strip(soup.find('ul', {"class": "attributes"}).findChildren("li")[0].text)
+     date_string = str.strip(
+         soup.find("ul", {"class": "attributes"}).findChildren("li")[0].text
+     )

    release_date = None

    try:
        release_date = parse_date(date_string)
        print(date_string, "->", parse_date(date_string))
    except:
-         print("nodate")
-
+         print("COULDN'T PARSE DATE:", date_string)

    logger.write("[INFO]", " Adding: " + title + " ...")

    # getting developer
-     developer = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h3")[0].findChildren("a")[0].text)
-
+     developer = str.strip(
+         soup.find("div", {"class": "production_title focus_title"})
+         .findChildren("h3")[0]
+         .findChildren("a")[0]
+         .text
+     )
+
    # fetching tag
-     list_typetag = soup.find('li', {"class": "signpost"})
+     list_typetag = soup.find("li", {"class": "signpost"})
    if list_typetag == None:
        typetag = ""
    else:
-         typetag = str.strip(list_typetag.text if not isinstance(list_typetag, list) else list_typetag[0].text)
-
+         typetag = str.strip(
+             list_typetag.text
+             if not isinstance(list_typetag, list)
+             else list_typetag[0].text
+         )

    if "TRO" in typetag.upper() or "DEMO" in typetag.upper():
        typetag = "demo"
@@ -181,9 +226,9 @@ def scrape_page(slug, url, platform):
    else:
        logger.write("[WARN]", " We don't care about this category: " + typetag)
        return -1
-
+
    # fetching screenshot
-     screen_obj = soup.find('a', {"class": "screenshot"})
+     screen_obj = soup.find("a", {"class": "screenshot"})
    if screen_obj is not None:
        screenshot = screen_obj.get("href")
    else:
@@ -196,7 +241,7 @@ def scrape_page(slug, url, platform):
    source = source.get("href") if source else ""

    # fetching url (if present)
-     url = soup.find('ul', {"class": "download_links"})
+     url = soup.find("ul", {"class": "download_links"})
    if url is not None:
        url = url.findChildren("a")
    else:
@@ -210,7 +255,10 @@ def scrape_page(slug, url, platform):
    elif len(url) == 1:
        url = url[0].get("href")
        if "modermodemet.se" in url:
-             logger.write("[ERR]", " modermodemet.se is not available, and no other valid link has been found")
+             logger.write(
+                 "[ERR]",
+                 " modermodemet.se is not available, and no other valid link has been found",
+             )
            return -1
    elif len(url) >= 2:
        # because almost always the prod will have the secondary mirror as scene.org or smth like that
@@ -221,19 +269,33 @@ def scrape_page(slug, url, platform):
    # fetching video
    video = soup.find(lambda tag: tag.name == "a" and "youtube" in tag.text.lower())
    video = video.get("href") if video else ""
-
+
    files = [f"{slug}.{platform.lower()}"]

-     return Production(title, slug, developer, platform, typetag, screenshots, files, video, date=release_date, repository=source, url=demozoo_url)
+     return Production(
+         title,
+         slug,
+         developer,
+         platform,
+         typetag,
+         screenshots,
+         files,
+         video,
+         date=release_date,
+         repository=source,
+         url=demozoo_url,
+     )
+

def main():
    for platform in PLATFORMS.keys():
-         logger.write("[INFO]","Parsing platform: " + platform)
+         logger.write("[INFO]", "Parsing platform: " + platform)
        scrape(platform)
-
+
+
main()

if utils.DEBUG:
-     [ logger.write("[TITLE]", f) for f in added ]
+     [logger.write("[TITLE]", f) for f in added]

- logger.write("[INFO]", "demozoo importer ended!")
+ logger.write("[INFO]", "demozoo importer ended!")