
Commit 1a48b52

Fix Demozoo importer (#257)
1 parent 8c27d1a commit 1a48b52

File tree

3 files changed: +119 -144 lines changed


scrapers/py_importers/demozoo.py

+16 -19
@@ -3,17 +3,8 @@
 # URL is structured in this way:
 # https://demozoo.org/productions/?platform={internal_no_platform}&production_type={internal_prodtype_number}
 
-import sys
-import re
-import os
-import json
-import shutil
-import zipfile
-import fnmatch
-import urllib3
+
 import requests
-import unicodedata
-from urllib.request import urlopen
 from bs4 import BeautifulSoup
 
 from py_common.Logger import Logger
@@ -71,13 +62,15 @@ def scrape(platform):
     page = requests.get(baseurl + "/productions/?platform=" + str(PLATFORMS[platform][0]) + "&page=1", timeout=None)
     soup = BeautifulSoup(page.content, 'html.parser')
 
-    # get total number of pages
-    span_pages = soup.find("span", {"class":"current"})
-    numberofpages = int(str.strip(span_pages.text).split(" ")[-1].split(".")[0])
-    logger.write("[INFO]", "Total number of pages: " + str(numberofpages) )
-
     # parsing every page
-    for i in range(0, numberofpages):
+    enough_page = True
+    i = 0
+    while enough_page:
+        if soup.find('a', {"title": "Next_page"}):
+            enough_page = True
+        else:
+            enough_page = False
+
         logger.write("[INFO]", "Parsing page: " + str(i+1) )
         #TODO: dont call twice this page, as it is called before
 
@@ -107,7 +100,7 @@ def scrape(platform):
 
         # check if it could be added to database or not
         # building files
-        ret = utils.build(prod, entrypath, ["GB", "GBC"]) # TODO: GBA, add GBA to this list
+        ret = utils.build(prod, entrypath, ["gb", "gbc"]) # TODO: GBA, add GBA to this list
 
         # make required JSON file
         if ret != 1:
@@ -165,7 +158,7 @@ def scrape_page(slug, url, platform):
 
     # fetching screenshot
     screen_obj = soup.find('a', {"class": "screenshot"})
-    if screen_obj != None:
+    if screen_obj is not None:
         screenshot = screen_obj.get("href")
     else:
         screenshot = "None"
@@ -178,7 +171,7 @@ def scrape_page(slug, url, platform):
 
     # fetching url (if present)
     url = soup.find('ul', {"class": "download_links"})
-    if url != None:
+    if url is not None:
         url = url.findChildren("a")
     else:
         # it doesn't make any sense to have a prod without DL link
@@ -196,11 +189,15 @@ def scrape_page(slug, url, platform):
     elif len(url) >= 2:
         # because almost always the prod will have the secondary mirror as scene.org or smth like that
         url = url[1].get("href")
+        if "scene.org" in url and "view" in url:
+            url = url.replace("view", "get")
 
     # fetching video
     video = soup.find(lambda tag: tag.name == "a" and "youtube" in tag.text.lower())
     video = video.get("href") if video else ""
 
+    files = [f"{slug}.{platform.lower()}"]
+
     return Production(title, slug, developer, platform, typetag, screenshots, files, video, repository=source, url=url)
 
 def main():
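
Note on the pagination change above: instead of computing the total page count from the pager span up front, the importer now keeps requesting listing pages for as long as a "Next_page" link is present in the markup. A minimal standalone sketch of that pattern (the helper name and the 30-second timeout are illustrative assumptions, not code from this repository):

import requests
from bs4 import BeautifulSoup

def iter_listing_pages(listing_url):
    # Yield (page_number, parsed_page) until the pager stops advertising a next page.
    page_number = 1
    while True:
        response = requests.get(f"{listing_url}&page={page_number}", timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")
        yield page_number, soup
        if not soup.find("a", {"title": "Next_page"}):
            break
        page_number += 1

Here listing_url is assumed to already carry the ?platform=... query string, matching the URL built in scrape().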

scrapers/py_importers/py_common/utils.py

+102 -125
@@ -1,28 +1,20 @@
-import sys
-import py_common.utils
 import re
 import json
 import shutil
 import zipfile
 import fnmatch
-import urllib3
 import requests
-import unicodedata
 import contextlib
 import urllib
-from urllib.request import urlopen
-import imghdr
 from PIL import Image
 
 import os
 from os import listdir
-from os.path import isfile, join
 
-from bs4 import BeautifulSoup
 from unidecode import unidecode
 
 from py_common.Logger import Logger
-from py_common.Production import Production
+import py7zr
 
 ###########################
 ### GLOBAL VAR AND CONS ###
@@ -115,139 +107,124 @@ def fetch_prod_name(prod, suffix, filepath):
     return path
 
 
-def build(prod: Production, entrypath: str, desired_extentions: list):
+
+def build(prod, entrypath: str, desired_extensions: list):
     '''
-    given a prod "Production" object containing
-    all production's data, create a proper named folder, fetches all files (screenshot + rom)
-    and properly organize everything
+    Given a prod "Production" object containing
+    all production's data, create a properly named folder, fetch all files (screenshot + ROM),
+    and organize everything.
     '''
-    if not os.path.exists(entrypath + prod.slug):
-        #############
-        # PROD FILE #
-        #############
-        # make its own folder
-        os.mkdir(entrypath + prod.slug, 0o777)
-
-        # figuring out the suffix
-        suffix = str.lower(prod.url.split(".")[-1])
-        if suffix not in desired_extentions:
-            suffix = "gb"
-
-        # building the filepath
-        filepath = entrypath + prod.slug + "/"
-
-        # download the file
-        # in case of http
-        if prod.url.startswith("http"):
-            try:
-                r = requests.get(prod.url, allow_redirects=True,
-                                 timeout=None, verify=False, headers=headers)
-                if r.status_code != 200:
-                    logger.write("[ERR]:", str(r.status_code) +
-                                 ": " + prod.slug + " - " + prod.url)
-
-                    # cleaning in case of error
-                    shutil.rmtree(entrypath + prod.slug)
-                    return 1
-            except ConnectionError as e:
-                logger.write("[ERR]:", str(r.status_code) +
-                             ": " + prod.slug + " - " + prod.url)
-                logger.write("[ERR]:", "REASON: " + e)
-
-                # cleaning in case of error
-                shutil.rmtree(entrypath + prod.slug)
-                return 1
-            open(filepath + prod.slug + "." + suffix, 'wb').write(r.content)
-        else:
-            with contextlib.closing(urllib.request.urlopen(prod.url)) as r:
-                with open(filepath + prod.slug + "." + suffix, 'wb') as f:
-                    shutil.copyfileobj(r, f)
-
-        # unzip in case of zip
-        if prod.url.endswith(".zip") or prod.url.endswith(".ZIP"):
-            # download and unzip
-            try:
-                with zipfile.ZipFile(filepath + prod.slug + "." + suffix, "r") as zip_ref:
-                    zip_ref.extractall(filepath + "unzippedfolder")
+    # Create folder if not already present
+    target_folder = os.path.join(entrypath, prod.slug)
+    if not os.path.exists(target_folder):
+        os.mkdir(target_folder, 0o777)
 
-                # manage all extensions, and it doesn't matter if they have uppercase or lowercase
-                path = [] # eventually the file
+        # Extract file extension
+        suffix = prod.url.split(".")[-1].lower()
+
+        if suffix not in desired_extensions and suffix not in ["zip", "7z", "mp4"]:
+            print(f"ERROR: {prod.slug} extension is not in {desired_extensions}")
+            suffix = "gb" # Fallback extension
 
-                extentions = fix_extentions(desired_extentions)
-                for extension in extentions:
-                    path = fetch_prod_name(prod, extension, filepath)
-                    if path != []:
-                        break
+        # Build the file path
+        filepath = os.path.join(target_folder, f"{prod.slug}.{suffix}")
 
-                # proper renaming and moving the file
-                if path != []:
-                    os.rename(path[0], filepath + prod.slug +
-                              "." + extension.lower())
+        # Download the file
+        try:
+            if prod.url.startswith("http"):
+                r = requests.get(prod.url, allow_redirects=True, timeout=None, verify=False)
+                if r.status_code != 200:
+                    raise Exception(f"HTTP Error {r.status_code}")
+                with open(filepath, 'wb') as f:
+                    f.write(r.content)
+            else:
+                with contextlib.closing(urllib.request.urlopen(prod.url)) as r:
+                    with open(filepath, 'wb') as f:
+                        shutil.copyfileobj(r, f)
+        except Exception as e:
+            logger.write("[ERR]:", f"Error downloading {prod.slug}: {e}")
+            shutil.rmtree(target_folder)
+            return 1
+
+        # Unzip and handle files
+        if suffix in ["zip", "7z"]:
+            unzipped_path = os.path.join(target_folder, "unzippedfolder")
+            os.makedirs(unzipped_path, exist_ok=True)
 
-                    # update production object file
-                    prod.files.append(prod.slug + "." + extension.lower())
-                else:
-                    logger.write(
-                        "[WARN]", prod.title + " extension is not a " + prod.platform + " file.")
-                    shutil.rmtree(entrypath + prod.slug)
-                    return 1
-
-                # cleaning up unneeded files
-                shutil.rmtree(filepath + "unzippedfolder")
-                if CLEANZIP:
-                    os.remove(filepath + prod.slug + "." + "zip")
-            except zipfile.BadZipFile as e:
-                logger.write("[ERR] ", str(e) + " bad zip file")
-                shutil.rmtree(entrypath + prod.slug)
+            try:
+                if suffix == "zip":
+                    with zipfile.ZipFile(filepath, "r") as zip_ref:
+                        zip_ref.extractall(unzipped_path)
+                elif suffix == "7z":
+                    with py7zr.SevenZipFile(filepath, mode='r') as z:
+                        z.extractall(unzipped_path)
+            except Exception as e:
+                logger.write("[ERR]:", f"Failed to extract {suffix} file: {e}")
+                shutil.rmtree(target_folder)
                 return 1
-        else:
-            # it is a proper gb file -> just write the filename in its own structure field
-            pass
-
-        # download the screenshot
-        if prod.screenshots != None and prod.screenshots != [] and prod.screenshots[0] != "None":
-            r = requests.get(
-                prod.screenshots[0], allow_redirects=True, timeout=None)
-
-            # figuring out what kind of screenshots I am dealing with
-            screen_file_path = filepath + prod.slug + "."
-
-            # screenshot fileext
-            screen_ext = prod.screenshots[0].split(".")[-1]
-            logger.write("[INFO]", " The screenshot is in " +
-                         screen_ext + " format")
 
-            if screen_ext.lower() == "png":
-                screen_file_path += "png"
-            else:
-                screen_file_path += screen_ext
-
-            open(screen_file_path, 'wb').write(r.content)
+            # Search for desired extensions in the extracted folder
+            valid_file_found = False
+
+            # Recursively search all files under the unzipped path
+            for root, _, files in os.walk(unzipped_path):
+                for file in files:
+                    ext = file.split(".")[-1].lower()
+                    if ext in desired_extensions:
+                        extracted_file = os.path.join(root, file)
+                        final_file = os.path.join(target_folder, f"{prod.slug}.{ext}")
+
+                        # Move the valid file to the target folder
+                        shutil.move(extracted_file, final_file)
+                        prod.files.append(f"{prod.slug}.{ext}")
+
+                        valid_file_found = True
+                        break
+
+                if valid_file_found:
+                    break
 
-            if screen_ext != "png":
-                im = Image.open(screen_file_path).convert("RGB")
-                im.save(filepath + prod.slug + ".png", "png")
+            if not valid_file_found:
+                logger.write("[WARN]:", f"No valid files with extensions {desired_extensions} found.")
+                shutil.rmtree(target_folder)
+                return 1
 
-                logger.write(
-                    "[INFO]", " Screenshot has been converted into a PNG file.")
-                logger.write("[INFO]", " Removing screenshot " +
-                             screen_ext + " file...")
+            # Clean up unzipped files and original archive
+            shutil.rmtree(unzipped_path)
+            if CLEANZIP:
+                os.remove(filepath)
+        else:
+            prod.files.append(f"{prod.slug}.{suffix}")
 
-                os.remove(screen_file_path)
+        # Handle screenshots
+        if prod.screenshots and prod.screenshots[0] != "None":
+            try:
+                r = requests.get(prod.screenshots[0], allow_redirects=True, timeout=None)
+                screen_ext = prod.screenshots[0].split(".")[-1].lower()
+                screen_file = os.path.join(target_folder, f"{prod.slug}.{screen_ext}")
+                with open(screen_file, 'wb') as f:
+                    f.write(r.content)
+
+                # Convert to PNG if necessary
+                if screen_ext != "png":
+                    img = Image.open(screen_file).convert("RGB")
+                    png_file = os.path.join(target_folder, f"{prod.slug}.png")
+                    img.save(png_file, "PNG")
+                    os.remove(screen_file)
+                    prod.screenshots[0] = f"{prod.slug}.png"
+                else:
+                    prod.screenshots[0] = f"{prod.slug}.png"
+            except Exception as e:
+                logger.write("[ERR]:", f"Failed to download screenshot for {prod.slug}: {e}")
+                prod.screenshots = []
 
-            open(filepath + prod.slug + "." + "png", 'wb').write(r.content)
-            prod.screenshots[0] = prod.slug + "." + "png"
-        else:
-            prod.screenshots = []
-            logger.write(
-                "[INFO]", "Screenshot not present for this production")
     else:
-        logger.write(
-            "[WARN]", "directory already present. Skipping " + prod.slug + "...")
+        logger.write("[WARN]:", f"Directory already exists for {prod.slug}. Skipping...")
         return 1
     return 0
 
 
+
 def fix_extentions(desired_extentions):
     '''
     given a theorical list of extensions, it returns a list containing additional correct extensions (like CGB, AGB)
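
For context on the archive handling introduced in build() above: downloads ending in .zip are extracted with zipfile, .7z archives with the newly added py7zr dependency, and the unpacked tree is then walked for the first file whose extension matches the desired ROM extensions. A hedged, self-contained sketch of that flow (the function name and default extensions are illustrative assumptions, not code from this repository):

import os
import zipfile

import py7zr

def extract_and_find_rom(archive_path, out_dir, wanted_exts=("gb", "gbc")):
    # Extract a .zip or .7z archive into out_dir, then return the path of the
    # first extracted file whose extension is in wanted_exts (or None).
    os.makedirs(out_dir, exist_ok=True)
    lowered = archive_path.lower()
    if lowered.endswith(".zip"):
        with zipfile.ZipFile(archive_path, "r") as zf:
            zf.extractall(out_dir)
    elif lowered.endswith(".7z"):
        with py7zr.SevenZipFile(archive_path, mode="r") as sz:
            sz.extractall(path=out_dir)
    for root, _, files in os.walk(out_dir):
        for name in files:
            if name.split(".")[-1].lower() in wanted_exts:
                return os.path.join(root, name)
    return None

In the importer itself the matching file is then moved next to the archive and renamed after the production slug before being recorded in prod.files.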

scrapers/py_importers/requirements.txt

+1 -0
@@ -12,3 +12,4 @@ webencodings==0.5.1
 wget==3.2
 webptools==0.0.5
 pillow==8.3.2
+py7zr==0.22.0
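
Of the dependencies pinned above, py7zr (added by this commit) is what the rewritten utils.build() uses to open .7z archives, while pillow (already listed) performs the screenshot conversion. A small hedged sketch of that Pillow step, with a placeholder function name and path handling:

from PIL import Image

def ensure_png(screenshot_path):
    # Convert a downloaded screenshot to PNG if it is not one already,
    # returning the path of the resulting PNG file.
    if screenshot_path.lower().endswith(".png"):
        return screenshot_path
    png_path = screenshot_path.rsplit(".", 1)[0] + ".png"
    Image.open(screenshot_path).convert("RGB").save(png_path, "PNG")
    return png_path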
