Skip to content

Commit aafde54

Browse files
committed
refactor(core): eliminate async safety violations and migrate to pathlib
Eliminate all blocking I/O operations in async contexts and modernize file path handling by migrating from os.path to pathlib.Path. BREAKING CHANGE: None - internal refactoring only - Enable ASYNC210, ASYNC230, ASYNC240, ASYNC250 Ruff rules - Wrap blocking urllib.request.urlopen() in run_in_executor - Wrap blocking file operations (open, write) in run_in_executor - Replace blocking os.path calls with async helpers using run_in_executor - Replace blocking input() with await ainput() - Migrate extract.py from os.path to pathlib.Path - Use Path() constructor and / operator for path joining - Use Path.mkdir(), Path.rename() in executor instead of os functions - Create mockable _path_exists() and _path_is_dir() helpers - Add debug logging for all file system operations
1 parent 651c894 commit aafde54

File tree

8 files changed

+300
-115
lines changed

8 files changed

+300
-115
lines changed

pyproject.toml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -226,10 +226,6 @@ select = [
226226
]
227227
ignore = [
228228
"ANN401", # Dynamically typed expressions (typing.Any) are disallowed
229-
"ASYNC210", # TODO Async functions should not call blocking HTTP methods
230-
"ASYNC230", # TODO Async functions should not open files with blocking methods like `open`
231-
"ASYNC240", # TODO Async functions should not use os.path methods, use trio.Path or anyio.path
232-
"ASYNC250", # TODO Blocking call to input() in async context
233229
"COM812", # Trailing comma missing
234230
"D1", # Missing docstring in ...
235231
"D200", # One-line docstring should fit on one line

src/kleinanzeigen_bot/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -937,7 +937,7 @@ async def publish_ad(self, ad_file:str, ad_cfg:Ad, ad_cfg_orig:dict[str, Any], p
937937
LOG.warning("# Payment form detected! Please proceed with payment.")
938938
LOG.warning("############################################")
939939
await self.web_scroll_page_down()
940-
input(_("Press a key to continue..."))
940+
await ainput(_("Press a key to continue...")) # noqa: ASYNC240
941941
except TimeoutError:
942942
pass
943943

@@ -1108,7 +1108,7 @@ async def __set_shipping(self, ad_cfg:Ad, mode:AdUpdateStrategy = AdUpdateStrate
11081108
# in some categories we need to go another dialog back
11091109
try:
11101110
await self.web_find(By.XPATH, '//dialog//button[contains(., "Andere Versandmethoden")]',
1111-
timeout=short_timeout)
1111+
timeout = short_timeout)
11121112
except TimeoutError:
11131113
await self.web_click(By.XPATH, '//dialog//button[contains(., "Zurück")]')
11141114

src/kleinanzeigen_bot/extract.py

Lines changed: 72 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
22
# SPDX-License-Identifier: AGPL-3.0-or-later
33
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
4+
import asyncio
45
from gettext import gettext as _
56

6-
import json, mimetypes, os, re, shutil # isort: skip
7+
import json, mimetypes, re, shutil # isort: skip
78
import urllib.request as urllib_request
89
from datetime import datetime
10+
from pathlib import Path
911
from typing import Any, Final
1012

1113
from kleinanzeigen_bot.model.ad_model import ContactPartial
@@ -25,6 +27,28 @@
2527
BREADCRUMB_RE = re.compile(r"/c(\d+)")
2628

2729

30+
def _path_exists(path:Path | str) -> bool:
31+
"""Helper for Path.exists() that can be mocked in tests."""
32+
return Path(path).exists()
33+
34+
35+
def _path_is_dir(path:Path | str) -> bool:
36+
"""Helper for Path.is_dir() that can be mocked in tests."""
37+
return Path(path).is_dir()
38+
39+
40+
async def _exists(path:Path | str) -> bool:
    """Async variant of _path_exists: runs the blocking check in the default executor."""
    loop = asyncio.get_running_loop()
    exists = await loop.run_in_executor(None, _path_exists, path)  # noqa: ASYNC240
    LOG.debug("Path exists check: %s -> %s", path, exists)
    return exists
44+
45+
46+
async def _isdir(path:Path | str) -> bool:
    """Async variant of _path_is_dir: runs the blocking check in the default executor."""
    loop = asyncio.get_running_loop()
    is_dir = await loop.run_in_executor(None, _path_is_dir, path)  # noqa: ASYNC240
    LOG.debug("Path is_dir check: %s -> %s", path, is_dir)
    return is_dir
50+
51+
2852
class AdExtractor(WebScrapingMixin):
2953
"""
3054
Wrapper class for ad extraction that uses an active bot's browser session to extract specific elements from an ad page.
@@ -44,10 +68,11 @@ async def download_ad(self, ad_id:int) -> None:
4468
"""
4569

4670
# create sub-directory for ad(s) to download (if necessary):
47-
relative_directory = "downloaded-ads"
71+
relative_directory = Path("downloaded-ads")
4872
# make sure configured base directory exists
49-
if not os.path.exists(relative_directory) or not os.path.isdir(relative_directory):
50-
os.mkdir(relative_directory)
73+
if not await _exists(relative_directory) or not await _isdir(relative_directory):
74+
LOG.debug("Creating base directory: %s", relative_directory)
75+
await asyncio.get_running_loop().run_in_executor(None, relative_directory.mkdir)
5176
LOG.info("Created ads directory at ./%s.", relative_directory)
5277

5378
# Extract ad info and determine final directory path
@@ -56,12 +81,26 @@ async def download_ad(self, ad_id:int) -> None:
5681
)
5782

5883
# Save the ad configuration file
59-
ad_file_path = final_dir + "/" + f"ad_{ad_id}.yaml"
84+
ad_file_path = str(Path(final_dir) / f"ad_{ad_id}.yaml")
6085
dicts.save_dict(
6186
ad_file_path,
6287
ad_cfg.model_dump(),
6388
header = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json")
6489

90+
@staticmethod
def _download_and_save_image_sync(url:str, directory:str, filename_prefix:str, img_nr:int) -> str | None:
    """Fetch one ad image over HTTP and write it to disk (blocking; intended for an executor).

    The file extension is derived from the response's Content-Type.
    Returns the local file path on success, or None if the download failed.
    NOTE: the path is built with "/" on purpose — the caller splits on "/" afterwards.
    """
    try:
        with urllib_request.urlopen(url) as response:  # noqa: S310 Audit URL open for permitted schemes.
            mime = response.info().get_content_type()
            suffix = mimetypes.guess_extension(mime) or ""
            target = f"{directory}/{filename_prefix}{img_nr}{suffix}"
            with open(target, "wb") as out:
                shutil.copyfileobj(response, out)
            return target
    except Exception as ex:
        LOG.warning("Failed to download image %s: %s", url, ex)
        return None
103+
65104
async def _download_images_from_ad_page(self, directory:str, ad_id:int) -> list[str]:
66105
"""
67106
Downloads all images of an ad.
@@ -85,17 +124,23 @@ async def _download_images_from_ad_page(self, directory:str, ad_id:int) -> list[
85124
img_nr = 1
86125
dl_counter = 0
87126

127+
loop = asyncio.get_running_loop()
128+
88129
for img_element in images:
89130
current_img_url = img_element.attrs["src"] # URL of the image
90131
if current_img_url is None:
91132
continue
92133

93-
with urllib_request.urlopen(str(current_img_url)) as response: # noqa: S310 Audit URL open for permitted schemes.
94-
content_type = response.info().get_content_type()
95-
file_ending = mimetypes.guess_extension(content_type)
96-
img_path = f"{directory}/{img_fn_prefix}{img_nr}{file_ending}"
97-
with open(img_path, "wb") as f:
98-
shutil.copyfileobj(response, f)
134+
img_path = await loop.run_in_executor(
135+
None,
136+
self._download_and_save_image_sync,
137+
str(current_img_url),
138+
directory,
139+
img_fn_prefix,
140+
img_nr
141+
)
142+
143+
if img_path:
99144
dl_counter += 1
100145
img_paths.append(img_path.rsplit("/", maxsplit = 1)[-1])
101146

@@ -354,8 +399,8 @@ async def _extract_ad_page_info(self, directory:str, ad_id:int) -> AdPartial:
354399
return ad_cfg
355400

356401
async def _extract_ad_page_info_with_directory_handling(
357-
self, relative_directory:str, ad_id:int
358-
) -> tuple[AdPartial, str]:
402+
self, relative_directory:Path, ad_id:int
403+
) -> tuple[AdPartial, Path]:
359404
"""
360405
Extracts ad information and handles directory creation/renaming.
361406
@@ -373,32 +418,37 @@ async def _extract_ad_page_info_with_directory_handling(
373418

374419
# Determine the final directory path
375420
sanitized_title = misc.sanitize_folder_name(title, self.config.download.folder_name_max_length)
376-
final_dir = os.path.join(relative_directory, f"ad_{ad_id}_{sanitized_title}")
377-
temp_dir = os.path.join(relative_directory, f"ad_{ad_id}")
421+
final_dir = relative_directory / f"ad_{ad_id}_{sanitized_title}"
422+
temp_dir = relative_directory / f"ad_{ad_id}"
423+
424+
loop = asyncio.get_running_loop()
378425

379426
# Handle existing directories
380-
if os.path.exists(final_dir):
427+
if await _exists(final_dir):
381428
# If the folder with title already exists, delete it
382429
LOG.info("Deleting current folder of ad %s...", ad_id)
383-
shutil.rmtree(final_dir)
430+
LOG.debug("Removing directory tree: %s", final_dir)
431+
await loop.run_in_executor(None, shutil.rmtree, str(final_dir))
384432

385-
if os.path.exists(temp_dir):
433+
if await _exists(temp_dir):
386434
if self.config.download.rename_existing_folders:
387435
# Rename the old folder to the new name with title
388436
LOG.info("Renaming folder from %s to %s for ad %s...",
389-
os.path.basename(temp_dir), os.path.basename(final_dir), ad_id)
390-
os.rename(temp_dir, final_dir)
437+
temp_dir.name, final_dir.name, ad_id)
438+
LOG.debug("Renaming: %s -> %s", temp_dir, final_dir)
439+
await loop.run_in_executor(None, temp_dir.rename, final_dir)
391440
else:
392441
# Use the existing folder without renaming
393442
final_dir = temp_dir
394443
LOG.info("Using existing folder for ad %s at %s.", ad_id, final_dir)
395444
else:
396445
# Create new directory with title
397-
os.mkdir(final_dir)
446+
LOG.debug("Creating new directory: %s", final_dir)
447+
await loop.run_in_executor(None, final_dir.mkdir)
398448
LOG.info("New directory for ad created at %s.", final_dir)
399449

400450
# Now extract complete ad info (including images) to the final directory
401-
ad_cfg = await self._extract_ad_page_info(final_dir, ad_id)
451+
ad_cfg = await self._extract_ad_page_info(str(final_dir), ad_id)
402452

403453
return ad_cfg, final_dir
404454

src/kleinanzeigen_bot/resources/translations.de.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,9 @@ kleinanzeigen_bot/extract.py:
175175
download_ad:
176176
"Created ads directory at ./%s.": "Verzeichnis für Anzeigen erstellt unter ./%s."
177177

178+
_download_and_save_image_sync:
179+
"Failed to download image %s: %s": "Fehler beim Herunterladen des Bildes %s: %s"
180+
178181
_download_images_from_ad_page:
179182
"Found %s.": "%s gefunden."
180183
"Downloaded %s.": "%s heruntergeladen."

src/kleinanzeigen_bot/utils/web_scraping_mixin.py

Lines changed: 39 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,41 @@ def __init__(self) -> None:
100100
self.profile_name:str | None = None
101101

102102

103+
def _write_initial_prefs(prefs_file:str) -> None:
104+
with open(prefs_file, "w", encoding = "UTF-8") as fd:
105+
json.dump({
106+
"credentials_enable_service": False,
107+
"enable_do_not_track": True,
108+
"google": {
109+
"services": {
110+
"consented_to_sync": False
111+
}
112+
},
113+
"profile": {
114+
"default_content_setting_values": {
115+
"popups": 0,
116+
"notifications": 2 # 1 = allow, 2 = block browser notifications
117+
},
118+
"password_manager_enabled": False
119+
},
120+
"signin": {
121+
"allowed": False
122+
},
123+
"translate_site_blacklist": [
124+
"www.kleinanzeigen.de"
125+
],
126+
"devtools": {
127+
"preferences": {
128+
"currentDockState": '"bottom"'
129+
}
130+
}
131+
}, fd)
132+
133+
134+
async def _exists(path:str) -> bool:
135+
return await asyncio.get_running_loop().run_in_executor(None, os.path.exists, path)
136+
137+
103138
class WebScrapingMixin:
104139

105140
def __init__(self) -> None:
@@ -174,7 +209,7 @@ async def create_browser_session(self) -> None:
174209
LOG.info("Creating Browser session...")
175210

176211
if self.browser_config.binary_location:
177-
ensure(os.path.exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
212+
ensure(await _exists(self.browser_config.binary_location), f"Specified browser binary [{self.browser_config.binary_location}] does not exist.")
178213
else:
179214
self.browser_config.binary_location = self.get_compatible_browser()
180215
LOG.info(" -> Browser binary location: %s", self.browser_config.binary_location)
@@ -289,41 +324,14 @@ async def create_browser_session(self) -> None:
289324
profile_dir = os.path.join(cfg.user_data_dir, self.browser_config.profile_name or "Default")
290325
os.makedirs(profile_dir, exist_ok = True)
291326
prefs_file = os.path.join(profile_dir, "Preferences")
292-
if not os.path.exists(prefs_file):
327+
if not await _exists(prefs_file):
293328
LOG.info(" -> Setting chrome prefs [%s]...", prefs_file)
294-
with open(prefs_file, "w", encoding = "UTF-8") as fd:
295-
json.dump({
296-
"credentials_enable_service": False,
297-
"enable_do_not_track": True,
298-
"google": {
299-
"services": {
300-
"consented_to_sync": False
301-
}
302-
},
303-
"profile": {
304-
"default_content_setting_values": {
305-
"popups": 0,
306-
"notifications": 2 # 1 = allow, 2 = block browser notifications
307-
},
308-
"password_manager_enabled": False
309-
},
310-
"signin": {
311-
"allowed": False
312-
},
313-
"translate_site_blacklist": [
314-
"www.kleinanzeigen.de"
315-
],
316-
"devtools": {
317-
"preferences": {
318-
"currentDockState": '"bottom"'
319-
}
320-
}
321-
}, fd)
329+
await asyncio.get_running_loop().run_in_executor(None, _write_initial_prefs, prefs_file)
322330

323331
# load extensions
324332
for crx_extension in self.browser_config.extensions:
325333
LOG.info(" -> Adding Browser extension: [%s]", crx_extension)
326-
ensure(os.path.exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
334+
ensure(await _exists(crx_extension), f"Configured extension-file [{crx_extension}] does not exist.")
327335
cfg.add_extension(crx_extension)
328336

329337
try:

0 commit comments

Comments
 (0)