11# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
22# SPDX-License-Identifier: AGPL-3.0-or-later
33# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
4+ import asyncio
45from gettext import gettext as _
56
6- import json , mimetypes , os , re , shutil # isort: skip
7+ import json , mimetypes , re , shutil # isort: skip
78import urllib .request as urllib_request
89from datetime import datetime
10+ from pathlib import Path
911from typing import Any , Final
1012
1113from kleinanzeigen_bot .model .ad_model import ContactPartial
2527BREADCRUMB_RE = re .compile (r"/c(\d+)" )
2628
2729
30+ def _path_exists (path :Path | str ) -> bool :
31+ """Helper for Path.exists() that can be mocked in tests."""
32+ return Path (path ).exists ()
33+
34+
35+ def _path_is_dir (path :Path | str ) -> bool :
36+ """Helper for Path.is_dir() that can be mocked in tests."""
37+ return Path (path ).is_dir ()
38+
39+
40+ async def _exists (path :Path | str ) -> bool :
41+ result = await asyncio .get_running_loop ().run_in_executor (None , _path_exists , path ) # noqa: ASYNC240
42+ LOG .debug ("Path exists check: %s -> %s" , path , result )
43+ return result
44+
45+
46+ async def _isdir (path :Path | str ) -> bool :
47+ result = await asyncio .get_running_loop ().run_in_executor (None , _path_is_dir , path ) # noqa: ASYNC240
48+ LOG .debug ("Path is_dir check: %s -> %s" , path , result )
49+ return result
50+
51+
2852class AdExtractor (WebScrapingMixin ):
2953 """
3054 Wrapper class for ad extraction that uses an active bot´s browser session to extract specific elements from an ad page.
@@ -44,10 +68,11 @@ async def download_ad(self, ad_id:int) -> None:
4468 """
4569
4670 # create sub-directory for ad(s) to download (if necessary):
47- relative_directory = "downloaded-ads"
71+ relative_directory = Path ( "downloaded-ads" )
4872 # make sure configured base directory exists
49- if not os .path .exists (relative_directory ) or not os .path .isdir (relative_directory ):
50- os .mkdir (relative_directory )
73+ if not await _exists (relative_directory ) or not await _isdir (relative_directory ):
74+ LOG .debug ("Creating base directory: %s" , relative_directory )
75+ await asyncio .get_running_loop ().run_in_executor (None , relative_directory .mkdir )
5176 LOG .info ("Created ads directory at ./%s." , relative_directory )
5277
5378 # Extract ad info and determine final directory path
@@ -56,12 +81,26 @@ async def download_ad(self, ad_id:int) -> None:
5681 )
5782
5883 # Save the ad configuration file
59- ad_file_path = final_dir + "/" + f"ad_{ ad_id } .yaml"
84+ ad_file_path = str ( Path ( final_dir ) / f"ad_{ ad_id } .yaml" )
6085 dicts .save_dict (
6186 ad_file_path ,
6287 ad_cfg .model_dump (),
6388 header = "# yaml-language-server: $schema=https://raw.githubusercontent.com/Second-Hand-Friends/kleinanzeigen-bot/refs/heads/main/schemas/ad.schema.json" )
6489
90+ @staticmethod
91+ def _download_and_save_image_sync (url :str , directory :str , filename_prefix :str , img_nr :int ) -> str | None :
92+ try :
93+ with urllib_request .urlopen (url ) as response : # noqa: S310 Audit URL open for permitted schemes.
94+ content_type = response .info ().get_content_type ()
95+ file_ending = mimetypes .guess_extension (content_type ) or ""
96+ img_path = f"{ directory } /{ filename_prefix } { img_nr } { file_ending } "
97+ with open (img_path , "wb" ) as f :
98+ shutil .copyfileobj (response , f )
99+ return img_path
100+ except Exception as e :
101+ LOG .warning ("Failed to download image %s: %s" , url , e )
102+ return None
103+
65104 async def _download_images_from_ad_page (self , directory :str , ad_id :int ) -> list [str ]:
66105 """
67106 Downloads all images of an ad.
@@ -85,17 +124,23 @@ async def _download_images_from_ad_page(self, directory:str, ad_id:int) -> list[
85124 img_nr = 1
86125 dl_counter = 0
87126
127+ loop = asyncio .get_running_loop ()
128+
88129 for img_element in images :
89130 current_img_url = img_element .attrs ["src" ] # URL of the image
90131 if current_img_url is None :
91132 continue
92133
93- with urllib_request .urlopen (str (current_img_url )) as response : # noqa: S310 Audit URL open for permitted schemes.
94- content_type = response .info ().get_content_type ()
95- file_ending = mimetypes .guess_extension (content_type )
96- img_path = f"{ directory } /{ img_fn_prefix } { img_nr } { file_ending } "
97- with open (img_path , "wb" ) as f :
98- shutil .copyfileobj (response , f )
134+ img_path = await loop .run_in_executor (
135+ None ,
136+ self ._download_and_save_image_sync ,
137+ str (current_img_url ),
138+ directory ,
139+ img_fn_prefix ,
140+ img_nr
141+ )
142+
143+ if img_path :
99144 dl_counter += 1
100145 img_paths .append (img_path .rsplit ("/" , maxsplit = 1 )[- 1 ])
101146
@@ -354,8 +399,8 @@ async def _extract_ad_page_info(self, directory:str, ad_id:int) -> AdPartial:
354399 return ad_cfg
355400
356401 async def _extract_ad_page_info_with_directory_handling (
357- self , relative_directory :str , ad_id :int
358- ) -> tuple [AdPartial , str ]:
402+ self , relative_directory :Path , ad_id :int
403+ ) -> tuple [AdPartial , Path ]:
359404 """
360405 Extracts ad information and handles directory creation/renaming.
361406
@@ -373,32 +418,37 @@ async def _extract_ad_page_info_with_directory_handling(
373418
374419 # Determine the final directory path
375420 sanitized_title = misc .sanitize_folder_name (title , self .config .download .folder_name_max_length )
376- final_dir = os .path .join (relative_directory , f"ad_{ ad_id } _{ sanitized_title } " )
377- temp_dir = os .path .join (relative_directory , f"ad_{ ad_id } " )
421+ final_dir = relative_directory / f"ad_{ ad_id } _{ sanitized_title } "
422+ temp_dir = relative_directory / f"ad_{ ad_id } "
423+
424+ loop = asyncio .get_running_loop ()
378425
379426 # Handle existing directories
380- if os . path . exists (final_dir ):
427+ if await _exists (final_dir ):
381428 # If the folder with title already exists, delete it
382429 LOG .info ("Deleting current folder of ad %s..." , ad_id )
383- shutil .rmtree (final_dir )
430+ LOG .debug ("Removing directory tree: %s" , final_dir )
431+ await loop .run_in_executor (None , shutil .rmtree , str (final_dir ))
384432
385- if os . path . exists (temp_dir ):
433+ if await _exists (temp_dir ):
386434 if self .config .download .rename_existing_folders :
387435 # Rename the old folder to the new name with title
388436 LOG .info ("Renaming folder from %s to %s for ad %s..." ,
389- os .path .basename (temp_dir ), os .path .basename (final_dir ), ad_id )
390- os .rename (temp_dir , final_dir )
437+ temp_dir .name , final_dir .name , ad_id )
438+ LOG .debug ("Renaming: %s -> %s" , temp_dir , final_dir )
439+ await loop .run_in_executor (None , temp_dir .rename , final_dir )
391440 else :
392441 # Use the existing folder without renaming
393442 final_dir = temp_dir
394443 LOG .info ("Using existing folder for ad %s at %s." , ad_id , final_dir )
395444 else :
396445 # Create new directory with title
397- os .mkdir (final_dir )
446+ LOG .debug ("Creating new directory: %s" , final_dir )
447+ await loop .run_in_executor (None , final_dir .mkdir )
398448 LOG .info ("New directory for ad created at %s." , final_dir )
399449
400450 # Now extract complete ad info (including images) to the final directory
401- ad_cfg = await self ._extract_ad_page_info (final_dir , ad_id )
451+ ad_cfg = await self ._extract_ad_page_info (str ( final_dir ) , ad_id )
402452
403453 return ad_cfg , final_dir
404454
0 commit comments