22# CLI USAGE:
33# cd path/to/pelican/output/dir
44# ./image_preview_thumbnailer.py path/to/page.html
5+ # pylint: disable=attribute-defined-outside-init,redefined-builtin,redefined-outer-name,use-dict-literal
56import logging , os , re , sys , warnings
67from glob import glob
78try :
@@ -145,10 +146,11 @@ def process_link(img_downloader, anchor_tag, url_match, config=PluginConfig()):
145146 if matching_filepaths : # => a thumbnail has already been generated
146147 fs_thumb_filepath = matching_filepaths [0 ]
147148 else :
148- LOGGER .info ("Thumbnail does not exist => downloading image from %s" , anchor_tag ['href' ])
149+ LOGGER .info ("Thumbnail does not exist for %s => downloading image from %s" , thumb_filename , anchor_tag ['href' ])
149150 tmp_thumb_filepath = img_downloader (url_match , config )
150151 if not tmp_thumb_filepath : # => means the downloader failed to retrieve the image in a "supported" case
151- with open (config .fs_thumbs_dir (thumb_filename + '.none' ), 'w' , encoding = 'utf8' ):
152+ hostname = urlparse (anchor_tag ['href' ]).netloc
153+ with open (config .fs_thumbs_dir (f'{ thumb_filename } .{ hostname } .none' ), 'w' , encoding = 'utf8' ):
152154 pass
153155 return
154156 img_ext = os .path .splitext (tmp_thumb_filepath )[1 ]
@@ -158,7 +160,7 @@ def process_link(img_downloader, anchor_tag, url_match, config=PluginConfig()):
158160 os .rename (tmp_thumb_filepath , fs_thumb_filepath )
159161 # Under Windows, I have sometimes seen a bit of delay for this operation to be performed,
160162 # which could trigger a FileNotFoundError on the line below, when calling getsize()
161- if not os .path .getsize (fs_thumb_filepath ): # .none file, meaning no thumbnail could be donwloaded
163+ if not os .path .getsize (fs_thumb_filepath ): # .none file, meaning no thumbnail could be downloaded
162164 return
163165 rel_thumb_filepath = fs_thumb_filepath .replace (config .output_path + '/' , '' ) if config .output_path else fs_thumb_filepath
164166 # Editing HTML on-the-fly to insert an <img> after the <a>:
@@ -331,8 +333,12 @@ def http_get(url, config=PluginConfig()):
331333 if response .status_code != 200 and config .silent_http_errors :
332334 LOGGER .error ('%s HTTP error when fetching %s' , response .status_code , url )
333335 return None
336+ if response .status_code != 200 :
337+ LOGGER .debug (response .text )
334338 if response .status_code != 200 and b'captcha' in response .content :
335339 LOGGER .warning ('CAPTCHA is likely to be required by page %s' , url )
340+ if response .status_code != 200 and b'CloudFront' in response .content :
341+ LOGGER .warning ('CloudFront is blocking request %s' , url )
336342 response .raise_for_status ()
337343 return response
338344
@@ -351,9 +357,8 @@ def register():
351357 signals .content_written .connect (process_all_links )
352358
353359
354- if __name__ == '__main__' :
355- html_filepath = sys .argv [1 ]
356- logging .basicConfig (format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" ,
360+ def main (html_filepath ):
361+ logging .basicConfig (format = "%(asctime)s [%(levelname)s] %(name)s (pid:%(process)s) %(message)s" ,
357362 datefmt = "%H:%M:%S" , level = logging .DEBUG )
358363 config = PluginConfig (dict (
359364 selector = 'article ul ul, h2:nth-of-type(3) + ul, h2:nth-of-type(4) + ul' ,
@@ -375,3 +380,6 @@ def register():
375380 # URL_MATCH = re.compile(...).match(URL)
376381 # print(pixabay_download_img(URL_MATCH))
377382 # process_link(pixabay_download_img, {'href': URL}, URL_MATCH)
383+
384+ if __name__ == '__main__' :
385+ main (sys .argv [1 ])
0 commit comments