33import sys
44import requests
55import time
6+ import random
67
# Define the directories and their language labels
DOCS_DIRS = {
    "de": os.path.join("docs", "de"),
    "en": os.path.join("docs", "en"),
}

# Path to disallowed domains file
DISALLOWED_DOMAINS_FILE = os.path.join(".github", "ci", "disallowed_image_domains.txt")

# Timeout for each external image request (in seconds)
HTTP_TIMEOUT = 3

# Retry settings: number of attempts per URL and pause between attempts (in seconds)
MAX_RETRIES = 3
RETRY_DELAY = 3

# Custom headers to simulate a normal web browser request (added more headers
# to bypass restrictions like Cloudflare)
SESSION_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.50 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9,de;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Cache-Control": "max-age=0",
    "X-Requested-With": "XMLHttpRequest",
    "TE": "Trailers",
    "Referer": "https://www.portfolio-performance.info/",
    "Origin": "https://www.portfolio-performance.info",
}

# Reusable session for all requests; the browser-like headers are attached once
# so every HEAD/GET issued through it carries them.
session = requests.Session()
session.headers.update(SESSION_HEADERS)
41+
42+
def load_disallowed_domains(file_path):
    """Load disallowed domains from a text file into a set.

    Each line is stripped of surrounding whitespace; blank lines and lines
    starting with ``#`` are ignored. Remaining entries are lower-cased.

    Args:
        file_path: Path of the domain list (one entry per line).

    Returns:
        A set of lower-cased entries. The set is empty when the file does
        not exist.
    """
    if not os.path.exists(file_path):
        return set()

    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM so a
    # first-line "# comment" is not mistaken for a domain entry.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        entries = (line.strip() for line in f)
        return {entry.lower() for entry in entries if entry and not entry.startswith("#")}
4853
54+
def is_disallowed_url(path, disallowed_domains):
    """Return True if the URL contains any disallowed domain.

    The check is a case-insensitive substring match of each entry against the
    whole URL string.

    Args:
        path: The image URL to test.
        disallowed_domains: Iterable of domain strings to look for.

    Returns:
        True when at least one entry occurs in the URL, otherwise False.
    """
    # Lower-case the URL once rather than once per domain.
    url = path.lower()
    # Entries are lower-cased too, so a caller-supplied "Imgur.com" still matches.
    return any(domain.lower() in url for domain in disallowed_domains)
5258
59+
def find_markdown_files(base_dirs):
    """Recursively find all markdown files in given directories.

    Args:
        base_dirs: Mapping of language label -> directory to scan.

    Returns:
        A list of ``(lang, file_path)`` tuples, one per ``.md`` file found.
        Directories that do not exist are reported with a warning and skipped.
        Within each base directory the files are listed in sorted order so
        repeated runs produce the same output.
    """
    md_files = []
    for lang, base_dir in base_dirs.items():
        if not os.path.exists(base_dir):
            print(
                f"⚠️ Warning: Directory '{base_dir}' [{lang}] does not exist. Skipping."
            )
            continue
        for root, dirs, files in os.walk(base_dir):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            for file_name in sorted(files):
                if file_name.endswith(".md"):
                    md_files.append((lang, os.path.join(root, file_name)))
    return md_files
6574
75+
# Matches inline markdown images: ![alt](target). Compiled once at import time.
_IMAGE_PATTERN = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
# Matches an optional trailing title inside the parentheses: path "title" / path 'title'.
_IMAGE_TITLE_SUFFIX = re.compile(r"""\s+(?:"[^"]*"|'[^']*')$""")


def extract_image_paths_with_line_numbers(md_file):
    """Extract all image references from a markdown file with their line numbers.

    Args:
        md_file: Path of the markdown file to read (UTF-8).

    Returns:
        A list of ``(line_number, path)`` tuples in document order. Line
        numbers start at 1. Surrounding whitespace and an optional quoted
        title (``![alt](path "title")``) are removed from the path.
    """
    image_paths = []

    with open(md_file, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            for target in _IMAGE_PATTERN.findall(line):
                # Without this, the title would be treated as part of the
                # path and the image reported as missing or unreachable.
                path = _IMAGE_TITLE_SUFFIX.sub("", target.strip())
                image_paths.append((line_num, path))
    return image_paths
7787
88+
def validate_external_image(path, checked_urls):
    """Validate an external image URL by checking availability and content type.

    A HEAD request is tried first; if its response does not advertise an
    ``image/*`` content type, the URL is fetched again with GET. Requests go
    through the module-level ``session`` and are attempted up to
    ``MAX_RETRIES`` times.

    Args:
        path: The image URL to check.
        checked_urls: Dict used as a cache of earlier results, keyed by URL.
            It is updated in place with the outcome of this check.

    Returns:
        ``None`` when the URL serves an image, otherwise a short error
        description (``"HTTP <code>"``, ``"unexpected content type (...)"``
        or ``"timeout"``).
    """
    if path in checked_urls:
        return checked_urls[path]

    for attempt in range(MAX_RETRIES):
        try:
            resp = session.head(path, timeout=HTTP_TIMEOUT, allow_redirects=True)

            if resp.status_code in (403, 405):
                print(
                    f"⚠️ Warning: Access denied (HTTP {resp.status_code}) while accessing {path}."
                )
                checked_urls[path] = f"HTTP {resp.status_code}"
                return f"HTTP {resp.status_code}"

            if "image/" not in resp.headers.get("Content-Type", "").lower():
                # Fallback to GET immediately if HEAD doesn't return an image content type
                resp = session.get(path, timeout=HTTP_TIMEOUT, allow_redirects=True)

            if resp.status_code == 429:
                print(
                    f"⚠️ Warning: Rate-limited (HTTP 429) while accessing {path} --> Retrying after {RETRY_DELAY}s..."
                )
                time.sleep(RETRY_DELAY)
                continue
            elif resp.status_code == 404:
                print(f"❌ Error: HTTP 404 (Not Found) while accessing {path}.")
                checked_urls[path] = "HTTP 404"
                return "HTTP 404"
            elif resp.status_code >= 400:
                print(f"⚠️ Error: HTTP {resp.status_code} while accessing {path}.")
                checked_urls[path] = f"HTTP {resp.status_code}"
                return f"HTTP {resp.status_code}"

            content_type = resp.headers.get("Content-Type", "").lower()
            if not content_type.startswith("image/"):
                checked_urls[path] = f"unexpected content type ({content_type})"
                return f"unexpected content type ({content_type})"

            # Cache success as None (falsy). The cached value is returned
            # verbatim on the next lookup, so a truthy marker such as "OK"
            # would be read as an error description on a repeated URL.
            checked_urls[path] = None
            return None

        except requests.exceptions.Timeout:
            print(
                f"⚠️ Timeout while accessing {path} --> Retrying ({attempt + 1}/{MAX_RETRIES})..."
            )
            time.sleep(RETRY_DELAY)
        except requests.exceptions.RequestException as e:
            print(
                f"⚠️ Error while accessing {path} --> {str(e)} --> Retrying ({attempt + 1}/{MAX_RETRIES})..."
            )
            time.sleep(RETRY_DELAY)

    # Every attempt ended in a retry (timeout, request error or rate limit).
    checked_urls[path] = "timeout"
    return "timeout"
146+
129147
130148def validate_images (md_files , disallowed_domains ):
131149 """Validate all image references in markdown files."""
@@ -136,38 +154,42 @@ def validate_images(md_files, disallowed_domains):
136154 image_paths = extract_image_paths_with_line_numbers (md_file )
137155 for line_num , path in image_paths :
138156 if path .startswith ("http://" ) or path .startswith ("https://" ):
139- # Checking external image
140157 print (f"🔵 Checking external image { path } " )
141158
142159 if is_disallowed_url (path , disallowed_domains ):
143- description = ' disallowed domain'
160+ description = " disallowed domain"
144161 issues .append ((lang , md_file , line_num , path , description ))
145162 print (f"❌ Disallowed domain detected { path } " )
146163 continue
147164
148- if path in checked_urls :
149- error_desc = checked_urls [path ]
150- else :
151- error_desc = validate_external_image (path , checked_urls )
165+ error_desc = validate_external_image (path , checked_urls )
152166
153167 if error_desc :
154- description = error_desc if error_desc != ' timeout' else ' timeout'
168+ description = error_desc if error_desc != " timeout" else " timeout"
155169 issues .append ((lang , md_file , line_num , path , description ))
156170
157171 else :
158- abs_path = os .path .normpath (os .path .join (os .path .dirname (md_file ), path ))
172+ abs_path = os .path .normpath (
173+ os .path .join (os .path .dirname (md_file ), path )
174+ )
159175 if not os .path .exists (abs_path ):
160- description = ' local image missing'
176+ description = " local image missing"
161177 issues .append ((lang , md_file , line_num , path , description ))
162178
163179 return issues
164180
181+
165182if __name__ == "__main__" :
166- print ("🔍 Scanning markdown files for image references in: " + ", " .join ([f"{ lang } ({ dir } )" for lang , dir in DOCS_DIRS .items ()]))
183+ print (
184+ "🔍 Scanning markdown files for image references in: "
185+ + ", " .join ([f"{ lang } ({ dir } )" for lang , dir in DOCS_DIRS .items ()])
186+ )
167187
168188 disallowed_domains = load_disallowed_domains (DISALLOWED_DOMAINS_FILE )
169189 if disallowed_domains :
170- print (f"ℹ️ Loaded { len (disallowed_domains )} disallowed domains from { DISALLOWED_DOMAINS_FILE } " )
190+ print (
191+ f"ℹ️ Loaded { len (disallowed_domains )} disallowed domains from { DISALLOWED_DOMAINS_FILE } "
192+ )
171193
172194 md_files = find_markdown_files (DOCS_DIRS )
173195 if not md_files :
@@ -181,7 +203,9 @@ def validate_images(md_files, disallowed_domains):
181203 if issues :
182204 print ("\n ❌ Image issues found:" )
183205 for lang , md_file , line_num , img_path , description in issues :
184- print (f" [{ lang } ] In file '{ md_file } ' at line { line_num } : Image '{ img_path } ' failed ({ description } )" )
206+ print (
207+ f" [{ lang } ] In file '{ md_file } ' at line { line_num } : Image '{ img_path } ' failed ({ description } )"
208+ )
185209 print (f"\n ❌ Validation failed: { len (issues )} issue(s) found.\n " )
186210 sys .exit (1 )
187211 else :
0 commit comments