Skip to content

Commit 064eaba

Browse files
committed
Optimierung
Update validate_internal_external_links.py
1 parent 267495a commit 064eaba

5 files changed

Lines changed: 375 additions & 270 deletions

File tree

.github/ci/ignore_urls.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Known problematic hosts (they block bots, rate-limit requests, or are unreliable)
22

3-
https://investor.apple.com/dividend-history/default.aspx
4-
https://www.investing.com/equities/amazon-com-inc
5-
https://group.mercedes-benz.com/documents/investors/annual-meeting/daimler-ir-egm-2021-spinoffhivedownreport.pdf
6-
https://www.coingecko.com/
3+
investor.apple.com
4+
www.investing.com
5+
group.mercedes-benz.com
6+
www.coingecko.com

.github/scripts/validate_images.py

Lines changed: 88 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -3,82 +3,91 @@
33
import sys
44
import requests
55
import time
6+
import random
67

78
# Documentation roots to scan, keyed by their language label
DOCS_DIRS = {
    "de": os.path.join("docs", "de"),
    "en": os.path.join("docs", "en"),
}

# Path to disallowed domains file
DISALLOWED_DOMAINS_FILE = os.path.join(".github", "ci", "disallowed_image_domains.txt")

# Timeout and retry settings for the external image check
HTTP_TIMEOUT = 3  # seconds, passed as `timeout=` to every HEAD/GET request
MAX_RETRIES = 3  # attempts per URL before giving up
RETRY_DELAY = 3  # seconds slept (time.sleep) before the next attempt

# Custom headers to simulate a normal web browser request
# (more headers were added to get past restrictions such as Cloudflare)
SESSION_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.50 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-US,en;q=0.9,de;q=0.8",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
    "Cache-Control": "max-age=0",
    "X-Requested-With": "XMLHttpRequest",
    "TE": "Trailers",
    "Referer": "https://www.portfolio-performance.info/",
    "Origin": "https://www.portfolio-performance.info",
}

# Reusable session for all requests, created once at import time
session = requests.Session()
session.headers.update(SESSION_HEADERS)
41+
42+
3843
def load_disallowed_domains(file_path):
    """Load disallowed domains from a text file into a set.

    Blank lines and lines starting with ``#`` are ignored; every other line
    is stripped and lower-cased.

    Args:
        file_path: Path of the text file, one entry per line.

    Returns:
        A set of lower-cased entries; an empty set when the file does not
        exist.
    """
    try:
        # "utf-8-sig" also reads plain UTF-8 but drops a leading BOM, which
        # would otherwise stop a first-line "#" comment from being recognised.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            stripped = (line.strip() for line in f)
            return {
                entry.lower()
                for entry in stripped
                if entry and not entry.startswith("#")
            }
    except FileNotFoundError:
        # A missing list simply means nothing is disallowed.
        return set()
4853

54+
4955
def is_disallowed_url(path, disallowed_domains):
    """Return True if the URL is served from any disallowed domain.

    A plain host entry such as ``imgur.com`` matches the URL's host exactly or
    as a parent domain (``i.imgur.com``). Matching on the parsed host avoids
    the false positives of a raw substring test, e.g. ``t.co`` inside
    ``microsoft.com`` or a domain that only appears in a query string.

    Entries that carry a scheme, path or port (they contain ``/`` or ``:``),
    and URLs whose host cannot be parsed, fall back to a case-insensitive
    substring test.

    Args:
        path: The image URL to check.
        disallowed_domains: Iterable of disallowed entries.

    Returns:
        True when at least one entry matches, otherwise False.
    """
    from urllib.parse import urlsplit

    url = path.lower()
    try:
        host = (urlsplit(url).hostname or "").rstrip(".")
    except ValueError:
        # Malformed authority part (e.g. a broken IPv6 literal).
        host = ""

    for domain in disallowed_domains:
        entry = domain.strip().lower()
        if not entry:
            continue
        if not host or "/" in entry or ":" in entry:
            # No usable host, or the entry is more than a bare host name.
            if entry in url:
                return True
            continue
        bare = entry.strip(".")
        if bare and (host == bare or host.endswith("." + bare)):
            return True
    return False
5258

59+
5360
def find_markdown_files(base_dirs):
    """Recursively find all markdown files in the given directories.

    Args:
        base_dirs: Mapping of language label to base directory.

    Returns:
        A list of ``(lang, file_path)`` tuples for every ``*.md`` file below
        each existing base directory. Directories and files are visited in
        sorted order so the result does not depend on filesystem ordering.
        Base directories that do not exist are reported and skipped.
    """
    md_files = []
    for lang, base_dir in base_dirs.items():
        if not os.path.exists(base_dir):
            print(
                f"⚠️ Warning: Directory '{base_dir}' [{lang}] does not exist. Skipping."
            )
            continue
        for root, dirs, files in os.walk(base_dir):
            # Sorting in place makes os.walk descend in a stable order.
            dirs.sort()
            md_files.extend(
                (lang, os.path.join(root, name))
                for name in sorted(files)
                if name.endswith(".md")
            )
    return md_files
6574

75+
6676
def extract_image_paths_with_line_numbers(md_file):
    """Extract all image references from a markdown file with their line numbers.

    Only inline images of the form ``![alt](target)`` on a single line are
    recognised. An optional trailing title (``"..."`` or ``'...'``) and
    surrounding angle brackets are removed so that only the image target is
    returned.

    Args:
        md_file: Path of the markdown file (read as UTF-8).

    Returns:
        A list of ``(line_number, target)`` tuples in document order; line
        numbers start at 1.
    """
    image_pattern = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
    # A target followed by whitespace and a quoted title, e.g.  x.png "Title"
    title_pattern = re.compile(r"""^(.*?)\s+(?:"[^"]*"|'[^']*')$""")

    image_paths = []
    with open(md_file, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            for target in image_pattern.findall(line):
                target = target.strip()
                titled = title_pattern.match(target)
                if titled:
                    target = titled.group(1).strip()
                # Markdown allows <...> around targets that contain spaces.
                if len(target) > 1 and target.startswith("<") and target.endswith(">"):
                    target = target[1:-1]
                image_paths.append((line_num, target))
    return image_paths
7787

88+
7889
def validate_external_image(path, checked_urls):
    """Validate an external image URL by checking availability and content type.

    The URL is probed with HEAD first; when the HEAD response does not
    announce an ``image/*`` content type, the URL is requested again with GET.
    Rate-limit responses (HTTP 429), timeouts and other request errors are
    retried up to ``MAX_RETRIES`` times with ``RETRY_DELAY`` seconds between
    attempts.

    Args:
        path: The http(s) URL of the image.
        checked_urls: Dict used as a cache, mapping URL to the result of an
            earlier check. It is updated in place.

    Returns:
        ``None`` when the URL serves an image, otherwise a short description:
        ``"HTTP <code>"``, ``"unexpected content type (<type>)"``, or
        ``"timeout"`` when every attempt was used up.
    """
    if path in checked_urls:
        return checked_urls[path]

    def remember(result):
        """Cache the outcome for this URL and hand it back."""
        checked_urls[path] = result
        return result

    for attempt in range(MAX_RETRIES):
        try:
            resp = session.head(path, timeout=HTTP_TIMEOUT, allow_redirects=True)

            # A HEAD request answered with 403/405 is reported right away.
            if resp.status_code in (403, 405):
                print(
                    f"⚠️ Warning: Access denied (HTTP {resp.status_code}) while accessing {path}."
                )
                return remember(f"HTTP {resp.status_code}")

            if "image/" not in resp.headers.get("Content-Type", "").lower():
                # HEAD did not announce an image content type: retry with GET
                resp = session.get(path, timeout=HTTP_TIMEOUT, allow_redirects=True)

            if resp.status_code == 429:
                print(
                    f"⚠️ Warning: Rate-limited (HTTP 429) while accessing {path} --> Retrying after {RETRY_DELAY}s..."
                )
                time.sleep(RETRY_DELAY)
                continue
            if resp.status_code == 404:
                print(f"❌ Error: HTTP 404 (Not Found) while accessing {path}.")
                return remember("HTTP 404")
            if resp.status_code >= 400:
                print(f"⚠️ Error: HTTP {resp.status_code} while accessing {path}.")
                return remember(f"HTTP {resp.status_code}")

            content_type = resp.headers.get("Content-Type", "").lower()
            if not content_type.startswith("image/"):
                return remember(f"unexpected content type ({content_type})")

            # Success must be cached as None: the cache-hit path above returns
            # the stored value, and a truthy marker such as "OK" would make
            # every repeated reference to a valid image look like a failure.
            return remember(None)

        except requests.exceptions.Timeout:
            print(
                f"⚠️ Timeout while accessing {path} --> Retrying ({attempt + 1}/{MAX_RETRIES})..."
            )
            time.sleep(RETRY_DELAY)
        except requests.exceptions.RequestException as e:
            print(
                f"⚠️ Error while accessing {path} --> {str(e)} --> Retrying ({attempt + 1}/{MAX_RETRIES})..."
            )
            time.sleep(RETRY_DELAY)

    # All attempts were consumed by rate limits, timeouts or request errors.
    return remember("timeout")
146+
129147

130148
def validate_images(md_files, disallowed_domains):
131149
"""Validate all image references in markdown files."""
@@ -136,38 +154,42 @@ def validate_images(md_files, disallowed_domains):
136154
image_paths = extract_image_paths_with_line_numbers(md_file)
137155
for line_num, path in image_paths:
138156
if path.startswith("http://") or path.startswith("https://"):
139-
# Checking external image
140157
print(f"🔵 Checking external image {path}")
141158

142159
if is_disallowed_url(path, disallowed_domains):
143-
description = 'disallowed domain'
160+
description = "disallowed domain"
144161
issues.append((lang, md_file, line_num, path, description))
145162
print(f"❌ Disallowed domain detected {path}")
146163
continue
147164

148-
if path in checked_urls:
149-
error_desc = checked_urls[path]
150-
else:
151-
error_desc = validate_external_image(path, checked_urls)
165+
error_desc = validate_external_image(path, checked_urls)
152166

153167
if error_desc:
154-
description = error_desc if error_desc != 'timeout' else 'timeout'
168+
description = error_desc if error_desc != "timeout" else "timeout"
155169
issues.append((lang, md_file, line_num, path, description))
156170

157171
else:
158-
abs_path = os.path.normpath(os.path.join(os.path.dirname(md_file), path))
172+
abs_path = os.path.normpath(
173+
os.path.join(os.path.dirname(md_file), path)
174+
)
159175
if not os.path.exists(abs_path):
160-
description = 'local image missing'
176+
description = "local image missing"
161177
issues.append((lang, md_file, line_num, path, description))
162178

163179
return issues
164180

181+
165182
if __name__ == "__main__":
166-
print("🔍 Scanning markdown files for image references in: " + ", ".join([f"{lang} ({dir})" for lang, dir in DOCS_DIRS.items()]))
183+
print(
184+
"🔍 Scanning markdown files for image references in: "
185+
+ ", ".join([f"{lang} ({dir})" for lang, dir in DOCS_DIRS.items()])
186+
)
167187

168188
disallowed_domains = load_disallowed_domains(DISALLOWED_DOMAINS_FILE)
169189
if disallowed_domains:
170-
print(f"ℹ️ Loaded {len(disallowed_domains)} disallowed domains from {DISALLOWED_DOMAINS_FILE}")
190+
print(
191+
f"ℹ️ Loaded {len(disallowed_domains)} disallowed domains from {DISALLOWED_DOMAINS_FILE}"
192+
)
171193

172194
md_files = find_markdown_files(DOCS_DIRS)
173195
if not md_files:
@@ -181,7 +203,9 @@ def validate_images(md_files, disallowed_domains):
181203
if issues:
182204
print("\n❌ Image issues found:")
183205
for lang, md_file, line_num, img_path, description in issues:
184-
print(f" [{lang}] In file '{md_file}' at line {line_num}: Image '{img_path}' failed ({description})")
206+
print(
207+
f" [{lang}] In file '{md_file}' at line {line_num}: Image '{img_path}' failed ({description})"
208+
)
185209
print(f"\n❌ Validation failed: {len(issues)} issue(s) found.\n")
186210
sys.exit(1)
187211
else:

0 commit comments

Comments
 (0)