Skip to content

Commit 05bd251

Browse files
Merge pull request #746 from NFDI4BIOIMAGE/url-validity-again
Url validity again
2 parents ced1770 + d7f2e45 commit 05bd251

2 files changed

Lines changed: 28 additions & 24 deletions

File tree

.github/workflows/auto-url-validity-check.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@ name: URL Validity Check
22

33
on:
44
schedule:
5-
- cron: '0 0 4 9,5,3 *' # Runs at midnight on the 4th of March and September
5+
- cron: '0 0 4 6,12 *' # Runs at midnight on the 4th of June and December
66
workflow_dispatch: # Allows manual trigger
77

88
jobs:
@@ -32,7 +32,7 @@ jobs:
3232
- name: Extract Failing URLs
3333
run: |
3434
echo "Please be so kind to double-check the validity of the following urls:" > failing_urls.md
35-
grep -E '❌|⚠️' url_check_results.log >> failing_urls.md || echo "No issues found!" >> failing_urls.md
35+
grep -E '❌' url_check_results.log >> failing_urls.md || echo "No issues found!" >> failing_urls.md
3636
3737
- name: Create GitHub Issue
3838
if: success()

scripts/auto-url-validity-check.py

Lines changed: 26 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import time
66
import datetime
77
import random
8+
import os
89

910
# File paths
1011
yaml_file_path = "resources/nfdi4bioimage.yml"
@@ -15,8 +16,12 @@
1516
#create filename
1617
log_file = f'scripts/url_validity_check/{date}_url_check_results.log'
1718

19+
# Ensure log directory exists
20+
os.makedirs(os.path.dirname(log_file), exist_ok=True)
21+
1822
# Max retries for failed requests
1923
max_retries = 3
24+
base_backoff = 2
2025

2126
def main():
2227
"""
@@ -49,48 +54,47 @@ def extract_urls(file_path):
4954
return urls
5055

5156
def check_url(url):
57+
user_agents = [
58+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
59+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
60+
"Mozilla/5.0 (X11; Linux x86_64)",
61+
]
62+
5263
headers = {
53-
"User-Agent": (
54-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
55-
"AppleWebKit/537.36 (KHTML, like Gecko) "
56-
"Chrome/91.0.4472.124 Safari/537.36"
57-
),
64+
"User-Agent": random.choice(user_agents),
5865
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
59-
"Accept-Language": "en-US,en;q=0.5",
60-
"Referer": "https://www.google.com"
66+
"Accept-Language": "en-US,en;q=0.9",
67+
"Referer": "https://www.google.com/",
68+
"DNT": "1", # Do Not Track
69+
"Connection": "keep-alive",
70+
"Upgrade-Insecure-Requests": "1",
71+
"Cache-Control": "no-cache"
6172
}
6273

6374
last_error = ""
6475
for attempt in range(1, max_retries + 1):
6576
try:
66-
response = requests.get(url, headers=headers, timeout=5, allow_redirects=True)
77+
response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
6778
status = response.status_code
6879

6980
if status == 200:
7081
return f"✅ {url} is reachable (Attempt {attempt})"
7182
elif status == 429:
72-
return f" {url} is rate-limited (429), considering it reachable."
83+
return f"⚠️ {url} is rate-limited (429), considering it reachable."
7384
elif status in (404, 410):
7485
return f"❌ {url} returned {status} (Attempt {attempt})"
86+
elif status in (403, 400):
87+
return f"⚠️ {url} returned status {status}. It may be blocking bots. (Attempt {attempt})"
7588
else:
7689
last_error = f"{url} returned status {status} (Attempt {attempt}), final URL: {response.url}"
7790

78-
except requests.exceptions.SSLError as e:
79-
last_error = f"{url} SSL Error: {e}"
80-
except requests.exceptions.ConnectionError as e:
81-
last_error = f"{url} Connection Error: {e}"
82-
except requests.exceptions.Timeout:
83-
last_error = f"{url} Timeout"
8491
except requests.exceptions.RequestException as e:
85-
last_error = f"{url} failed: {e}"
92+
last_error = f"{url} failed: {repr(e)}"
8693

87-
time.sleep(random.uniform(1, 3))
94+
sleep_time = base_backoff ** attempt + random.uniform(0, 1)
95+
time.sleep(sleep_time)
8896

89-
# Final classification
90-
if any(code in last_error for code in ["returned 404", "returned 410"]):
91-
return f"❌ {last_error}"
92-
else:
93-
return f"⚠️ {url} is potentially reachable but failed after {max_retries} attempts. Last error: {last_error}"
97+
return f"⚠️ {url} is potentially reachable but failed after {max_retries} attempts. Last error: {last_error}"
9498

9599
def log_results(results):
96100
"""Log results to a file."""

0 commit comments

Comments
 (0)