Skip to content

Commit baed613

Browse files
Merge pull request #718 from NFDI4BIOIMAGE/improve-url-validity
Improve url validity script
2 parents 00c6cf3 + 790ed9f commit baed613

3 files changed

Lines changed: 32 additions & 18 deletions

File tree

.github/workflows/auto-add-newsletter-links.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: Generate Newsletter Links
22

33
on:
44
schedule:
5-
- cron: '0 0 4 * *' # Runs at midnight (UTC) on the 4st day of every month
5+
- cron: '0 0 4 9,5,3 *' # Runs at midnight on the 4th of March, May and September
66
# Runs at 8:00 AM UTC on the first Monday of every 2nd month
77
#- cron: "0 8 * 1,3,5,7,9,11 1"
88

.github/workflows/auto-url-validity-check.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ name: URL Validity Check
22

33
on:
44
schedule:
5-
- cron: '0 0 4 * *' # Runs at midnight (UTC) on the 4st day of every month
5+
- cron: '0 0 4 9,5,3 *' # Runs at midnight (UTC) on the 4th of March, May and September
66
workflow_dispatch: # Allows manual trigger
77

88
jobs:
@@ -32,7 +32,7 @@ jobs:
3232
- name: Extract Failing URLs
3333
run: |
3434
echo "Please be so kind to double-check the validity of the following urls:" > failing_urls.md
35-
grep -E '❌' url_check_results.log >> failing_urls.md || echo "No issues found!" >> failing_urls.md
35+
grep -E '❌|⚠️' url_check_results.log >> failing_urls.md || echo "No issues found!" >> failing_urls.md
3636
3737
- name: Create GitHub Issue
3838
if: success()

scripts/auto-url-validity-check.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -49,34 +49,48 @@ def extract_urls(file_path):
4949
return urls
5050

5151
def check_url(url):
    """Check whether a URL is reachable, retrying on inconclusive failures.

    Sends a GET request with browser-like headers, following redirects, up to
    ``max_retries`` times (``max_retries`` is a module-level name defined
    outside this function).

    Args:
        url: The URL to probe.

    Returns:
        A one-line, human-readable status string whose leading emoji encodes
        the verdict:

        * ``✅`` - the server answered 200, or 429 (rate-limited, which still
          proves the resource exists).
        * ``❌`` - the server answered 404 or 410; the link is treated as
          definitively broken and is not retried.
        * ``⚠️`` - every attempt ended in some other status code or a request
          exception; the message carries the last error seen.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com"
    }

    last_error = ""
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=5, allow_redirects=True)
            status = response.status_code

            if status == 200:
                return f"✅ {url} is reachable (Attempt {attempt})"
            elif status == 429:
                return f"✅ {url} is rate-limited (429), considering it reachable."
            elif status in (404, 410):
                # Definitive "gone" answers: retrying cannot change the verdict.
                return f"❌ {url} returned {status} (Attempt {attempt})"
            else:
                # Any other status (e.g. 403, 5xx) is inconclusive: remember it
                # and try again.
                last_error = f"{url} returned status {status} (Attempt {attempt}), final URL: {response.url}"

        # SSLError is a subclass of ConnectionError in requests, so it must be
        # caught first to be reported separately.
        except requests.exceptions.SSLError as e:
            last_error = f"{url} SSL Error: {e}"
        except requests.exceptions.ConnectionError as e:
            last_error = f"{url} Connection Error: {e}"
        except requests.exceptions.Timeout:
            last_error = f"{url} Timeout"
        except requests.exceptions.RequestException as e:
            last_error = f"{url} failed: {e}"

        # Back off before the next attempt only; sleeping after the final
        # attempt would just delay the result.
        if attempt < max_retries:
            time.sleep(random.uniform(1, 3))

    # 404/410 return immediately above, so reaching this point always means
    # the failures were inconclusive; report them as a warning.
    return f"⚠️ {url} is potentially reachable but failed after {max_retries} attempts. Last error: {last_error}"
8094

8195
def log_results(results):
8296
"""Log results to a file."""

0 commit comments

Comments
 (0)