|
5 | 5 | import time |
6 | 6 | import datetime |
7 | 7 | import random |
| 8 | +import os |
8 | 9 |
|
9 | 10 | # File paths |
10 | 11 | yaml_file_path = "resources/nfdi4bioimage.yml" |
|
15 | 16 | #create filename |
16 | 17 | log_file = f'scripts/url_validity_check/{date}_url_check_results.log' |
17 | 18 |
|
| 19 | +# Ensure log directory exists |
| 20 | +os.makedirs(os.path.dirname(log_file), exist_ok=True) |
| 21 | + |
18 | 22 | # Max retries for failed requests |
19 | 23 | max_retries = 3 |
| 24 | +base_backoff = 2 |
20 | 25 |
|
21 | 26 | def main(): |
22 | 27 | """ |
@@ -49,48 +54,47 @@ def extract_urls(file_path): |
49 | 54 | return urls |
50 | 55 |
|
def check_url(url):
    """Check whether ``url`` is reachable and return a one-line status message.

    Sends a GET request with browser-like headers (the User-Agent is picked
    at random once per call) and follows redirects. Attempts that raise a
    ``requests`` exception or return an unclassified status code are retried
    up to ``max_retries`` times, waiting ``base_backoff ** attempt`` seconds
    plus up to one second of random jitter between attempts.

    Args:
        url: The URL to request.

    Returns:
        A human-readable message prefixed with a status marker:
        "✅" for HTTP 200, "❌" for HTTP 404/410, and "⚠️" for HTTP 429,
        HTTP 400/403, or when every attempt failed.
    """
    user_agents = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
        "Mozilla/5.0 (X11; Linux x86_64)",
    )

    headers = {
        "User-Agent": random.choice(user_agents),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "DNT": "1",  # Do Not Track
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Cache-Control": "no-cache",
    }

    last_error = ""
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        except requests.exceptions.RequestException as e:
            # repr keeps the exception class name, which str(e) alone can omit.
            last_error = f"{url} failed: {e!r}"
        else:
            status = response.status_code

            # Definitive answers are returned immediately, without retrying.
            if status == 200:
                return f"✅ {url} is reachable (Attempt {attempt})"
            if status == 429:
                return f"⚠️ {url} is rate-limited (429), considering it reachable."
            if status in (404, 410):
                return f"❌ {url} returned {status} (Attempt {attempt})"
            if status in (403, 400):
                return f"⚠️ {url} returned status {status}. It may be blocking bots. (Attempt {attempt})"

            # Any other status is treated as possibly transient and retried.
            last_error = f"{url} returned status {status} (Attempt {attempt}), final URL: {response.url}"

        # Back off only when another attempt follows; sleeping after the
        # final attempt would just delay the failure report.
        if attempt < max_retries:
            time.sleep(base_backoff ** attempt + random.uniform(0, 1))

    return f"⚠️ {url} is potentially reachable but failed after {max_retries} attempts. Last error: {last_error}"
94 | 98 |
|
95 | 99 | def log_results(results): |
96 | 100 | """Log results to a file.""" |
|
0 commit comments