@@ -49,34 +49,48 @@ def extract_urls(file_path):
4949 return urls
5050
def check_url(url):
    """Check whether a URL is reachable, retrying failures that may be transient.

    Sends a GET request with browser-like headers, up to ``max_retries``
    times (``max_retries`` is read from the enclosing module scope; it is not
    defined in this function).

    Args:
        url: The URL to probe.

    Returns:
        A human-readable status line:
        * "✅ ..." when the server answers 200, or 429 (rate-limited, which
          still proves the host is alive);
        * "❌ ..." when the server answers 404 or 410;
        * "⚠️ ..." when every attempt failed with another status or a
          request error; the message includes the last error seen.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/91.0.4472.124 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com",
    }

    last_error = ""
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=5, allow_redirects=True)
        except requests.exceptions.SSLError as e:
            # SSLError subclasses ConnectionError, so it must be caught first.
            last_error = f"{url} SSL Error: {e}"
        except requests.exceptions.Timeout:
            # ConnectTimeout subclasses both ConnectionError and Timeout;
            # catching Timeout first reports it as a timeout rather than as a
            # generic connection error.
            last_error = f"{url} Timeout"
        except requests.exceptions.ConnectionError as e:
            last_error = f"{url} Connection Error: {e}"
        except requests.exceptions.RequestException as e:
            last_error = f"{url} failed: {e}"
        else:
            status = response.status_code
            if status == 200:
                return f"✅ {url} is reachable (Attempt {attempt})"
            if status == 429:
                return f"✅ {url} is rate-limited (429), considering it reachable."
            if status in (404, 410):
                # Definitive "not there" answers: retrying will not help.
                return f"❌ {url} returned {status} (Attempt {attempt})"
            # Any other status (e.g. 403, 5xx) is retried.
            last_error = (
                f"{url} returned status {status} (Attempt {attempt}), "
                f"final URL: {response.url}"
            )

        # Back off with jitter between attempts, but not after the last one:
        # there is nothing left to wait for.
        if attempt < max_retries:
            time.sleep(random.uniform(1, 3))

    # 404/410 return from inside the loop, so anything reaching this point is
    # an inconclusive failure rather than a confirmed dead link.
    return (
        f"⚠️ {url} is potentially reachable but failed after "
        f"{max_retries} attempts. Last error: {last_error}"
    )
8094
8195def log_results (results ):
8296 """Log results to a file."""
0 commit comments