2323# - HTML anchors: <a href="url">
2424#
2525# Each link is checked for:
26- # - HTTP errors: GET each URL, flag 4xx responses and soft-404 pages.
26+ # - HTTP errors: GET each URL, flag 4xx responses and soft-404 pages. A 401/403
27+ # is re-checked with a browser-impersonating client (curl_cffi) first, since
28+ # many sites bot-block plain HTTP clients on otherwise-valid links. If that
29+ # re-check reveals a real 404, it's escalated to a failing error (so a dead
30+ # link can't hide behind a 403); if it still can't verify, it stays a warning.
2731# - Missing sections: a #fragment into the QuantConnect docs is validated against
2832# the docs pages on disk (this script runs inside the Documentation repo),
2933# because the HTTP check can't see the fragment - the server returns 200
5458from pathlib import Path
5559
5660import aiohttp
61+ from curl_cffi .requests import AsyncSession
5762
5863from doc_anchors import build_file_index , check_deprecated_path , check_section_anchor
5964
@@ -163,11 +168,42 @@ def _is_external(url: str) -> bool:
163168 return url .startswith ("http://" ) or url .startswith ("https://" )
164169
165170
async def _browser_recheck(url: str) -> str:
    """Re-check a 401/403 with a browser-impersonating client (Chrome TLS fingerprint).

    Many sites (FINRA, Coinbase, Bitfinex, CoinAPI, ...) serve 401/403 to plain
    HTTP clients based on TLS/header fingerprinting even though the link is valid;
    curl_cffi mimics a real browser and gets through.

    Up to three attempts are made, backing off 2s and then 4s between them. No
    delay is added after the final attempt, so a hard-blocked URL returns as
    soon as the last request has failed.

    Args:
        url: The external URL that the plain HTTP client got a 401/403 for.

    Returns:
        - "ok"      : a real 2xx (or 429 - server is live, just rate-limiting)
        - "broken"  : a definitive 404/410 the block was hiding (a genuinely dead
                      link), or a 2xx whose final URL path ends in "/404"
        - "blocked" : still can't verify (e.g. a hard WAF) - report but don't fail
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            async with AsyncSession() as s:
                r = await s.get(url, impersonate="chrome", timeout=30, allow_redirects=True)
                code = r.status_code
                if 200 <= code < 300:
                    # A 2xx whose final (post-redirect) URL ends in "/404" is a
                    # soft-404 page, not a live target.
                    return "broken" if str(r.url).rstrip("/").endswith("/404") else "ok"
                if code in (404, 410):
                    return "broken"  # the block was hiding a genuinely dead link
                if code == 429:
                    return "ok"  # rate-limited => server is live, link exists
                # 401/403/5xx: transient block or rate-limit - back off and retry
        except Exception:
            # Deliberate best-effort: any client/network failure (timeout, TLS,
            # DNS, ...) is treated like a transient block and retried; if every
            # attempt fails the caller gets "blocked" (a warning, not a failure).
            pass
        # Back off only *between* attempts (2s, 4s); sleeping after the last
        # attempt would just delay the "blocked" verdict for no benefit.
        if attempt < max_attempts - 1:
            await asyncio.sleep(2 * (attempt + 1))
    return "blocked"
197+
198+
166199async def check_url (session : aiohttp .ClientSession , semaphore : asyncio .Semaphore ,
167200 url : str , files : list [Path ], repo_dir : Path ,
168201 findings : list [Finding ], broken_pages : set [str ]):
169202 """Check a single external URL for errors. Records page-level breakage (404 /
170- soft-404) in broken_pages so a redundant on-disk finding isn't also reported."""
203+ soft-404) in broken_pages so a redundant on-disk finding isn't also reported.
204+
205+ A 401/403 is re-checked with a browser-impersonating client before warning, so
206+ bot-blocked-but-valid links don't generate noise."""
171207 async with semaphore :
172208 try :
173209 async with session .get (
@@ -177,10 +213,20 @@ async def check_url(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore
177213 match resp .status :
178214 case 400 :
179215 findings .append (_finding ("400" , "400 Bad Request" , url , files , repo_dir ))
180- case 401 :
181- findings .append (_finding ("401" , "401 Unauthorized" , url , files , repo_dir ))
182- case 403 :
183- findings .append (_finding ("403" , "403 Forbidden" , url , files , repo_dir ))
216+ case 401 | 403 :
217+ # Likely bot/WAF blocking - re-check with a browser-like client.
218+ verdict = await _browser_recheck (url )
219+ if verdict == "broken" :
220+ # The block was hiding a genuinely dead link - fail on it.
221+ findings .append (_finding (
222+ "404" , "404 Not found (confirmed via browser re-check)" ,
223+ url , files , repo_dir ))
224+ broken_pages .add (url )
225+ elif verdict == "blocked" :
226+ cat = str (resp .status )
227+ msg = f"{ resp .status } { 'Unauthorized' if resp .status == 401 else 'Forbidden' } "
228+ findings .append (_finding (cat , msg , url , files , repo_dir ))
229+ # verdict == "ok" -> link is valid, no finding
184230 case 404 :
185231 findings .append (_finding ("404" , "404 Not found" , url , files , repo_dir ))
186232 broken_pages .add (url )
0 commit comments