Skip to content

Commit 506870f

Browse files
authored
Merge pull request #2558 from DerekMelchin/bug-1575-fix-403-warnings
Fix 403 link warnings
2 parents 85f8b0c + 71345e6 commit 506870f

2 files changed

Lines changed: 53 additions & 7 deletions

File tree

.github/workflows/external_url_check_workflow.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
python-version: "3.11"
4949

5050
- name: Install dependencies
51-
run: pip install aiohttp==3.11.14
51+
run: pip install aiohttp==3.11.14 curl_cffi==0.13.0
5252

5353
- name: Clone target repos
5454
env:

external_url_check.py

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,11 @@
2323
# - HTML anchors: <a href="url">
2424
#
2525
# Each link is checked for:
26-
# - HTTP errors: GET each URL, flag 4xx responses and soft-404 pages.
26+
# - HTTP errors: GET each URL, flag 4xx responses and soft-404 pages. A 401/403
27+
# is re-checked with a browser-impersonating client (curl_cffi) first, since
28+
# many sites bot-block plain HTTP clients on otherwise-valid links. If that
29+
# re-check reveals a real 404, it's escalated to a failing error (so a dead
30+
# link can't hide behind a 403); if it still can't verify, it stays a warning.
2731
# - Missing sections: a #fragment into the QuantConnect docs is validated against
2832
# the docs pages on disk (this script runs inside the Documentation repo),
2933
# because the HTTP check can't see the fragment - the server returns 200
@@ -54,6 +58,7 @@
5458
from pathlib import Path
5559

5660
import aiohttp
61+
from curl_cffi.requests import AsyncSession
5762

5863
from doc_anchors import build_file_index, check_deprecated_path, check_section_anchor
5964

@@ -163,11 +168,42 @@ def _is_external(url: str) -> bool:
163168
return url.startswith("http://") or url.startswith("https://")
164169

165170

171+
async def _browser_recheck(url: str) -> str:
    """Re-check a 401/403 with a browser-impersonating client (Chrome TLS fingerprint).

    Many sites (FINRA, Coinbase, Bitfinex, CoinAPI, ...) serve 401/403 to plain
    HTTP clients based on TLS/header fingerprinting even though the link is valid;
    curl_cffi mimics a real browser and gets through.

    Up to three attempts are made, backing off 2s and then 4s between them. No
    delay is added after the final attempt, so an unverifiable URL returns
    immediately once the retries are exhausted.

    Args:
        url: The URL that the plain HTTP client got a 401/403 for.

    Returns:
        - "ok"      : a real 2xx (or 429 - server is live, just rate-limiting)
        - "broken"  : a definitive 404/410 the block was hiding (a genuinely dead
                      link), or a 2xx whose final URL ends in "/404"
        - "blocked" : still can't verify (e.g. a hard WAF) - report but don't fail
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            async with AsyncSession() as s:
                r = await s.get(url, impersonate="chrome", timeout=30, allow_redirects=True)
            code = r.status_code
            if 200 <= code < 300:
                # A 2xx whose final (post-redirect) URL ends in /404 is a soft-404 page.
                return "broken" if str(r.url).rstrip("/").endswith("/404") else "ok"
            if code in (404, 410):
                return "broken"  # the block was hiding a genuinely dead link
            if code == 429:
                return "ok"  # rate-limited => server is live, link exists
            # 401/403/5xx: transient block or rate-limit - back off and retry
        except Exception:
            # Deliberately best-effort: any client/network failure is treated
            # like a transient block and retried; exhausting the retries
            # yields "blocked" rather than crashing the whole link check.
            pass
        if attempt < max_attempts - 1:
            # Back off only BETWEEN attempts (2s, then 4s); sleeping after the
            # last attempt would just delay the "blocked" verdict.
            await asyncio.sleep(2 * (attempt + 1))
    return "blocked"
197+
198+
166199
async def check_url(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore,
167200
url: str, files: list[Path], repo_dir: Path,
168201
findings: list[Finding], broken_pages: set[str]):
169202
"""Check a single external URL for errors. Records page-level breakage (404 /
170-
soft-404) in broken_pages so a redundant on-disk finding isn't also reported."""
203+
soft-404) in broken_pages so a redundant on-disk finding isn't also reported.
204+
205+
A 401/403 is re-checked with a browser-impersonating client before warning, so
206+
bot-blocked-but-valid links don't generate noise."""
171207
async with semaphore:
172208
try:
173209
async with session.get(
@@ -177,10 +213,20 @@ async def check_url(session: aiohttp.ClientSession, semaphore: asyncio.Semaphore
177213
match resp.status:
178214
case 400:
179215
findings.append(_finding("400", "400 Bad Request", url, files, repo_dir))
180-
case 401:
181-
findings.append(_finding("401", "401 Unauthorized", url, files, repo_dir))
182-
case 403:
183-
findings.append(_finding("403", "403 Forbidden", url, files, repo_dir))
216+
case 401 | 403:
217+
# Likely bot/WAF blocking - re-check with a browser-like client.
218+
verdict = await _browser_recheck(url)
219+
if verdict == "broken":
220+
# The block was hiding a genuinely dead link - fail on it.
221+
findings.append(_finding(
222+
"404", "404 Not found (confirmed via browser re-check)",
223+
url, files, repo_dir))
224+
broken_pages.add(url)
225+
elif verdict == "blocked":
226+
cat = str(resp.status)
227+
msg = f"{resp.status} {'Unauthorized' if resp.status == 401 else 'Forbidden'}"
228+
findings.append(_finding(cat, msg, url, files, repo_dir))
229+
# verdict == "ok" -> link is valid, no finding
184230
case 404:
185231
findings.append(_finding("404", "404 Not found", url, files, repo_dir))
186232
broken_pages.add(url)

0 commit comments

Comments
 (0)