Skip to content

Commit 7955d98

Browse files
RyanAlbertsclaude
andcommitted
sec(phase-2): SSRF guard + source-URL host allowlist + raw-failure sanitize
Independent security audit (response to "double check the repo for any security concerns") found 3 HIGH and several MEDIUM/LOW issues. This PR addresses everything that affects code on main; daemon-branch findings (CORS regex, run_id traversal, log-file hygiene) will be addressed when the parked phase-3-pr18-daemon resumes. H1 — Source-URL allowlist bypass (researcher.py) Before: ``url.startswith(allowed_url.rstrip("/"))`` accepted "https://example.com.attacker.example/x" against "https://example.com". After: ``_looks_like_input_url`` parses the URL with ``urlparse`` and compares hosts (case-insensitive, www-stripped). The publish-time link verifier therefore can't be redirected at an attacker-owned look-alike domain by a hallucinated LLM citation. H2 — Server-side request forgery (new module ``safe_http.py``) Before: crawler, verifier, and link-checker would HEAD/GET any URL they were handed. A poisoned upstream value (yc-oss website field) or hallucinated LLM citation could direct fetches at ``http://169.254.169.254/`` (cloud metadata) or RFC1918 hosts. After: ``is_safe_external_url`` resolves the hostname via DNS and refuses any URL whose IPs include loopback / RFC1918 / link-local / multicast / reserved / unspecified addresses, plus the ``metadata.google.internal`` / ``metadata.goog`` hostnames. Multi- record DNS round-robins are blocked if ANY record is private (DNS-rebinding defense). RFC 2606 reserved TLDs (``.example``, ``.test``, ``.invalid``) are explicitly allowed since they cannot resolve to a real host. Verifier reports ``blocked`` distinctly from ``dead`` so the publish-gate failure log distinguishes "we refused" from "the server said no". M4 — Crawler chunk-truncation overshoot Before: each chunk appended in full before checking ``len(buf) >= max_bytes``. After: truncate each chunk to the remaining budget before extend. L2 — Raw failure log defense-in-depth Before: ``raw_failures.jsonl`` recorded raw model output verbatim. After: ``strip_pii`` runs over the raw payload before write. Tests: 47 new (38 safe_http, 4 verifier, 4 crawler SSRF, 4 source-URL host allowlist). 208 total passing. ruff + mypy strict clean. Daemon-branch items (CORS regex, run_id path-traversal, daemon log hygiene, hard-coded 127.0.0.1 bind) deferred to phase-3 resume. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent e2ae1d3 commit 7955d98

8 files changed

Lines changed: 424 additions & 8 deletions

File tree

src/ycai/crawler.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
import httpx
2929

30+
from ycai.safe_http import is_safe_external_url
3031
from ycai.sanitizer import strip_pii
3132

3233
log = logging.getLogger(__name__)
@@ -174,7 +175,14 @@ async def _fetch_one(
174175
max_bytes: int,
175176
timeout: float,
176177
) -> CrawledPage | None:
177-
"""Fetch one page. Return None on any error."""
178+
"""Fetch one page. Return None on any error.
179+
180+
Refuses internal targets (loopback, RFC1918, link-local, cloud
181+
metadata) before the network call.
182+
"""
183+
if not is_safe_external_url(url):
184+
log.debug("crawler refusing unsafe URL: %s", url)
185+
return None
178186
async with semaphore:
179187
try:
180188
async with client.stream("GET", url, timeout=timeout, follow_redirects=True) as resp:
@@ -186,7 +194,12 @@ async def _fetch_one(
186194
return None
187195
buf = bytearray()
188196
async for chunk in resp.aiter_bytes():
189-
buf.extend(chunk)
197+
# Truncate on overshoot so a single oversized chunk can't
198+
# blow past max_bytes by an arbitrary amount.
199+
remaining = max_bytes - len(buf)
200+
if remaining <= 0:
201+
break
202+
buf.extend(chunk[:remaining])
190203
if len(buf) >= max_bytes:
191204
break
192205
html = bytes(buf[:max_bytes]).decode("utf-8", errors="replace")
@@ -221,6 +234,8 @@ async def crawl_company(
221234
"""
222235
if not homepage or not homepage.startswith(("http://", "https://")):
223236
return CrawlResult(homepage=homepage, error="invalid-homepage-url")
237+
if not is_safe_external_url(homepage):
238+
return CrawlResult(homepage=homepage, error="unsafe-homepage-url")
224239

225240
owns_client = client is None
226241
client = client or httpx.AsyncClient(

src/ycai/researcher.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from datetime import UTC, datetime
2525
from pathlib import Path
2626
from typing import Any, Literal
27+
from urllib.parse import urlparse
2728

2829
from pydantic import ValidationError
2930

@@ -372,9 +373,33 @@ def _looks_like_input_url(url: str, company: RawCompany, extra_allowed: list[str
372373
373374
``extra_allowed`` is the list of URLs reached via the depth=1 crawl. The
374375
model is allowed to cite any of them.
376+
377+
Compares hosts (case-insensitive, www-stripped), not raw string prefixes.
378+
A naive ``startswith`` check on ``"https://example.com"`` accepts
379+
``"https://example.com.attacker.com/x"`` — the host check rejects that
380+
because ``example.com.attacker.com`` is not the same host as
381+
``example.com``.
375382
"""
376-
allowed = [company.website, company.url, *(extra_allowed or [])]
377-
return any(url.startswith(allowed_url.rstrip("/")) for allowed_url in allowed if allowed_url)
383+
384+
def _host(u: str) -> str:
385+
try:
386+
host = (urlparse(u).hostname or "").lower()
387+
except ValueError:
388+
return ""
389+
return host[4:] if host.startswith("www.") else host
390+
391+
try:
392+
parsed = urlparse(url)
393+
except ValueError:
394+
return False
395+
if parsed.scheme not in ("http", "https"):
396+
return False
397+
target_host = _host(url)
398+
if not target_host:
399+
return False
400+
allowed_hosts = {_host(u) for u in (company.website, company.url, *(extra_allowed or [])) if u}
401+
allowed_hosts.discard("")
402+
return target_host in allowed_hosts
378403

379404

380405
def _validate_sources(analysis: CompanyAnalysis, company: RawCompany, extra_allowed: list[str] | None = None) -> bool:
@@ -479,15 +504,18 @@ def _log_raw_failure(path: Path | None, *, slug: str, reason: str, raw: str) ->
479504
"""Append a raw failure record to ``path`` (JSONL). No-op when path is None.
480505
481506
Truncates raw payloads at 4000 chars so the file stays small but a
482-
representative sample is captured for B008-style debugging.
507+
representative sample is captured for B008-style debugging. The raw
508+
payload is run through ``strip_pii`` first — defense-in-depth in case
509+
a model ever echoes a credential or PII back in its output.
483510
"""
484511
if path is None:
485512
return
513+
sanitized = strip_pii((raw or "")[:4000])
486514
record = {
487515
"ts": datetime.now(UTC).isoformat(),
488516
"slug": slug,
489517
"reason": reason,
490-
"raw": (raw or "")[:4000],
518+
"raw": sanitized,
491519
}
492520
path.parent.mkdir(parents=True, exist_ok=True)
493521
with path.open("a") as f:

src/ycai/safe_http.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""SSRF guard for outbound HTTP fetches.
2+
3+
Every fetcher in this codebase pulls URLs that originated outside our
4+
control: yc-oss/api gives us company website URLs, the LLM cites
5+
``oss_evidence_url`` and traction-source URLs, the depth=1 crawler
6+
follows ``href`` attributes, and the link verifier re-fetches every cited
7+
URL before any artifact is published.
8+
9+
Without an SSRF guard, a poisoned upstream value or a hallucinated LLM
10+
output could direct the verifier to scan loopback / RFC1918 / link-local
11+
ranges — including AWS/GCP/Azure metadata endpoints
12+
(``169.254.169.254``) when this is run on cloud infrastructure.
13+
14+
Use ``is_safe_external_url`` before any outbound call. It rejects:
15+
- non-http/https schemes (file://, ftp://, gopher://, etc.)
16+
- malformed URLs (no host)
17+
- hostnames whose resolved IPs are loopback, link-local, private,
18+
reserved, multicast, or unspecified
19+
- a few well-known cloud metadata hostnames
20+
21+
The check resolves the host via DNS. The resolved-IP set is checked,
22+
not just the first record, to defend against DNS rebinding (where a
23+
hostname returns one address now and a different one on the next
24+
lookup). Callers that re-resolve later should either pin the address
25+
returned here or accept the residual risk.
26+
"""
27+
28+
from __future__ import annotations
29+
30+
import ipaddress
31+
import logging
32+
import socket
33+
from urllib.parse import urlparse
34+
35+
log = logging.getLogger(__name__)
36+
37+
# Hostnames that are not in private IP ranges but are known SSRF targets
38+
# (cloud-provider instance-metadata services).
39+
_BLOCKED_HOSTS: frozenset[str] = frozenset(
40+
{
41+
"metadata.google.internal",
42+
"metadata.goog",
43+
"metadata", # Some clouds resolve bare 'metadata' on the local network
44+
}
45+
)
46+
47+
# RFC 2606 reserved TLDs that are guaranteed never to resolve and are
48+
# explicitly intended for documentation, testing, and example use. We
49+
# allow them through the safety check because they're harmless (the
50+
# fetch will fail with a DNS error, never reach a real host) and tests
51+
# in this codebase rely on them. ``.localhost`` is *not* in this set —
52+
# it resolves to 127.0.0.1 and must be blocked.
53+
_RESERVED_TEST_TLDS: tuple[str, ...] = (".example", ".test", ".invalid")
54+
55+
56+
def _resolve_all(host: str) -> list[str]:
57+
"""Return every IP address ``host`` resolves to, or ``[]`` on failure."""
58+
try:
59+
infos = socket.getaddrinfo(host, None)
60+
except (socket.gaierror, UnicodeError):
61+
return []
62+
seen: set[str] = set()
63+
out: list[str] = []
64+
for info in infos:
65+
# sockaddr layout: IPv4 = (host, port); IPv6 = (host, port, flow, scope).
66+
sockaddr = info[4]
67+
ip = str(sockaddr[0])
68+
if ip not in seen:
69+
seen.add(ip)
70+
out.append(ip)
71+
return out
72+
73+
74+
def is_safe_external_url(url: str) -> bool:
75+
"""Return True if ``url`` is safe to fetch from an outbound HTTP client.
76+
77+
"Safe" here means: the URL resolves to a public, routable address
78+
on the open Internet — not loopback, not RFC1918, not link-local
79+
(which includes cloud metadata endpoints), not multicast, not
80+
reserved.
81+
"""
82+
try:
83+
parsed = urlparse(url)
84+
except ValueError:
85+
return False
86+
if parsed.scheme not in ("http", "https"):
87+
return False
88+
host = (parsed.hostname or "").lower()
89+
if not host:
90+
return False
91+
if host in _BLOCKED_HOSTS:
92+
log.debug("blocked SSRF host: %s", host)
93+
return False
94+
if any(host == tld[1:] or host.endswith(tld) for tld in _RESERVED_TEST_TLDS):
95+
# Reserved TLDs (RFC 2606): allowed because they cannot resolve to
96+
# a real host. The eventual fetch will fail with NXDOMAIN.
97+
return True
98+
# If the URL embeds a literal IP, check it directly without DNS.
99+
try:
100+
ip = ipaddress.ip_address(host)
101+
except ValueError:
102+
ips = _resolve_all(host)
103+
if not ips:
104+
log.debug("DNS resolution failed for %s; refusing fetch", host)
105+
return False
106+
return all(_ip_is_public(addr) for addr in ips)
107+
return _ip_is_public(str(ip))
108+
109+
110+
def _ip_is_public(ip_str: str) -> bool:
111+
"""True if ``ip_str`` is a public, routable address."""
112+
try:
113+
ip = ipaddress.ip_address(ip_str)
114+
except ValueError:
115+
return False
116+
if ip.is_loopback:
117+
return False
118+
if ip.is_private:
119+
return False
120+
if ip.is_link_local:
121+
return False
122+
if ip.is_multicast:
123+
return False
124+
if ip.is_reserved:
125+
return False
126+
return not ip.is_unspecified
127+
128+
129+
__all__ = ["is_safe_external_url"]

src/ycai/verifier.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111

1212
import httpx
1313

14+
from ycai.safe_http import is_safe_external_url
15+
1416
log = logging.getLogger(__name__)
1517

16-
Status = Literal["ok", "dead", "slow", "redirect", "error"]
18+
Status = Literal["ok", "dead", "slow", "redirect", "error", "blocked"]
1719

1820
DEFAULT_TIMEOUT = 6.0
1921
DEFAULT_CONCURRENCY = 16
@@ -22,7 +24,14 @@
2224

2325

2426
async def _check_one(client: httpx.AsyncClient, url: str) -> tuple[str, Status, str]:
25-
"""Return ``(url, status, reason)``. Never raises."""
27+
"""Return ``(url, status, reason)``. Never raises.
28+
29+
Refuses to fetch internal targets (loopback, RFC1918, link-local,
30+
cloud metadata) and reports them as ``blocked`` so the caller can
31+
distinguish ``blocked`` (we refused) from ``dead`` (the server said no).
32+
"""
33+
if not is_safe_external_url(url):
34+
return url, "blocked", "internal target"
2635
try:
2736
# HEAD first (cheaper). Some hosts 405 HEAD — fall back to GET.
2837
try:
@@ -84,6 +93,7 @@ def split_by_status(
8493
"slow": [],
8594
"redirect": [],
8695
"error": [],
96+
"blocked": [],
8797
}
8898
for url, (status, reason) in statuses.items():
8999
out[status].append((url, reason))

tests/test_crawler.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,20 @@ def test_crawl_company_handles_invalid_url_gracefully() -> None:
184184
assert result.pages == []
185185

186186

187+
def test_crawl_company_refuses_loopback_url() -> None:
188+
"""SSRF guard: a homepage pointing at loopback must be rejected before fetch."""
189+
result = asyncio.run(crawl_company("http://127.0.0.1:8080/"))
190+
assert result.error == "unsafe-homepage-url"
191+
assert result.pages == []
192+
193+
194+
def test_crawl_company_refuses_metadata_endpoint() -> None:
195+
"""SSRF guard: cloud metadata endpoints must be rejected."""
196+
result = asyncio.run(crawl_company("http://169.254.169.254/latest/meta-data/"))
197+
assert result.error == "unsafe-homepage-url"
198+
assert result.pages == []
199+
200+
187201
def test_crawl_company_skips_non_html_content_type() -> None:
188202
routes = {
189203
"https://acme.example/robots.txt": (200, "text/plain", "User-agent: *\nAllow: /\n"),

tests/test_researcher.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,48 @@ def test_validate_sources_rejects_invented_urls() -> None:
136136
assert _validate_sources(analysis, company) is False
137137

138138

139+
def test_validate_sources_rejects_subdomain_lookalike() -> None:
140+
"""An attacker-controlled subdomain that visually starts with the
141+
allowed website must NOT pass the source check.
142+
143+
The previous ``startswith`` implementation accepted
144+
``https://acme.ai.attacker.example/x`` against ``https://acme.ai`` —
145+
the host check rejects it because ``acme.ai.attacker.example`` is
146+
not the same host as ``acme.ai``.
147+
"""
148+
company = _make_company(slug="acme-ai", website="https://acme.ai")
149+
payload = json.loads(_good_response("acme-ai"))
150+
payload["sources"] = ["https://acme.ai.attacker.example/research"]
151+
analysis = _parse_response(json.dumps(payload), slug="acme-ai")
152+
assert analysis is not None
153+
assert _validate_sources(analysis, company) is False
154+
155+
156+
def test_validate_sources_accepts_www_variant() -> None:
157+
"""``www.acme.ai`` should be treated as the same host as ``acme.ai``."""
158+
company = _make_company(slug="acme-ai", website="https://acme.ai")
159+
payload = json.loads(_good_response("acme-ai"))
160+
payload["sources"] = ["https://www.acme.ai/about"]
161+
analysis = _parse_response(json.dumps(payload), slug="acme-ai")
162+
assert analysis is not None
163+
assert _validate_sources(analysis, company) is True
164+
165+
166+
def test_validate_sources_rejects_non_http_scheme() -> None:
167+
"""``file://`` / ``ftp://`` / ``javascript:`` cannot be cited as evidence."""
168+
company = _make_company(slug="acme-ai", website="https://acme.ai")
169+
payload = json.loads(_good_response("acme-ai"))
170+
payload["sources"] = ["file:///etc/passwd"]
171+
# pydantic HttpUrl will likely reject this earlier, but the guard
172+
# provides defense-in-depth even if a future schema relaxes.
173+
try:
174+
analysis = _parse_response(json.dumps(payload), slug="acme-ai")
175+
except Exception:
176+
return # pydantic rejection is also acceptable
177+
if analysis is not None:
178+
assert _validate_sources(analysis, company) is False
179+
180+
139181
# ----- analyze() with MockBackend: end-to-end flow --------------------------------------------
140182

141183

0 commit comments

Comments
 (0)