Skip to content

Commit b6ae19d

Browse files
committed
Fix recovery and citation safety
1 parent f13d600 commit b6ae19d

6 files changed

Lines changed: 188 additions & 10 deletions

File tree

src/compendiumscribe/compendium/html_site_renderer.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
import html
66
from typing import TYPE_CHECKING
7+
from urllib.parse import urlsplit
78

89
from .text_utils import format_html_text, slugify
910

@@ -28,6 +29,21 @@ def _html_head(title: str, depth: int = 0) -> list[str]:
2829
]
2930

3031

32+
def _safe_citation_url(url: str | None) -> str | None:
33+
if not url:
34+
return None
35+
stripped = url.strip()
36+
if not stripped:
37+
return None
38+
parsed = urlsplit(stripped)
39+
scheme = parsed.scheme.lower()
40+
if scheme in {"http", "https"} and parsed.netloc:
41+
return stripped
42+
if scheme == "mailto" and parsed.path:
43+
return stripped
44+
return None
45+
46+
3147
def _nav_links(
3248
current: str,
3349
sections: list["Section"],
@@ -241,11 +257,16 @@ def _render_citations_page(compendium: "Compendium") -> str:
241257
f" <h2>[{html.escape(citation.identifier)}] "
242258
f"{format_html_text(citation.title)}</h2>"
243259
)
244-
parts.append(
245-
f' <p><a href="{html.escape(citation.url)}" '
246-
f'rel="noopener noreferrer">'
247-
f"{html.escape(citation.url)}</a></p>"
248-
)
260+
safe_url = _safe_citation_url(citation.url)
261+
escaped_url = html.escape(citation.url)
262+
if safe_url:
263+
parts.append(
264+
f' <p><a href="{html.escape(safe_url)}" '
265+
f'rel="noopener noreferrer">'
266+
f"{escaped_url}</a></p>"
267+
)
268+
else:
269+
parts.append(f" <p>{escaped_url}</p>")
249270
details: list[str] = []
250271
if citation.publisher:
251272
details.append(html.escape(citation.publisher))

src/compendiumscribe/research/agents_workflow/source_ledger.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,21 @@
1111

1212

1313
def normalize_url(url: str) -> str:
14-
parsed = urlsplit(url.strip())
14+
raw_url = url.strip()
15+
parsed = urlsplit(raw_url)
16+
if not parsed.scheme and not parsed.netloc and _looks_like_host_path(parsed.path):
17+
parsed = urlsplit(f"https://{raw_url}")
1518
scheme = parsed.scheme.lower() or "https"
1619
netloc = parsed.netloc.lower()
1720
path = parsed.path.rstrip("/")
1821
return urlunsplit((scheme, netloc, path, "", ""))
1922

2023

24+
def _looks_like_host_path(path: str) -> bool:
25+
host = path.split("/", 1)[0]
26+
return bool(host) and ("." in host or host == "localhost")
27+
28+
2129
def build_source_ledger(
2230
briefs: list[SectionResearchBrief],
2331
*,
Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
from __future__ import annotations
22

3+
from contextlib import suppress
4+
import os
35
from pathlib import Path
6+
from uuid import uuid4
47

58
from .artifacts import ResearchRunState
69

@@ -10,11 +13,20 @@ def load_state(path: Path) -> ResearchRunState:
1013

1114

1215
def save_state(path: Path, state: ResearchRunState) -> None:
16+
payload = state.model_dump_json(indent=2) + "\n"
1317
path.parent.mkdir(parents=True, exist_ok=True)
14-
path.write_text(
15-
state.model_dump_json(indent=2) + "\n",
16-
encoding="utf-8",
17-
)
18+
temp_path = path.with_name(f".{path.name}.{uuid4().hex}.tmp")
19+
20+
try:
21+
with temp_path.open("w", encoding="utf-8") as state_file:
22+
state_file.write(payload)
23+
state_file.flush()
24+
os.fsync(state_file.fileno())
25+
os.replace(temp_path, path)
26+
except Exception:
27+
with suppress(OSError):
28+
temp_path.unlink()
29+
raise
1830

1931

2032
__all__ = ["load_state", "save_state"]

tests/compendium/test_html_site_renderer.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,52 @@ def test_citations_page_lists_all_citations():
133133
assert "2024-01-15" in citations
134134

135135

136+
def test_citations_page_links_only_safe_url_schemes():
137+
"""Verify citation URLs with unsafe schemes are rendered as text."""
138+
compendium = Compendium(
139+
topic="Citation URL Safety",
140+
overview="Overview.",
141+
citations=[
142+
Citation(
143+
identifier="C1",
144+
title="HTTPS Source",
145+
url="https://example.com/source",
146+
),
147+
Citation(
148+
identifier="C2",
149+
title="Email Source",
150+
url="mailto:research@example.com",
151+
),
152+
Citation(
153+
identifier="C3",
154+
title="Script Source",
155+
url="javascript:alert(1)",
156+
),
157+
Citation(
158+
identifier="C4",
159+
title="Data Source",
160+
url="data:text/html,hello",
161+
),
162+
Citation(
163+
identifier="C5",
164+
title="Malformed Source",
165+
url="https:example.com/source",
166+
),
167+
],
168+
)
169+
170+
citations = compendium.to_html_site()["citations.html"]
171+
172+
assert 'href="https://example.com/source"' in citations
173+
assert 'href="mailto:research@example.com"' in citations
174+
assert "javascript:alert(1)" in citations
175+
assert "data:text/html,hello" in citations
176+
assert "https:example.com/source" in citations
177+
assert 'href="javascript:alert(1)"' not in citations
178+
assert 'href="data:text/html,hello"' not in citations
179+
assert 'href="https:example.com/source"' not in citations
180+
181+
136182
def test_open_questions_page_lists_all_questions():
137183
"""Verify open questions page contains all questions."""
138184
compendium = _sample_compendium()

tests/research/test_agents_artifacts.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
SourceLedger,
99
SourceLedgerEntry,
1010
build_source_ledger,
11+
mark_cited_sources,
12+
normalize_url,
1113
prepare_compendium_payload,
1214
validate_compendium_citations,
1315
)
@@ -82,6 +84,49 @@ def test_source_ledger_deduplicates_urls_and_keeps_section_usage() -> None:
8284
assert ledger.entries[0].status == "cited"
8385

8486

87+
def test_normalize_url_promotes_scheme_less_hosts_to_https() -> None:
88+
assert normalize_url("example.com/source") == "https://example.com/source"
89+
assert normalize_url("localhost/report/") == "https://localhost/report"
90+
assert normalize_url("http://Example.com/report/") == (
91+
"http://example.com/report"
92+
)
93+
94+
95+
def test_source_ledger_matches_scheme_less_source_urls_to_cited_urls() -> None:
96+
brief = SectionResearchBrief(
97+
section_id="s1",
98+
title="One",
99+
summary="Summary",
100+
findings=[
101+
{
102+
"title": "Finding",
103+
"evidence": "Evidence",
104+
"source_urls": ["https://example.com/source"],
105+
}
106+
],
107+
sources=[
108+
{
109+
"title": "Source",
110+
"url": "example.com/source",
111+
"status": "consulted",
112+
}
113+
],
114+
)
115+
ledger = build_source_ledger([brief])
116+
117+
mark_cited_sources(
118+
ledger,
119+
[
120+
source_url
121+
for finding in brief.findings
122+
for source_url in finding.source_urls
123+
],
124+
)
125+
126+
assert ledger.entries[0].url == "https://example.com/source"
127+
assert ledger.entries[0].status == "cited"
128+
129+
85130
def test_rejected_and_consulted_only_sources_cannot_be_final_citations() -> None:
86131
brief = SectionResearchBrief(
87132
section_id="s1",

tests/research/test_state.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
from unittest import mock
5+
6+
import pytest
7+
8+
from compendiumscribe.research.agents_workflow.artifacts import ResearchRunState
9+
from compendiumscribe.research.agents_workflow.state import load_state, save_state
10+
11+
12+
def test_save_state_writes_loadable_sidecar(tmp_path: Path) -> None:
13+
state_path = tmp_path / "report.research.json"
14+
state = ResearchRunState(
15+
topic="Atomic Recovery",
16+
title="Atomic Recovery",
17+
output_formats=["md"],
18+
)
19+
20+
save_state(state_path, state)
21+
22+
loaded = load_state(state_path)
23+
assert loaded.run_id == state.run_id
24+
assert loaded.topic == "Atomic Recovery"
25+
assert loaded.output_formats == ["md"]
26+
27+
28+
def test_save_state_keeps_existing_sidecar_when_replace_fails(
29+
tmp_path: Path,
30+
) -> None:
31+
state_path = tmp_path / "report.research.json"
32+
original = ResearchRunState(topic="Original", title="Original")
33+
updated = ResearchRunState(topic="Updated", title="Updated")
34+
save_state(state_path, original)
35+
original_payload = state_path.read_text(encoding="utf-8")
36+
37+
with mock.patch(
38+
"compendiumscribe.research.agents_workflow.state.os.replace",
39+
side_effect=OSError("replace failed"),
40+
):
41+
with pytest.raises(OSError, match="replace failed"):
42+
save_state(state_path, updated)
43+
44+
assert state_path.read_text(encoding="utf-8") == original_payload
45+
assert load_state(state_path).topic == "Original"
46+
assert not list(tmp_path.glob(".report.research.json.*.tmp"))

0 commit comments

Comments
 (0)