Fix recovery and citation safety

btfranklin · btfranklin · commit b6ae19d86536 · 2026-05-06T18:16:56.000-07:00
diff --git a/src/compendiumscribe/compendium/html_site_renderer.py b/src/compendiumscribe/compendium/html_site_renderer.py
@@ -4,6 +4,7 @@
 
 import html
 from typing import TYPE_CHECKING
+from urllib.parse import urlsplit
 
 from .text_utils import format_html_text, slugify
 
@@ -28,6 +29,21 @@ def _html_head(title: str, depth: int = 0) -> list[str]:
     ]
 
 
+def _safe_citation_url(url: str | None) -> str | None:
+    """Return the stripped URL if it is safe to emit as an href, else None."""
+    stripped = (url or "").strip()
+    try:  # urlsplit raises ValueError on e.g. unbalanced IPv6 brackets
+        parsed = urlsplit(stripped)
+    except ValueError:
+        return None
+    scheme = parsed.scheme.lower()
+    if scheme in {"http", "https"} and parsed.netloc:  # needs a host
+        return stripped
+    if scheme == "mailto" and parsed.path:  # needs an address
+        return stripped
+    return None
+
+
 def _nav_links(
     current: str,
     sections: list["Section"],
@@ -241,11 +257,16 @@ def _render_citations_page(compendium: "Compendium") -> str:
                 f"        <h2>[{html.escape(citation.identifier)}] "
                 f"{format_html_text(citation.title)}</h2>"
             )
-            parts.append(
-                f'        <p><a href="{html.escape(citation.url)}" '
-                f'rel="noopener noreferrer">'
-                f"{html.escape(citation.url)}</a></p>"
-            )
+            safe_url = _safe_citation_url(citation.url)
+            escaped_url = html.escape(citation.url)
+            if safe_url:
+                parts.append(
+                    f'        <p><a href="{html.escape(safe_url)}" '
+                    f'rel="noopener noreferrer">'
+                    f"{escaped_url}</a></p>"
+                )
+            else:
+                parts.append(f"        <p>{escaped_url}</p>")
             details: list[str] = []
             if citation.publisher:
                 details.append(html.escape(citation.publisher))
diff --git a/src/compendiumscribe/research/agents_workflow/source_ledger.py b/src/compendiumscribe/research/agents_workflow/source_ledger.py
@@ -11,13 +11,21 @@
 
 
 def normalize_url(url: str) -> str:
-    parsed = urlsplit(url.strip())
+    raw_url = url.strip()
+    parsed = urlsplit(raw_url)
+    if not parsed.scheme and not parsed.netloc and _looks_like_host_path(parsed.path):
+        parsed = urlsplit(f"https://{raw_url}")  # host now parses as netloc
     scheme = parsed.scheme.lower() or "https"
     netloc = parsed.netloc.lower()
     path = parsed.path.rstrip("/")
     return urlunsplit((scheme, netloc, path, "", ""))
 
 
+def _looks_like_host_path(path: str) -> bool:
+    host = path.split("/", 1)[0]  # "." / ".." are relative, not hosts
+    return bool(host.strip(".")) and ("." in host or host == "localhost")
+
+
 def build_source_ledger(
     briefs: list[SectionResearchBrief],
     *,
diff --git a/src/compendiumscribe/research/agents_workflow/state.py b/src/compendiumscribe/research/agents_workflow/state.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
+from contextlib import suppress
+import os
 from pathlib import Path
+from uuid import uuid4
 
 from .artifacts import ResearchRunState
 
@@ -10,11 +13,20 @@ def load_state(path: Path) -> ResearchRunState:
 
 
 def save_state(path: Path, state: ResearchRunState) -> None:
+    """Write *state* to a sibling temp file, then os.replace it onto *path*."""
+    payload = state.model_dump_json(indent=2) + "\n"
     path.parent.mkdir(parents=True, exist_ok=True)
-    path.write_text(
-        state.model_dump_json(indent=2) + "\n",
-        encoding="utf-8",
-    )
+    temp_path = path.with_name(f".{path.name}.{uuid4().hex}.tmp")
+    try:
+        with temp_path.open("w", encoding="utf-8") as state_file:
+            state_file.write(payload)
+            state_file.flush()
+            os.fsync(state_file.fileno())
+        os.replace(temp_path, path)
+    except BaseException:  # also clean up on KeyboardInterrupt/SystemExit
+        with suppress(OSError):
+            temp_path.unlink()
+        raise
 
 
 __all__ = ["load_state", "save_state"]
diff --git a/tests/compendium/test_html_site_renderer.py b/tests/compendium/test_html_site_renderer.py
@@ -133,6 +133,52 @@ def test_citations_page_lists_all_citations():
     assert "2024-01-15" in citations
 
 
+def test_citations_page_links_only_safe_url_schemes():
+    """Verify citation URLs with unsafe schemes are rendered as text."""
+    compendium = Compendium(
+        topic="Citation URL Safety",
+        overview="Overview.",
+        citations=[
+            Citation(
+                identifier="C1",
+                title="HTTPS Source",
+                url="https://example.com/source",
+            ),
+            Citation(
+                identifier="C2",
+                title="Email Source",
+                url="mailto:research@example.com",
+            ),
+            Citation(
+                identifier="C3",
+                title="Script Source",
+                url="javascript:alert(1)",
+            ),
+            Citation(
+                identifier="C4",
+                title="Data Source",
+                url="data:text/html,hello",
+            ),
+            Citation(
+                identifier="C5",
+                title="Malformed Source",
+                url="https:example.com/source",  # scheme without "//host"
+            ),
+        ],
+    )
+
+    citations = compendium.to_html_site()["citations.html"]
+
+    assert 'href="https://example.com/source"' in citations
+    assert 'href="mailto:research@example.com"' in citations
+    assert "javascript:alert(1)" in citations  # text is still shown
+    assert "data:text/html,hello" in citations
+    assert "https:example.com/source" in citations
+    assert 'href="javascript:alert(1)"' not in citations  # never an href
+    assert 'href="data:text/html,hello"' not in citations
+    assert 'href="https:example.com/source"' not in citations
+
+
 def test_open_questions_page_lists_all_questions():
     """Verify open questions page contains all questions."""
     compendium = _sample_compendium()
diff --git a/tests/research/test_agents_artifacts.py b/tests/research/test_agents_artifacts.py
@@ -8,6 +8,8 @@
     SourceLedger,
     SourceLedgerEntry,
     build_source_ledger,
+    mark_cited_sources,
+    normalize_url,
     prepare_compendium_payload,
     validate_compendium_citations,
 )
@@ -82,6 +84,49 @@ def test_source_ledger_deduplicates_urls_and_keeps_section_usage() -> None:
     assert ledger.entries[0].status == "cited"
 
 
+def test_normalize_url_promotes_scheme_less_hosts_to_https() -> None:
+    assert normalize_url("example.com/source") == "https://example.com/source"
+    assert normalize_url("localhost/report/") == "https://localhost/report"
+    assert normalize_url("http://Example.com/report/") == (
+        "http://example.com/report"  # explicit http scheme is kept
+    )
+
+
+def test_source_ledger_matches_scheme_less_source_urls_to_cited_urls() -> None:
+    brief = SectionResearchBrief(
+        section_id="s1",
+        title="One",
+        summary="Summary",
+        findings=[
+            {
+                "title": "Finding",
+                "evidence": "Evidence",
+                "source_urls": ["https://example.com/source"],
+            }
+        ],
+        sources=[
+            {
+                "title": "Source",
+                "url": "example.com/source",  # same source, scheme omitted
+                "status": "consulted",
+            }
+        ],
+    )
+    ledger = build_source_ledger([brief])
+
+    mark_cited_sources(
+        ledger,
+        [
+            source_url
+            for finding in brief.findings
+            for source_url in finding.source_urls
+        ],
+    )
+
+    assert ledger.entries[0].url == "https://example.com/source"
+    assert ledger.entries[0].status == "cited"  # was "consulted"
+
+
 def test_rejected_and_consulted_only_sources_cannot_be_final_citations() -> None:
     brief = SectionResearchBrief(
         section_id="s1",
diff --git a/tests/research/test_state.py b/tests/research/test_state.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from pathlib import Path
+from unittest import mock
+
+import pytest
+
+from compendiumscribe.research.agents_workflow.artifacts import ResearchRunState
+from compendiumscribe.research.agents_workflow.state import load_state, save_state
+
+
+def test_save_state_writes_loadable_sidecar(tmp_path: Path) -> None:
+    state_path = tmp_path / "report.research.json"
+    state = ResearchRunState(
+        topic="Atomic Recovery",
+        title="Atomic Recovery",
+        output_formats=["md"],
+    )
+
+    save_state(state_path, state)
+
+    loaded = load_state(state_path)  # round-trip through the file on disk
+    assert loaded.run_id == state.run_id
+    assert loaded.topic == "Atomic Recovery"
+    assert loaded.output_formats == ["md"]
+
+
+def test_save_state_keeps_existing_sidecar_when_replace_fails(
+    tmp_path: Path,
+) -> None:
+    state_path = tmp_path / "report.research.json"
+    original = ResearchRunState(topic="Original", title="Original")
+    updated = ResearchRunState(topic="Updated", title="Updated")
+    save_state(state_path, original)
+    original_payload = state_path.read_text(encoding="utf-8")
+
+    with mock.patch(
+        "compendiumscribe.research.agents_workflow.state.os.replace",
+        side_effect=OSError("replace failed"),  # fail the rename step
+    ):
+        with pytest.raises(OSError, match="replace failed"):
+            save_state(state_path, updated)  # os.replace is patched to fail
+
+    assert state_path.read_text(encoding="utf-8") == original_payload
+    assert load_state(state_path).topic == "Original"
+    assert not list(tmp_path.glob(".report.research.json.*.tmp"))