fix(tools): disclose read_url Jina dependency + cache opt-out (#122)

Teerapat-Vatpitak · web-flow · commit 1b547d23343a · 2026-05-16T14:51:27.000+08:00
- SKILL.md now explicitly notes that read_url forwards the full URL to the third-party Jina Reader (r.jina.ai), and warns against passing credentials or internal URLs.
- New optional no_cache parameter (default False): passing no_cache=True sends the x-no-cache header to bypass Jina's cache when freshness matters.
- Responses that contain Jina's cached-snapshot marker now surface a cached: true flag.
- Error paths keep the upstream HTTP body and exception text in the returned error string (and also log them via logger.warning) so operators can diagnose remote failures without grepping logs.
diff --git a/agent/src/skills/web-reader/SKILL.md b/agent/src/skills/web-reader/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: web-reader
-description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash.
+description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash. Sends the full URL to the third-party Jina Reader (r.jina.ai).
 category: tool
 ---
 # Web Reading
@@ -35,10 +35,23 @@ Returns JSON:
 ## Notes
 
 - Content longer than 8000 characters will be truncated, with the total length noted at the end
-- Some websites may block Jina Reader (returning HTTP 451). In that case, fall back to bash + requests
 - Dynamically rendered SPA pages may return only skeleton HTML
 - Chinese content is supported normally
 
+## Privacy & freshness
+
+- **Third-party dependency:** `read_url` forwards the full target URL
+  (including any query string) to the external Jina Reader service
+  (`r.jina.ai`). Do **not** pass URLs containing credentials, tokens, or
+  private/internal addresses — they would leave this host.
+- **Caching/staleness:** results may be a cached snapshot, not live data.
+  When the reader flags a cached snapshot, the JSON includes `"cached": true`; pass `no_cache=true` to
+  force a fresh fetch (slower — use only when freshness matters).
+- **Bash fallback caveat:** if a site blocks the reader (e.g. HTTP 451) a
+  manual `bash + requests` fetch is possible, but it **bypasses this
+  tool's URL safety guard and the Jina layer** — use sparingly and never
+  for internal/authenticated URLs.
+
 ## Common Usage
 
 ### Read API Documentation
diff --git a/agent/src/tools/web_reader_tool.py b/agent/src/tools/web_reader_tool.py
@@ -4,15 +4,19 @@
 
 import ipaddress
 import json
+import logging
 from urllib.parse import urlsplit
 
 import requests
 
 from src.agent.tools import BaseTool
 
+logger = logging.getLogger(__name__)
+
 _JINA_PREFIX = "https://r.jina.ai/"
 _TIMEOUT = 30
 _MAX_LENGTH = 8000
+_CACHED_MARKER = "Warning: This is a cached snapshot"
 
 
 def _url_allowed(url: str) -> tuple[bool, str]:
@@ -52,30 +56,40 @@ def _url_allowed(url: str) -> tuple[bool, str]:
     return True, ""
 
 
-def read_url(url: str) -> str:
+def read_url(url: str, no_cache: bool = False) -> str:
     """Fetch web page content via the Jina Reader API.
 
+    The full URL (including query string) is sent to the third-party Jina
+    Reader service (r.jina.ai); never pass credentials/tokens or private
+    addresses. Results may be a cached snapshot.
+
     Args:
         url: Target URL.
+        no_cache: When true, ask the reader for a fresh (uncached) fetch.
 
     Returns:
-        JSON-formatted result containing title, content, and url.
+        JSON result with title, content, url; ``cached: true`` is added
+        when the reader served a stale snapshot.
     """
     target_url = url.strip()
     allowed, error = _url_allowed(target_url)
     if not allowed:
         return json.dumps({"status": "error", "error": error}, ensure_ascii=False)
 
     try:
+        headers = {"Accept": "text/markdown"}
+        if no_cache:
+            headers["x-no-cache"] = "true"
         resp = requests.get(
             f"{_JINA_PREFIX}{target_url}",
-            headers={"Accept": "text/markdown"},
+            headers=headers,
             timeout=_TIMEOUT,
         )
         if resp.status_code != 200:
+            logger.warning("read_url upstream HTTP %s: %s", resp.status_code, resp.text[:500])
             return json.dumps({
                 "status": "error",
-                "error": f"Jina Reader returned {resp.status_code}: {resp.text[:500]}",
+                "error": f"remote reader returned HTTP {resp.status_code}: {resp.text[:500]}",
             }, ensure_ascii=False)
 
         text = resp.text
@@ -88,18 +102,25 @@ def read_url(url: str) -> str:
         if len(text) > _MAX_LENGTH:
             text = text[:_MAX_LENGTH] + f"\n\n... (truncated, total {len(resp.text)} chars)"
 
-        return json.dumps({
+        result = {
             "status": "ok",
             "title": title,
             "url": target_url,
             "content": text,
             "length": len(resp.text),
-        }, ensure_ascii=False)
+        }
+        if _CACHED_MARKER in resp.text:
+            result["cached"] = True
+        return json.dumps(result, ensure_ascii=False)
 
     except requests.Timeout:
         return json.dumps({"status": "error", "error": f"Request timed out ({_TIMEOUT}s)"}, ensure_ascii=False)
     except Exception as exc:
-        return json.dumps({"status": "error", "error": str(exc)}, ensure_ascii=False)
+        logger.warning("read_url request failed: %s", exc)
+        return json.dumps(
+            {"status": "error", "error": f"remote reader request failed: {exc}"},
+            ensure_ascii=False,
+        )
 
 
 class WebReaderTool(BaseTool):
@@ -111,11 +132,12 @@ class WebReaderTool(BaseTool):
         "type": "object",
         "properties": {
             "url": {"type": "string", "description": "URL of the web page to read"},
+            "no_cache": {"type": "boolean", "description": "Request a fresh (uncached) fetch", "default": False},
         },
         "required": ["url"],
     }
     repeatable = True
 
     def execute(self, **kwargs) -> str:
         """Fetch web page."""
-        return read_url(kwargs["url"])
+        return read_url(kwargs["url"], no_cache=bool(kwargs.get("no_cache", False)))
diff --git a/agent/tests/test_web_reader_privacy.py b/agent/tests/test_web_reader_privacy.py
@@ -0,0 +1,77 @@
+"""Regression tests for read_url third-party (Jina) hardening.
+
+Network is mocked; no live r.jina.ai calls. Asserts: HTTP errors surface
+the upstream status + body for debugging; a cached snapshot is surfaced
+via `cached: true`; `no_cache=True` sends the x-no-cache header while
+the default path is byte-identical (no extra header).
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+import src.tools.web_reader_tool as wr
+from src.tools.web_reader_tool import read_url
+
+URL = "https://example.com/page"
+
+
+class _Resp:
+    def __init__(self, status_code=200, text=""):
+        self.status_code = status_code
+        self.text = text
+
+
+@pytest.fixture
+def captured(monkeypatch):
+    box = {}
+
+    def fake_get(url, headers=None, timeout=None):
+        box["url"] = url
+        box["headers"] = headers or {}
+        r = box["resp"]
+        if isinstance(r, BaseException):
+            raise r
+        return r
+
+    monkeypatch.setattr(wr.requests, "get", fake_get)
+    return box
+
+
+def test_http_error_surfaces_status_and_body(captured):
+    captured["resp"] = _Resp(451, "ParamValidationError: bad input")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "error"
+    assert "451" in out["error"]
+    assert "ParamValidationError: bad input" in out["error"]
+
+
+def test_exception_error_surfaces_exc_text(captured):
+    captured["resp"] = RuntimeError("boom: connect failed (10.0.0.1)")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "error"
+    assert "boom: connect failed" in out["error"]
+
+
+def test_cached_snapshot_is_flagged(captured):
+    captured["resp"] = _Resp(200, "Title: X\n\nWarning: This is a cached snapshot\n\nbody")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "ok"
+    assert out.get("cached") is True
+
+
+def test_fresh_response_has_no_cached_key(captured):
+    captured["resp"] = _Resp(200, "Title: X\n\nlive body content")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "ok"
+    assert "cached" not in out  # additive: absent on the normal path
+
+
+def test_no_cache_header_opt_in_only(captured):
+    captured["resp"] = _Resp(200, "Title: X\n\nbody")
+    read_url(URL)  # default
+    assert "x-no-cache" not in {k.lower() for k in captured["headers"]}
+    read_url(URL, no_cache=True)
+    assert captured["headers"].get("x-no-cache") == "true"