Merge remote-tracking branch 'origin/main'

warren618 · warren618 · commit 98afcd93836b · 2026-05-16T17:10:58.000+08:00
# Conflicts:
#	agent/src/tools/web_reader_tool.py
diff --git a/agent/src/skills/web-reader/SKILL.md b/agent/src/skills/web-reader/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: web-reader
-description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash.
+description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash. Sends the full URL to the third-party Jina Reader (r.jina.ai).
 category: tool
 ---
 # Web Reading
@@ -35,10 +35,23 @@ Returns JSON:
 ## Notes
 
 - Content longer than 8000 characters will be truncated, with the total length noted at the end
-- Some websites may block Jina Reader (returning HTTP 451). In that case, fall back to bash + requests
 - Dynamically rendered SPA pages may return only skeleton HTML
 - Chinese content is supported normally
 
+## Privacy & freshness
+
+- **Third-party dependency:** `read_url` forwards the full target URL
+  (including any query string) to the external Jina Reader service
+  (`r.jina.ai`). Do **not** pass URLs containing credentials, tokens, or
+  private/internal addresses — they would leave this host.
+- **Caching/staleness:** results may be a cached snapshot, not live data.
+  When the reader serves one, the JSON includes `"cached": true`; pass
+  `no_cache=true` to force a fresh fetch (slower — use only when freshness matters).
+- **Bash fallback caveat:** if a site blocks the reader (e.g. HTTP 451), a
+  manual `bash + requests` fetch is possible, but it **bypasses this
+  tool's URL safety guard and the Jina layer** — use it sparingly and never
+  for internal/authenticated URLs.
+
 ## Common Usage
 
 ### Read API Documentation
diff --git a/agent/src/tools/web_reader_tool.py b/agent/src/tools/web_reader_tool.py
@@ -4,16 +4,20 @@
 
 import ipaddress
 import json
+import logging
 from urllib.parse import urlsplit
 
 import requests
 
 from src.agent.tools import BaseTool
 from src.security.scanner import with_security_warnings
 
+logger = logging.getLogger(__name__)
+
 _JINA_PREFIX = "https://r.jina.ai/"
 _TIMEOUT = 30
 _MAX_LENGTH = 8000
+_CACHED_MARKER = "Warning: This is a cached snapshot"
 
 
 def _url_allowed(url: str) -> tuple[bool, str]:
@@ -53,30 +57,40 @@ def _url_allowed(url: str) -> tuple[bool, str]:
     return True, ""
 
 
-def read_url(url: str) -> str:
+def read_url(url: str, no_cache: bool = False) -> str:
     """Fetch web page content via the Jina Reader API.
 
+    The full URL (including query string) is sent to the third-party Jina
+    Reader service (r.jina.ai); never pass credentials/tokens or private
+    addresses. Results may be a cached snapshot.
+
     Args:
         url: Target URL.
+        no_cache: When true, ask the reader for a fresh (uncached) fetch.
 
     Returns:
-        JSON-formatted result containing title, content, and url.
+        JSON result with title, content, url; ``cached: true`` is added
+        when the reader served a stale snapshot.
     """
     target_url = url.strip()
     allowed, error = _url_allowed(target_url)
     if not allowed:
         return json.dumps({"status": "error", "error": error}, ensure_ascii=False)
 
     try:
+        headers = {"Accept": "text/markdown"}
+        if no_cache:
+            headers["x-no-cache"] = "true"
         resp = requests.get(
             f"{_JINA_PREFIX}{target_url}",
-            headers={"Accept": "text/markdown"},
+            headers=headers,
             timeout=_TIMEOUT,
         )
         if resp.status_code != 200:
+            logger.warning("read_url upstream HTTP %s: %s", resp.status_code, resp.text[:500])
             return json.dumps({
                 "status": "error",
-                "error": f"Jina Reader returned {resp.status_code}: {resp.text[:500]}",
+                "error": f"remote reader returned HTTP {resp.status_code}: {resp.text[:500]}",
             }, ensure_ascii=False)
 
         text = resp.text
@@ -89,20 +103,26 @@ def read_url(url: str) -> str:
         if len(text) > _MAX_LENGTH:
             text = text[:_MAX_LENGTH] + f"\n\n... (truncated, total {len(resp.text)} chars)"
 
-        payload = {
+        result = {
             "status": "ok",
             "title": title,
             "url": target_url,
             "content": text,
             "length": len(resp.text),
         }
-        payload = with_security_warnings(payload, fields=("content",))
-        return json.dumps(payload, ensure_ascii=False)
+        if _CACHED_MARKER in resp.text:
+            result["cached"] = True
+        result = with_security_warnings(result, fields=("content",))
+        return json.dumps(result, ensure_ascii=False)
 
     except requests.Timeout:
         return json.dumps({"status": "error", "error": f"Request timed out ({_TIMEOUT}s)"}, ensure_ascii=False)
     except Exception as exc:
-        return json.dumps({"status": "error", "error": str(exc)}, ensure_ascii=False)
+        logger.warning("read_url request failed: %s", exc)
+        return json.dumps(
+            {"status": "error", "error": f"remote reader request failed: {exc}"},
+            ensure_ascii=False,
+        )
 
 
 class WebReaderTool(BaseTool):
@@ -114,11 +134,12 @@ class WebReaderTool(BaseTool):
         "type": "object",
         "properties": {
             "url": {"type": "string", "description": "URL of the web page to read"},
+            "no_cache": {"type": "boolean", "description": "Request a fresh (uncached) fetch", "default": False},
         },
         "required": ["url"],
     }
     repeatable = True
 
     def execute(self, **kwargs) -> str:
         """Fetch web page."""
-        return read_url(kwargs["url"])
+        return read_url(kwargs["url"], no_cache=bool(kwargs.get("no_cache", False)))
diff --git a/agent/tests/test_web_reader_privacy.py b/agent/tests/test_web_reader_privacy.py
@@ -0,0 +1,77 @@
+"""Regression tests for read_url third-party (Jina) hardening.
+
+Network is mocked; no live r.jina.ai calls. Asserts: HTTP errors surface
+the upstream status + body for debugging; a cached snapshot is surfaced
+via `cached: true`; `no_cache=True` sends the x-no-cache header while
+the default request omits that header (nothing extra is sent).
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+import src.tools.web_reader_tool as wr
+from src.tools.web_reader_tool import read_url
+
+URL = "https://example.com/page"
+
+
+class _Resp:
+    def __init__(self, status_code=200, text=""):
+        self.status_code = status_code
+        self.text = text
+
+
+@pytest.fixture
+def captured(monkeypatch):
+    box = {}
+
+    def fake_get(url, headers=None, timeout=None):
+        box["url"] = url
+        box["headers"] = headers or {}
+        r = box["resp"]
+        if isinstance(r, BaseException):
+            raise r
+        return r
+
+    monkeypatch.setattr(wr.requests, "get", fake_get)
+    return box
+
+
+def test_http_error_surfaces_status_and_body(captured):
+    captured["resp"] = _Resp(451, "ParamValidationError: bad input")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "error"
+    assert "451" in out["error"]
+    assert "ParamValidationError: bad input" in out["error"]
+
+
+def test_exception_error_surfaces_exc_text(captured):
+    captured["resp"] = RuntimeError("boom: connect failed (10.0.0.1)")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "error"
+    assert "boom: connect failed" in out["error"]
+
+
+def test_cached_snapshot_is_flagged(captured):
+    captured["resp"] = _Resp(200, "Title: X\n\nWarning: This is a cached snapshot\n\nbody")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "ok"
+    assert out.get("cached") is True
+
+
+def test_fresh_response_has_no_cached_key(captured):
+    captured["resp"] = _Resp(200, "Title: X\n\nlive body content")
+    out = json.loads(read_url(URL))
+    assert out["status"] == "ok"
+    assert "cached" not in out  # additive: absent on the normal path
+
+
+def test_no_cache_header_opt_in_only(captured):
+    captured["resp"] = _Resp(200, "Title: X\n\nbody")
+    read_url(URL)  # default
+    assert "x-no-cache" not in {k.lower() for k in captured["headers"]}
+    read_url(URL, no_cache=True)
+    assert captured["headers"].get("x-no-cache") == "true"