Skip to content

Commit 1b547d2

Browse files
fix(tools): disclose read_url Jina dependency + cache opt-out (#122)
- SKILL.md now explicitly notes that read_url forwards the full URL to the third-party Jina Reader (r.jina.ai), and warns against passing credentials or internal URLs. - New optional no_cache=True parameter sends x-no-cache to bypass Jina's cache when freshness matters. - Responses that contain Jina's cached-snapshot marker now surface a cached: true flag. - Error paths keep the upstream HTTP body and exception text in the returned error string (and also log them via logger.warning) so operators can diagnose remote failures without grepping logs.
1 parent d20b590 commit 1b547d2

3 files changed

Lines changed: 122 additions & 10 deletions

File tree

agent/src/skills/web-reader/SKILL.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
name: web-reader
3-
description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash.
3+
description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash. Sends the full URL to the third-party Jina Reader (r.jina.ai).
44
category: tool
55
---
66
# Web Reading
@@ -35,10 +35,23 @@ Returns JSON:
3535
## Notes
3636

3737
- Content longer than 8000 characters will be truncated, with the total length noted at the end
38-
- Some websites may block Jina Reader (returning HTTP 451). In that case, fall back to bash + requests
3938
- Dynamically rendered SPA pages may return only skeleton HTML
4039
- Chinese content is supported normally
4140

41+
## Privacy & freshness
42+
43+
- **Third-party dependency:** `read_url` forwards the full target URL
44+
(including any query string) to the external Jina Reader service
45+
(`r.jina.ai`). Do **not** pass URLs containing credentials, tokens, or
46+
private/internal addresses — they would leave this host.
47+
- **Caching/staleness:** results may be a cached snapshot, not live data.
48+
When stale, the JSON includes `"cached": true`; pass `no_cache=true` to
49+
force a fresh fetch (slower — use only when freshness matters).
50+
- **Bash fallback caveat:** if a site blocks the reader (e.g. HTTP 451) a
51+
manual `bash + requests` fetch is possible, but it **bypasses this
52+
tool's URL safety guard and the Jina layer** — use sparingly and never
53+
for internal/authenticated URLs.
54+
4255
## Common Usage
4356

4457
### Read API Documentation

agent/src/tools/web_reader_tool.py

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,19 @@
44

55
import ipaddress
66
import json
7+
import logging
78
from urllib.parse import urlsplit
89

910
import requests
1011

1112
from src.agent.tools import BaseTool
1213

14+
logger = logging.getLogger(__name__)
15+
1316
_JINA_PREFIX = "https://r.jina.ai/"
1417
_TIMEOUT = 30
1518
_MAX_LENGTH = 8000
19+
_CACHED_MARKER = "Warning: This is a cached snapshot"
1620

1721

1822
def _url_allowed(url: str) -> tuple[bool, str]:
@@ -52,30 +56,40 @@ def _url_allowed(url: str) -> tuple[bool, str]:
5256
return True, ""
5357

5458

55-
def read_url(url: str) -> str:
59+
def read_url(url: str, no_cache: bool = False) -> str:
5660
"""Fetch web page content via the Jina Reader API.
5761
62+
The full URL (including query string) is sent to the third-party Jina
63+
Reader service (r.jina.ai); never pass credentials/tokens or private
64+
addresses. Results may be a cached snapshot.
65+
5866
Args:
5967
url: Target URL.
68+
no_cache: When true, ask the reader for a fresh (uncached) fetch.
6069
6170
Returns:
62-
JSON-formatted result containing title, content, and url.
71+
JSON result with title, content, url; ``cached: true`` is added
72+
when the reader served a stale snapshot.
6373
"""
6474
target_url = url.strip()
6575
allowed, error = _url_allowed(target_url)
6676
if not allowed:
6777
return json.dumps({"status": "error", "error": error}, ensure_ascii=False)
6878

6979
try:
80+
headers = {"Accept": "text/markdown"}
81+
if no_cache:
82+
headers["x-no-cache"] = "true"
7083
resp = requests.get(
7184
f"{_JINA_PREFIX}{target_url}",
72-
headers={"Accept": "text/markdown"},
85+
headers=headers,
7386
timeout=_TIMEOUT,
7487
)
7588
if resp.status_code != 200:
89+
logger.warning("read_url upstream HTTP %s: %s", resp.status_code, resp.text[:500])
7690
return json.dumps({
7791
"status": "error",
78-
"error": f"Jina Reader returned {resp.status_code}: {resp.text[:500]}",
92+
"error": f"remote reader returned HTTP {resp.status_code}: {resp.text[:500]}",
7993
}, ensure_ascii=False)
8094

8195
text = resp.text
@@ -88,18 +102,25 @@ def read_url(url: str) -> str:
88102
if len(text) > _MAX_LENGTH:
89103
text = text[:_MAX_LENGTH] + f"\n\n... (truncated, total {len(resp.text)} chars)"
90104

91-
return json.dumps({
105+
result = {
92106
"status": "ok",
93107
"title": title,
94108
"url": target_url,
95109
"content": text,
96110
"length": len(resp.text),
97-
}, ensure_ascii=False)
111+
}
112+
if _CACHED_MARKER in resp.text:
113+
result["cached"] = True
114+
return json.dumps(result, ensure_ascii=False)
98115

99116
except requests.Timeout:
100117
return json.dumps({"status": "error", "error": f"Request timed out ({_TIMEOUT}s)"}, ensure_ascii=False)
101118
except Exception as exc:
102-
return json.dumps({"status": "error", "error": str(exc)}, ensure_ascii=False)
119+
logger.warning("read_url request failed: %s", exc)
120+
return json.dumps(
121+
{"status": "error", "error": f"remote reader request failed: {exc}"},
122+
ensure_ascii=False,
123+
)
103124

104125

105126
class WebReaderTool(BaseTool):
@@ -111,11 +132,12 @@ class WebReaderTool(BaseTool):
111132
"type": "object",
112133
"properties": {
113134
"url": {"type": "string", "description": "URL of the web page to read"},
135+
"no_cache": {"type": "boolean", "description": "Request a fresh (uncached) fetch", "default": False},
114136
},
115137
"required": ["url"],
116138
}
117139
repeatable = True
118140

119141
def execute(self, **kwargs) -> str:
120142
"""Fetch web page."""
121-
return read_url(kwargs["url"])
143+
return read_url(kwargs["url"], no_cache=bool(kwargs.get("no_cache", False)))
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Regression tests for read_url third-party (Jina) hardening.
2+
3+
Network is mocked; no live r.jina.ai calls. Asserts: HTTP errors surface
4+
the upstream status + body for debugging; a cached snapshot is surfaced
5+
via `cached: true`; `no_cache=True` sends the x-no-cache header while
6+
the default path is byte-identical (no extra header).
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import json
12+
13+
import pytest
14+
15+
import src.tools.web_reader_tool as wr
16+
from src.tools.web_reader_tool import read_url
17+
18+
URL = "https://example.com/page"
19+
20+
21+
class _Resp:
22+
def __init__(self, status_code=200, text=""):
23+
self.status_code = status_code
24+
self.text = text
25+
26+
27+
@pytest.fixture
28+
def captured(monkeypatch):
29+
box = {}
30+
31+
def fake_get(url, headers=None, timeout=None):
32+
box["url"] = url
33+
box["headers"] = headers or {}
34+
r = box["resp"]
35+
if isinstance(r, BaseException):
36+
raise r
37+
return r
38+
39+
monkeypatch.setattr(wr.requests, "get", fake_get)
40+
return box
41+
42+
43+
def test_http_error_surfaces_status_and_body(captured):
44+
captured["resp"] = _Resp(451, "ParamValidationError: bad input")
45+
out = json.loads(read_url(URL))
46+
assert out["status"] == "error"
47+
assert "451" in out["error"]
48+
assert "ParamValidationError: bad input" in out["error"]
49+
50+
51+
def test_exception_error_surfaces_exc_text(captured):
52+
captured["resp"] = RuntimeError("boom: connect failed (10.0.0.1)")
53+
out = json.loads(read_url(URL))
54+
assert out["status"] == "error"
55+
assert "boom: connect failed" in out["error"]
56+
57+
58+
def test_cached_snapshot_is_flagged(captured):
59+
captured["resp"] = _Resp(200, "Title: X\n\nWarning: This is a cached snapshot\n\nbody")
60+
out = json.loads(read_url(URL))
61+
assert out["status"] == "ok"
62+
assert out.get("cached") is True
63+
64+
65+
def test_fresh_response_has_no_cached_key(captured):
66+
captured["resp"] = _Resp(200, "Title: X\n\nlive body content")
67+
out = json.loads(read_url(URL))
68+
assert out["status"] == "ok"
69+
assert "cached" not in out # additive: absent on the normal path
70+
71+
72+
def test_no_cache_header_opt_in_only(captured):
73+
captured["resp"] = _Resp(200, "Title: X\n\nbody")
74+
read_url(URL) # default
75+
assert "x-no-cache" not in {k.lower() for k in captured["headers"]}
76+
read_url(URL, no_cache=True)
77+
assert captured["headers"].get("x-no-cache") == "true"

0 commit comments

Comments
 (0)