Skip to content

Commit 98afcd9

Browse files
committed
Merge remote-tracking branch 'origin/main'
# Conflicts: # agent/src/tools/web_reader_tool.py
2 parents 199f77a + 1b547d2 commit 98afcd9

3 files changed

Lines changed: 122 additions & 11 deletions

File tree

agent/src/skills/web-reader/SKILL.md

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
---
22
name: web-reader
3-
description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash.
3+
description: Read web pages, articles, and document links by converting URLs into Markdown text. Use the `read_url` tool directly, without bash. Sends the full URL to the third-party Jina Reader (r.jina.ai).
44
category: tool
55
---
66
# Web Reading
@@ -35,10 +35,23 @@ Returns JSON:
3535
## Notes
3636

3737
- Content longer than 8000 characters will be truncated, with the total length noted at the end
38-
- Some websites may block Jina Reader (returning HTTP 451). In that case, fall back to bash + requests
3938
- Dynamically rendered SPA pages may return only skeleton HTML
4039
- Chinese content is supported normally
4140

41+
## Privacy & freshness
42+
43+
- **Third-party dependency:** `read_url` forwards the full target URL
44+
(including any query string) to the external Jina Reader service
45+
(`r.jina.ai`). Do **not** pass URLs containing credentials, tokens, or
46+
private/internal addresses — they would leave this host.
47+
- **Caching/staleness:** results may be a cached snapshot, not live data.
48+
When stale, the JSON includes `"cached": true`; pass `no_cache=true` to
49+
force a fresh fetch (slower — use only when freshness matters).
50+
- **Bash fallback caveat:** if a site blocks the reader (e.g. HTTP 451) a
51+
manual `bash + requests` fetch is possible, but it **bypasses this
52+
tool's URL safety guard and the Jina layer** — use sparingly and never
53+
for internal/authenticated URLs.
54+
4255
## Common Usage
4356

4457
### Read API Documentation

agent/src/tools/web_reader_tool.py

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,20 @@
44

55
import ipaddress
66
import json
7+
import logging
78
from urllib.parse import urlsplit
89

910
import requests
1011

1112
from src.agent.tools import BaseTool
1213
from src.security.scanner import with_security_warnings
1314

15+
logger = logging.getLogger(__name__)
16+
1417
_JINA_PREFIX = "https://r.jina.ai/"
1518
_TIMEOUT = 30
1619
_MAX_LENGTH = 8000
20+
_CACHED_MARKER = "Warning: This is a cached snapshot"
1721

1822

1923
def _url_allowed(url: str) -> tuple[bool, str]:
@@ -53,30 +57,40 @@ def _url_allowed(url: str) -> tuple[bool, str]:
5357
return True, ""
5458

5559

56-
def read_url(url: str) -> str:
60+
def read_url(url: str, no_cache: bool = False) -> str:
5761
"""Fetch web page content via the Jina Reader API.
5862
63+
The full URL (including query string) is sent to the third-party Jina
64+
Reader service (r.jina.ai); never pass credentials/tokens or private
65+
addresses. Results may be a cached snapshot.
66+
5967
Args:
6068
url: Target URL.
69+
no_cache: When true, ask the reader for a fresh (uncached) fetch.
6170
6271
Returns:
63-
JSON-formatted result containing title, content, and url.
72+
JSON result with title, content, url; ``cached: true`` is added
73+
when the reader served a stale snapshot.
6474
"""
6575
target_url = url.strip()
6676
allowed, error = _url_allowed(target_url)
6777
if not allowed:
6878
return json.dumps({"status": "error", "error": error}, ensure_ascii=False)
6979

7080
try:
81+
headers = {"Accept": "text/markdown"}
82+
if no_cache:
83+
headers["x-no-cache"] = "true"
7184
resp = requests.get(
7285
f"{_JINA_PREFIX}{target_url}",
73-
headers={"Accept": "text/markdown"},
86+
headers=headers,
7487
timeout=_TIMEOUT,
7588
)
7689
if resp.status_code != 200:
90+
logger.warning("read_url upstream HTTP %s: %s", resp.status_code, resp.text[:500])
7791
return json.dumps({
7892
"status": "error",
79-
"error": f"Jina Reader returned {resp.status_code}: {resp.text[:500]}",
93+
"error": f"remote reader returned HTTP {resp.status_code}: {resp.text[:500]}",
8094
}, ensure_ascii=False)
8195

8296
text = resp.text
@@ -89,20 +103,26 @@ def read_url(url: str) -> str:
89103
if len(text) > _MAX_LENGTH:
90104
text = text[:_MAX_LENGTH] + f"\n\n... (truncated, total {len(resp.text)} chars)"
91105

92-
payload = {
106+
result = {
93107
"status": "ok",
94108
"title": title,
95109
"url": target_url,
96110
"content": text,
97111
"length": len(resp.text),
98112
}
99-
payload = with_security_warnings(payload, fields=("content",))
100-
return json.dumps(payload, ensure_ascii=False)
113+
if _CACHED_MARKER in resp.text:
114+
result["cached"] = True
115+
result = with_security_warnings(result, fields=("content",))
116+
return json.dumps(result, ensure_ascii=False)
101117

102118
except requests.Timeout:
103119
return json.dumps({"status": "error", "error": f"Request timed out ({_TIMEOUT}s)"}, ensure_ascii=False)
104120
except Exception as exc:
105-
return json.dumps({"status": "error", "error": str(exc)}, ensure_ascii=False)
121+
logger.warning("read_url request failed: %s", exc)
122+
return json.dumps(
123+
{"status": "error", "error": f"remote reader request failed: {exc}"},
124+
ensure_ascii=False,
125+
)
106126

107127

108128
class WebReaderTool(BaseTool):
@@ -114,11 +134,12 @@ class WebReaderTool(BaseTool):
114134
"type": "object",
115135
"properties": {
116136
"url": {"type": "string", "description": "URL of the web page to read"},
137+
"no_cache": {"type": "boolean", "description": "Request a fresh (uncached) fetch", "default": False},
117138
},
118139
"required": ["url"],
119140
}
120141
repeatable = True
121142

122143
def execute(self, **kwargs) -> str:
123144
"""Fetch web page."""
124-
return read_url(kwargs["url"])
145+
return read_url(kwargs["url"], no_cache=bool(kwargs.get("no_cache", False)))
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Regression tests for read_url third-party (Jina) hardening.
2+
3+
Network is mocked; no live r.jina.ai calls. Asserts: HTTP errors surface
4+
the upstream status + body for debugging; a cached snapshot is surfaced
5+
via `cached: true`; `no_cache=True` sends the x-no-cache header while
6+
the default path is byte-identical (no extra header).
7+
"""
8+
9+
from __future__ import annotations
10+
11+
import json
12+
13+
import pytest
14+
15+
import src.tools.web_reader_tool as wr
16+
from src.tools.web_reader_tool import read_url
17+
18+
URL = "https://example.com/page"
19+
20+
21+
class _Resp:
22+
def __init__(self, status_code=200, text=""):
23+
self.status_code = status_code
24+
self.text = text
25+
26+
27+
@pytest.fixture
28+
def captured(monkeypatch):
29+
box = {}
30+
31+
def fake_get(url, headers=None, timeout=None):
32+
box["url"] = url
33+
box["headers"] = headers or {}
34+
r = box["resp"]
35+
if isinstance(r, BaseException):
36+
raise r
37+
return r
38+
39+
monkeypatch.setattr(wr.requests, "get", fake_get)
40+
return box
41+
42+
43+
def test_http_error_surfaces_status_and_body(captured):
44+
captured["resp"] = _Resp(451, "ParamValidationError: bad input")
45+
out = json.loads(read_url(URL))
46+
assert out["status"] == "error"
47+
assert "451" in out["error"]
48+
assert "ParamValidationError: bad input" in out["error"]
49+
50+
51+
def test_exception_error_surfaces_exc_text(captured):
52+
captured["resp"] = RuntimeError("boom: connect failed (10.0.0.1)")
53+
out = json.loads(read_url(URL))
54+
assert out["status"] == "error"
55+
assert "boom: connect failed" in out["error"]
56+
57+
58+
def test_cached_snapshot_is_flagged(captured):
59+
captured["resp"] = _Resp(200, "Title: X\n\nWarning: This is a cached snapshot\n\nbody")
60+
out = json.loads(read_url(URL))
61+
assert out["status"] == "ok"
62+
assert out.get("cached") is True
63+
64+
65+
def test_fresh_response_has_no_cached_key(captured):
66+
captured["resp"] = _Resp(200, "Title: X\n\nlive body content")
67+
out = json.loads(read_url(URL))
68+
assert out["status"] == "ok"
69+
assert "cached" not in out # additive: absent on the normal path
70+
71+
72+
def test_no_cache_header_opt_in_only(captured):
73+
captured["resp"] = _Resp(200, "Title: X\n\nbody")
74+
read_url(URL) # default
75+
assert "x-no-cache" not in {k.lower() for k in captured["headers"]}
76+
read_url(URL, no_cache=True)
77+
assert captured["headers"].get("x-no-cache") == "true"

0 commit comments

Comments
 (0)