Skip to content

Commit b95bc5d

Browse files
committed
Add ArXiv search MCP tool with tests and dependency
Add ArxivSearchTool as a first-party MCP server for scientific paper retrieval via the free arXiv API (no API key required). Includes:
- New nemo_skills/mcp/servers/arxiv_tool.py with arxiv-search and arxiv-get tools, rate limiting, retry logic, and response caching
- Declare `arxiv` dependency in tools/requirements.txt
- CI: install [tools] extra so MCP tool deps are available in tests
- Tests: config validation, offline stdio subprocess integration (list_tools, hide_args), invalid-id handling, and live API tests
- Docs: add ArxivSearchTool to built-in tools reference

Signed-off-by: tamohannes <hovhannes.tamoyan@gmail.com>
Made-with: Cursor
1 parent 94478e5 commit b95bc5d

5 files changed

Lines changed: 270 additions & 1 deletion

File tree

.github/workflows/tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444
- name: Install dependencies
4545
run: |
4646
python -m pip install --upgrade pip
47-
pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/cpu
47+
pip install -e ".[dev,tools]" --extra-index-url https://download.pytorch.org/whl/cpu
4848
# Clear pip cache
4949
pip cache purge || true
5050
- name: Build Images

docs/agentic_inference/tool_calling.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,4 +370,5 @@ For vLLM, you may need to specify tool calling arguments:
370370
### Built-in Tools
371371

372372
- [`nemo_skills.mcp.servers.python_tool.PythonTool`](https://github.com/NVIDIA-NeMo/Skills/tree/main/nemo_skills/mcp/servers/python_tool.py) - Python code execution
373+
- [`nemo_skills.mcp.servers.arxiv_tool.ArxivSearchTool`](https://github.com/NVIDIA-NeMo/Skills/tree/main/nemo_skills/mcp/servers/arxiv_tool.py) - ArXiv paper search and retrieval (no API key required)
373374
- [`nemo_skills.mcp.servers.exa_tool.ExaTool`](https://github.com/NVIDIA-NeMo/Skills/tree/main/nemo_skills/mcp/servers/exa_tool.py) - Web search via Exa API
nemo_skills/mcp/servers/arxiv_tool.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""ArXiv search MCP tool for scientific paper retrieval.
16+
17+
Runs outside the sandbox (no network blocking). Uses the free arXiv API
18+
via the `arxiv` pip package. No API key required.
19+
20+
Prerequisites:
21+
pip install arxiv
22+
23+
Usage:
24+
++tool_modules=[nemo_skills.mcp.servers.arxiv_tool::ArxivSearchTool]
25+
"""
26+
27+
import hashlib
28+
import json
29+
import logging
30+
import time
31+
from threading import Lock
32+
from typing import Annotated
33+
34+
from mcp.server.fastmcp import FastMCP
35+
from pydantic import Field
36+
37+
from nemo_skills.mcp.tool_providers import MCPClientTool
38+
39+
logger = logging.getLogger(__name__)

# FastMCP server instance; the tool functions in this module register on it
# through the @mcp.tool decorator.
mcp = FastMCP(name="arxiv")

# Upper bound applied to the `max_results` argument of the arxiv-search tool.
MAX_RESULTS = 10
# Minimum spacing, in seconds, that _rate_limit() enforces between API calls.
_REQUEST_INTERVAL = 3.0
# Number of retries _with_retry() performs after the first failed attempt.
_NUM_RETRIES = 5
# Sleep before the first retry, in seconds; doubled after each failure.
_INITIAL_DELAY = 5.0
# Ceiling for the doubled retry delay, in seconds.
_MAX_DELAY = 60.0
# Entry cap for the in-memory response cache.
_CACHE_MAX_SIZE = 256

# time.monotonic() reading taken after the most recent rate-limit wait;
# read and written only while holding _rate_lock.
_last_request_time = 0.0
_rate_lock = Lock()
# Maps _cache_key(...) digests to the formatted response strings returned by the tools.
_cache: dict[str, str] = {}
53+
54+
55+
def _rate_limit():
    """Block until `_REQUEST_INTERVAL` seconds have passed since the last call.

    The lock stays held while sleeping, so concurrent callers queue up and
    are released one interval apart rather than all at once.
    """
    global _last_request_time
    with _rate_lock:
        elapsed = time.monotonic() - _last_request_time
        remaining = _REQUEST_INTERVAL - elapsed
        if remaining > 0:
            time.sleep(remaining)
        # Stamp after the sleep so the next caller measures from the actual release time.
        _last_request_time = time.monotonic()
64+
65+
66+
def _cache_key(*args) -> str:
    """Return a SHA-256 hex digest that identifies the given arguments.

    The arguments are serialised as a JSON array, so they must be
    JSON-serialisable; equal argument tuples always yield the same key.
    """
    serialized = json.dumps(args, sort_keys=True)
    digest = hashlib.sha256(serialized.encode())
    return digest.hexdigest()
68+
69+
70+
def _with_retry(fn):
    """Call `fn()` and return its result, retrying on any exception.

    Every attempt first passes through `_rate_limit()`. After a failure the
    function sleeps for the current backoff, which starts at
    `_INITIAL_DELAY` and doubles up to `_MAX_DELAY`. Once `_NUM_RETRIES`
    retries are exhausted, the last exception is re-raised to the caller.
    """
    total_attempts = _NUM_RETRIES + 1
    backoff = _INITIAL_DELAY
    for attempt_no in range(1, total_attempts + 1):
        try:
            _rate_limit()
            return fn()
        except Exception as exc:
            # Out of retries: let the caller see the original error.
            if attempt_no == total_attempts:
                raise
            logger.warning(
                "ArXiv attempt %d/%d failed: %s — retrying in %.0fs",
                attempt_no,
                total_attempts,
                exc,
                backoff,
            )
            time.sleep(backoff)
            backoff = min(backoff * 2, _MAX_DELAY)
89+
90+
91+
@mcp.tool(name="arxiv-search")
def arxiv_search(
    query: Annotated[
        str, Field(description="Search query for arXiv papers (supports arXiv query syntax: au:, ti:, abs:, cat:).")
    ],
    max_results: Annotated[int, Field(description="Maximum number of results to return.")] = 3,
) -> str:
    """Search arXiv for scientific papers. Returns titles, abstracts, and URLs."""
    # NOTE: the docstring above is kept short and unchanged on purpose; FastMCP
    # is expected to expose it as the tool description — confirm before editing.
    #
    # Returns a formatted string: one entry per paper (title, up to five
    # authors, publication date, URL, abstract truncated to 500 characters),
    # entries separated by "---". Failures are reported as a string starting
    # with "ArXiv search failed:" rather than raised.

    # Imported inside the function; presumably so the module can be imported
    # without the optional `arxiv` package installed.
    import arxiv

    # Clamp into [1, MAX_RESULTS]. Zero or negative values would otherwise
    # reach arxiv.Client(page_size=...) and arxiv.Search(max_results=...).
    max_results = max(1, min(max_results, MAX_RESULTS))

    key = _cache_key("search", query, max_results)
    cached = _cache.get(key)
    if cached is not None:
        return cached

    def _fetch():
        # The library's own retry and delay are disabled; _with_retry handles
        # both rate limiting and backoff.
        client = arxiv.Client(page_size=max_results, num_retries=1, delay_seconds=0)
        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        return [
            f"**{paper.title}**\n"
            f"Authors: {', '.join(a.name for a in paper.authors[:5])}"
            f"{'...' if len(paper.authors) > 5 else ''}\n"
            f"Published: {paper.published.strftime('%Y-%m-%d')}\n"
            f"URL: {paper.entry_id}\n"
            f"Abstract: {paper.summary[:500]}{'...' if len(paper.summary) > 500 else ''}\n"
            for paper in client.results(search)
        ]

    try:
        results = _with_retry(_fetch)
        if not results:
            # Empty results are not cached, so a later retry can still succeed.
            return "No papers found for this query."
        result_str = "\n---\n".join(results)
        if _CACHE_MAX_SIZE > 0:
            # Evict the oldest entry (dicts keep insertion order) instead of
            # refusing new entries once the cache is full.
            if len(_cache) >= _CACHE_MAX_SIZE:
                _cache.pop(next(iter(_cache)))
            _cache[key] = result_str
        return result_str
    except Exception as e:
        # Tool boundary: report the failure to the caller as text.
        return f"ArXiv search failed: {e}"
137+
138+
139+
@mcp.tool(name="arxiv-get")
def arxiv_get(
    paper_id: Annotated[str, Field(description="arXiv paper ID (e.g. '2301.07041' or '2301.07041v1').")],
) -> str:
    """Fetch a specific arXiv paper by ID. Returns full title, authors, abstract, and metadata."""
    # NOTE: the docstring above is kept short and unchanged on purpose; FastMCP
    # is expected to expose it as the tool description — confirm before editing.
    #
    # Returns a formatted string with title, all authors, published/updated
    # dates, categories, URL, PDF link and the full abstract. A missing paper
    # yields "Paper <id> not found on arXiv."; other failures yield a string
    # starting with "ArXiv lookup failed:" rather than an exception.

    # Imported inside the function; presumably so the module can be imported
    # without the optional `arxiv` package installed.
    import arxiv

    # Normalise surrounding whitespace so padded IDs share one cache entry,
    # and reject blank IDs before spending a rate-limited network request
    # (and possibly the whole retry/backoff cycle) on them.
    paper_id = paper_id.strip()
    if not paper_id:
        return "ArXiv lookup failed: empty paper ID."

    key = _cache_key("get", paper_id)
    cached = _cache.get(key)
    if cached is not None:
        return cached

    def _fetch():
        # The library's own retry and delay are disabled; _with_retry handles
        # both rate limiting and backoff.
        client = arxiv.Client(page_size=1, num_retries=1, delay_seconds=0)
        search = arxiv.Search(id_list=[paper_id])
        return next(client.results(search), None)

    try:
        paper = _with_retry(_fetch)
        if paper is None:
            return f"Paper {paper_id} not found on arXiv."
        result_str = (
            f"**{paper.title}**\n"
            f"Authors: {', '.join(a.name for a in paper.authors)}\n"
            f"Published: {paper.published.strftime('%Y-%m-%d')}\n"
            f"Updated: {paper.updated.strftime('%Y-%m-%d')}\n"
            f"Categories: {', '.join(paper.categories)}\n"
            f"URL: {paper.entry_id}\n"
            f"PDF: {paper.pdf_url}\n\n"
            f"Abstract:\n{paper.summary}"
        )
        if _CACHE_MAX_SIZE > 0:
            # Evict the oldest entry (dicts keep insertion order) instead of
            # refusing new entries once the cache is full.
            if len(_cache) >= _CACHE_MAX_SIZE:
                _cache.pop(next(iter(_cache)))
            _cache[key] = result_str
        return result_str
    except Exception as e:
        # Tool boundary: report the failure to the caller as text.
        return f"ArXiv lookup failed: {e}"
174+
175+
176+
class ArxivSearchTool(MCPClientTool):
    """Client-side tool that runs this module as an MCP server over stdio."""

    def __init__(self) -> None:
        super().__init__()
        # Launch this module as a subprocess: `python -m nemo_skills.mcp.servers.arxiv_tool`.
        stdio_server = {
            "command": "python",
            "args": ["-m", "nemo_skills.mcp.servers.arxiv_tool"],
        }
        # Arguments listed here are hidden from the tool schema that is listed to callers.
        hidden_args = {
            "arxiv-search": ["max_results"],
        }
        config = {
            "client": "nemo_skills.mcp.clients.MCPStdioClient",
            "client_params": stdio_server,
            "hide_args": hidden_args,
        }
        self.apply_config_updates(config)
191+
192+
193+
def main():
    """Run the ArXiv MCP server on the stdio transport."""
    transport = "stdio"
    mcp.run(transport=transport)
195+
196+
197+
if __name__ == "__main__":
198+
main()

tests/test_mcp_clients.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,3 +1002,72 @@ async def failing_delete(session_id):
10021002
# Must not raise; session must be removed from the mapping regardless.
10031003
await tool.cleanup_request("req-x")
10041004
assert "req-x" not in tool.requests_to_sessions
1005+
1006+
1007+
# ── ArXiv tool tests ─────────────────────────────────────────────────────
1008+
1009+
1010+
class TestArxivTool:
    """Tests for the ArXiv MCP tool: config, live API calls, and stdio integration."""

    def test_arxiv_tool_config(self):
        from nemo_skills.mcp.servers.arxiv_tool import ArxivSearchTool

        tool = ArxivSearchTool()
        assert tool._config["client"] == "nemo_skills.mcp.clients.MCPStdioClient"
        assert "-m" in tool._config["client_params"]["args"]
        assert "nemo_skills.mcp.servers.arxiv_tool" in tool._config["client_params"]["args"]

    @pytest.mark.live
    def test_arxiv_search_live(self):
        from nemo_skills.mcp.servers.arxiv_tool import arxiv_search

        result = arxiv_search("quantum entanglement", max_results=2)
        # Failures come back as "ArXiv search failed: ...". Match the prefix
        # rather than a bare word that could legitimately occur in an abstract.
        assert not result.startswith("ArXiv search failed")
        assert "**" in result

    @pytest.mark.live
    def test_arxiv_get_live(self):
        from nemo_skills.mcp.servers.arxiv_tool import arxiv_get

        result = arxiv_get("2301.07041")
        assert not result.startswith("ArXiv lookup failed")
        assert "not found" not in result.lower()
        assert "Abstract" in result

    # Marked live: the lookup goes to the real arXiv API, and a network
    # failure would run the tool's full retry/backoff cycle.
    @pytest.mark.live
    def test_arxiv_get_invalid_id(self):
        from nemo_skills.mcp.servers.arxiv_tool import arxiv_get

        result = arxiv_get("0000.00000")
        assert "not found" in result.lower() or "failed" in result.lower()

    @pytest.mark.asyncio
    async def test_arxiv_stdio_list_tools(self):
        """Launch ArxivSearchTool over a real stdio subprocess and verify tool listing."""
        from nemo_skills.mcp.servers.arxiv_tool import ArxivSearchTool

        tool = ArxivSearchTool()
        tool.configure()

        try:
            tools = await tool.list_tools()
            tool_names = {t["name"] for t in tools}
            assert "arxiv-search" in tool_names
            assert "arxiv-get" in tool_names
        finally:
            # Always shut the tool down, even when an assertion fails.
            await tool.shutdown()

    @pytest.mark.asyncio
    async def test_arxiv_stdio_hide_args(self):
        """Verify hide_args removes max_results from the listed schema."""
        from nemo_skills.mcp.servers.arxiv_tool import ArxivSearchTool

        tool = ArxivSearchTool()
        tool.configure()

        try:
            tools = await tool.list_tools()
            search_tool = next(t for t in tools if t["name"] == "arxiv-search")
            schema_props = search_tool["input_schema"]["properties"]
            assert "query" in schema_props
            assert "max_results" not in schema_props

            get_tool = next(t for t in tools if t["name"] == "arxiv-get")
            assert "paper_id" in get_tool["input_schema"]["properties"]
        finally:
            # Always shut the tool down, even when an assertion fails.
            await tool.shutdown()

tools/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# import time by the current implementation, but it excludes model-client
66
# dependencies such as litellm and openai.
77

8+
arxiv
89
httpx
910
math-verify[antlr4_9_3]
1011
mcp

0 commit comments

Comments
 (0)