Skip to content

Commit 4e04b09

Browse files
committed
feat(v2 agents): wire OAM-CH RAG provider as LLM/hybrid agent tool
Make the OAM-CH index a first-class RAG source for the v2 LLM agents (repository, article, context-summary scout) so they can look up journals, publications, publishers and Swiss organisations during extraction without hand-rolling another HTTP client.

- `src/v2/ingest/providers/oamonitor_rag.py` — `OamonitorRagProvider` mirroring the existing zenodo/openalex/huggingface pattern. Async `search(query, entity_type, top_k, filters, rerank)` against the per-entity Qdrant collections, gated by allowlisted filter keys and reusing OpenAlex's RCP embedder + reranker via duck typing. `fetch_records(ids, entity_type)` hydrates full rows back out of DuckDB by upstream `_id`. `build_default_provider()` gated on `V2_OAMONITOR_RAG_ENABLED` (default on).
- `src/v2/agents/llm/agent_tools/oamonitor_rag.py` — pydantic-ai `Tool` factories: `search_oamonitor_rag` (entity-typed semantic search) and `fetch_oamonitor_records` (id-based hydration).
- `src/v2/dependencies.py` — `_resolve_oamonitor_rag_provider` + caching on `app.state.v2_oamonitor_rag_provider`, threaded through the `ProviderSet` builder so every v2 request gets the provider (or `None` when the index is missing — degrades silently).
- `src/v2/agents/models.py` — `ProviderSet.oamonitor_rag` field.
- `src/v2/agents/llm/article/agent.py` and `src/v2/agents/llm/context_summary/agent.py` — register the new `search_oamonitor_rag` (and `fetch_oamonitor_records` for the article agent) when the provider is available.
- `src/v2/agents/llm/context_summary/prompts/system_prompt_scout.md` — short entry telling the scout when to reach for OAM-CH (journal ISSN+OA color, publisher OA policy, Swiss institution lookups).

Smoke-tested against the populated index (25 285 journals + 99 331 EPFL-affiliated publications + 2 452 publishers + 59 organisations): `POST /v2/indices/oamonitor/search` resolves CV-themed queries to real EPFL papers via the new tool path.
1 parent 8b67af3 commit 4e04b09

8 files changed

Lines changed: 403 additions & 0 deletions

File tree

src/v2/agents/llm/agent_tools/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@
7979
generate_uuid_v4_batch_tool,
8080
generate_uuid_v4_tool,
8181
)
82+
from src.v2.agents.llm.agent_tools.oamonitor_rag import (
83+
make_oamonitor_rag_fetch_records_tool,
84+
make_oamonitor_rag_search_tool,
85+
)
8286
from src.v2.agents.llm.agent_tools.zenodo_rag import (
8387
make_zenodo_rag_fetch_records_tool,
8488
make_zenodo_rag_search_tool,
@@ -110,6 +114,8 @@
110114
"make_infoscience_rag_fetch_records_tool",
111115
"make_infoscience_rag_search_tool",
112116
"make_infoscience_search_tool",
117+
"make_oamonitor_rag_fetch_records_tool",
118+
"make_oamonitor_rag_search_tool",
113119
"make_openalex_rag_search_tool",
114120
"make_orcid_person_tool",
115121
"make_orcid_rag_search_tool",
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""Pydantic-AI Tools backed by :class:`OamonitorRagProvider`.
2+
3+
Wraps the OAM-CH (Open Access Monitor — Switzerland) per-entity RAG
4+
search and DuckDB hydration into pydantic-ai ``Tool`` factories. The
5+
search tool accepts a free-text ``query`` plus an ``entity_type``
6+
selector (``journals`` | ``publications`` | ``publishers`` |
7+
``organisations``); the fetch tool hydrates full records from the local
8+
DuckDB by ``_id``.
9+
"""
10+
11+
from __future__ import annotations
12+
13+
import logging
14+
from typing import TYPE_CHECKING, Any
15+
16+
from pydantic_ai import Tool
17+
18+
from src.v2.observation.query_log import record_query
19+
20+
if TYPE_CHECKING:
21+
from src.v2.ingest.providers.oamonitor_rag import OamonitorRagProvider
22+
23+
logger = logging.getLogger(__name__)
24+
25+
_SEARCH_DESCRIPTION = (
26+
"Semantic search over the Open Access Monitor CH (OAM-CH) index — the "
27+
"Swiss aggregator of journals, publications, publishers and organisations "
28+
"with Open Access status (gold / green / hybrid / closed) per entity. "
29+
"Use this to resolve a journal title to its ISSN + OA color, to look up "
30+
"a publication's OA status by content match, to identify the OA policy of "
31+
"a publisher, or to look up Swiss institutions catalogued in OAM. "
32+
"`entity_type` (REQUIRED) picks the collection: 'journals' (default), "
33+
"'publications', 'publishers', or 'organisations'. `filters`: optional "
34+
"allowlist dict ({entity_type, entity_id}); each value may be scalar or "
35+
"list. `rerank=true` engages the cross-encoder against the embedding text "
36+
"— useful for fine title disambiguation. Returns thin hits with "
37+
"entity_type, entity_id, embedding_text."
38+
)
39+
40+
_FETCH_RECORDS_DESCRIPTION = (
41+
"Hydrate one or more OAM-CH records by their upstream `_id`. "
42+
"`entity_type` (REQUIRED) selects the table: 'journals' (string id), "
43+
"'publications' (OpenAlex URL id), 'publishers' (slug), 'organisations' "
44+
"(ROR URL). Returns the full raw upstream payload (`raw`) per id — use "
45+
"after `search_oamonitor_rag` when you need fields like ISSNs, DOI, "
46+
"publisher name, country code, OA color etc. that the thin search hit "
47+
"omits."
48+
)
49+
50+
51+
def make_oamonitor_rag_search_tool(provider: OamonitorRagProvider) -> Tool:
    """Build the ``search_oamonitor_rag`` pydantic-ai tool bound to *provider*.

    Each invocation of the returned tool logs its arguments at INFO level,
    records the query through ``record_query`` under the
    ``oamonitor.rag.search`` service, and then awaits ``provider.search``,
    handing its result back unchanged.
    """

    # The inner signature and docstring are what the tool framework sees;
    # keep parameter names, defaults and docstring stable.
    async def search_oamonitor_rag(
        query: str,
        entity_type: str = "journals",
        top_k: int = 10,
        filters: dict[str, Any] | None = None,
        rerank: bool = False,  # noqa: FBT001, FBT002 — part of the LLM tool signature
    ) -> list[dict[str, Any]]:
        """Vector search the OAM-CH index. See tool description."""
        logger.info(
            "tool call: search_oamonitor_rag — entity=%s top_k=%d rerank=%s filters=%r query=%r",
            entity_type,
            top_k,
            rerank,
            filters,
            query,
        )
        record_query(service="oamonitor.rag.search", query=query)

        hits = await provider.search(
            query,
            entity_type=entity_type,
            top_k=top_k,
            filters=filters,
            rerank=rerank,
        )
        return hits

    search_tool = Tool(
        search_oamonitor_rag,
        name="search_oamonitor_rag",
        description=_SEARCH_DESCRIPTION,
    )
    return search_tool
81+
82+
83+
def make_oamonitor_rag_fetch_records_tool(provider: OamonitorRagProvider) -> Tool:
    """Build the ``fetch_oamonitor_records`` pydantic-ai tool bound to *provider*.

    Each invocation of the returned tool logs the entity type and the number
    of requested ids, records the call through ``record_query`` under the
    ``oamonitor.rag.fetch_records`` service, and then awaits
    ``provider.fetch_records``, handing its result back unchanged.
    """

    # The inner signature and docstring are what the tool framework sees;
    # keep parameter names and docstring stable.
    async def fetch_oamonitor_records(
        ids: list[str],
        entity_type: str,
    ) -> list[dict[str, Any]]:
        """Fetch full OAM-CH records by id. See tool description."""
        logger.info(
            "tool call: fetch_oamonitor_records — entity=%s ids=%d",
            entity_type,
            len(ids),
        )

        # Only the first ten ids go into the query log entry.
        logged_ids = ",".join(map(str, ids[:10]))
        record_query(service="oamonitor.rag.fetch_records", query=logged_ids)

        records = await provider.fetch_records(ids, entity_type=entity_type)
        return records

    fetch_tool = Tool(
        fetch_oamonitor_records,
        name="fetch_oamonitor_records",
        description=_FETCH_RECORDS_DESCRIPTION,
    )
    return fetch_tool
106+
107+
108+
# Public API of this module: the two pydantic-ai Tool factories defined above.
__all__ = [
    "make_oamonitor_rag_fetch_records_tool",
    "make_oamonitor_rag_search_tool",
]

src/v2/agents/llm/article/agent.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@
3838
from src.v2.agents.llm.agent_tools.selenium_fetch import (
3939
make_fetch_link_content_tool,
4040
)
41+
from src.v2.agents.llm.agent_tools.oamonitor_rag import (
42+
make_oamonitor_rag_fetch_records_tool,
43+
make_oamonitor_rag_search_tool,
44+
)
4145
from src.v2.agents.llm.agent_tools.swissubase_rag import (
4246
make_swissubase_rag_search_tool,
4347
)
@@ -317,6 +321,9 @@ async def run(
317321
if providers.zenodo_rag is not None:
318322
tools.append(make_zenodo_rag_search_tool(providers.zenodo_rag))
319323
tools.append(make_zenodo_rag_fetch_records_tool(providers.zenodo_rag))
324+
if providers.oamonitor_rag is not None:
325+
tools.append(make_oamonitor_rag_search_tool(providers.oamonitor_rag))
326+
tools.append(make_oamonitor_rag_fetch_records_tool(providers.oamonitor_rag))
320327
if providers.renkulab_rag is not None:
321328
tools.append(make_renkulab_rag_search_tool(providers.renkulab_rag))
322329
if providers.swissubase_rag is not None:

src/v2/agents/llm/context_summary/agent.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,6 +272,9 @@ def _build_scout_tools(
272272
from src.v2.agents.llm.agent_tools.huggingface_rag import ( # noqa: PLC0415
273273
make_huggingface_rag_search_tool,
274274
)
275+
from src.v2.agents.llm.agent_tools.oamonitor_rag import ( # noqa: PLC0415
276+
make_oamonitor_rag_search_tool,
277+
)
275278
from src.v2.agents.llm.agent_tools.infoscience_rag import ( # noqa: PLC0415
276279
make_infoscience_rag_search_tool,
277280
)
@@ -313,6 +316,8 @@ def _build_scout_tools(
313316
tools.append(make_openalex_rag_search_tool(providers.openalex_rag))
314317
if providers.zenodo_rag is not None:
315318
tools.append(make_zenodo_rag_search_tool(providers.zenodo_rag))
319+
if providers.oamonitor_rag is not None:
320+
tools.append(make_oamonitor_rag_search_tool(providers.oamonitor_rag))
316321
if providers.ethz_research_collection_rag is not None:
317322
tools.append(
318323
make_ethz_research_collection_rag_search_tool(

src/v2/agents/llm/context_summary/prompts/system_prompt_scout.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ Spend it generously here so per-entity agents don't have to:
3232
papers that cite or describe the repo (CITATION.cff
3333
`preferred-citation`, .zenodo.json `related_identifiers`, README
3434
references).
35+
- `search_oamonitor_rag` — Open Access Monitor CH index. Use to
36+
resolve a journal title to its ISSN + OA color, identify a
37+
publisher's OA policy, or pin a Swiss institution (`entity_type`:
38+
`journals` | `publications` | `publishers` | `organisations`).
3539
- `fetch_link_content_via_selenium` — sparingly, to verify a project
3640
homepage or a lab page when other signals are weak.
3741
- `search_on_the_internet` (DuckDuckGo) — last-resort confirmation only.

src/v2/agents/models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ class ProviderSet:
177177
huggingface_rag: Any | None = None
178178
openalex_rag: Any | None = None
179179
zenodo_rag: Any | None = None
180+
oamonitor_rag: Any | None = None
180181
orcid_rag: Any | None = None
181182
ror_rag: Any | None = None
182183
snsf_rag: Any | None = None

src/v2/dependencies.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,12 @@
9696
from src.v2.ingest.providers.swissubase_rag import (
9797
build_default_provider as build_default_swissubase_rag_provider,
9898
)
99+
from src.v2.ingest.providers.oamonitor_rag import (
100+
OamonitorRagProvider,
101+
)
102+
from src.v2.ingest.providers.oamonitor_rag import (
103+
build_default_provider as build_default_oamonitor_rag_provider,
104+
)
99105
from src.v2.ingest.providers.zenodo_rag import (
100106
ZenodoRagProvider,
101107
)
@@ -209,6 +215,16 @@ def _resolve_zenodo_rag_provider(app_state: Any) -> ZenodoRagProvider | None:
209215
)
210216

211217

218+
def _resolve_oamonitor_rag_provider(app_state: Any) -> OamonitorRagProvider | None:
    """Resolve the OAM-CH RAG provider for *app_state*.

    Thin wrapper over ``_resolve_rag_provider`` that supplies the OAM-CH
    specific settings: the ``v2_oamonitor_rag_provider`` state attribute, the
    ``V2_OAMONITOR_RAG_ENABLED`` env var, the default OAM-CH builder and the
    expected provider type. The helper's result is returned unchanged.
    """
    provider = _resolve_rag_provider(
        app_state,
        state_attr="v2_oamonitor_rag_provider",
        env_var="V2_OAMONITOR_RAG_ENABLED",
        builder=build_default_oamonitor_rag_provider,
        expected_type=OamonitorRagProvider,
    )
    return provider
226+
227+
212228
def _resolve_github_rag_provider(app_state: Any) -> GitHubRagProvider | None:
213229
return _resolve_rag_provider(
214230
app_state,
@@ -305,6 +321,7 @@ def _default_provider_set( # noqa: PLR0913 — bundle-builder for ProviderSet
305321
huggingface_rag: HuggingFaceRagProvider | None = None,
306322
openalex_rag: OpenAlexRagProvider | None = None,
307323
zenodo_rag: ZenodoRagProvider | None = None,
324+
oamonitor_rag: OamonitorRagProvider | None = None,
308325
orcid_rag: OrcidRagProvider | None = None,
309326
ror_rag: RorRagProvider | None = None,
310327
snsf_rag: SnsfRagProvider | None = None,
@@ -320,6 +337,7 @@ def _default_provider_set( # noqa: PLR0913 — bundle-builder for ProviderSet
320337
"huggingface_rag": huggingface_rag,
321338
"openalex_rag": openalex_rag,
322339
"zenodo_rag": zenodo_rag,
340+
"oamonitor_rag": oamonitor_rag,
323341
"orcid_rag": orcid_rag,
324342
"ror_rag": ror_rag,
325343
"snsf_rag": snsf_rag,
@@ -408,6 +426,7 @@ async def get_provider_set(request: Request) -> ProviderSet:
408426
huggingface_rag=_resolve_huggingface_rag_provider(app_state),
409427
openalex_rag=_resolve_openalex_rag_provider(app_state),
410428
zenodo_rag=_resolve_zenodo_rag_provider(app_state),
429+
oamonitor_rag=_resolve_oamonitor_rag_provider(app_state),
411430
orcid_rag=_resolve_orcid_rag_provider(app_state),
412431
ror_rag=_resolve_ror_rag_provider(app_state),
413432
snsf_rag=_resolve_snsf_rag_provider(app_state),
@@ -437,6 +456,7 @@ async def get_provider_set(request: Request) -> ProviderSet:
437456
huggingface_rag=default_provider_set.huggingface_rag,
438457
openalex_rag=default_provider_set.openalex_rag,
439458
zenodo_rag=default_provider_set.zenodo_rag,
459+
oamonitor_rag=default_provider_set.oamonitor_rag,
440460
orcid_rag=default_provider_set.orcid_rag,
441461
ror_rag=default_provider_set.ror_rag,
442462
snsf_rag=default_provider_set.snsf_rag,

0 commit comments

Comments
 (0)