Skip to content

Commit 4fe5ba9

Browse files
authored
Merge pull request #40 from Imaging-Plaza/feat/scout-usage-limits-bump
Feat/scout usage limits bump
2 parents d0783b9 + 2ff9b6f commit 4fe5ba9

33 files changed

Lines changed: 1183 additions & 116 deletions

.env.example

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,24 @@ OPENROUTER_API_KEY=your-openrouter-key-here
287287
# reasoning about the repo it is processing.
288288
# V2_GITHUB_RAG_ENABLED=true
289289

290+
# Auto-ingest every newly seen PUBLIC repository into the GitHub RAG
291+
# index after a successful /v2/extract call. Off by default so existing
292+
# deployments don't change behaviour. When `true`, an async background
293+
# task fires after each completed repository extraction:
294+
#
295+
# 1. Skip if the repo is already in the GitHub DuckDB (no refresh).
296+
# 2. Fetch its metadata + README via the GitHub REST API.
297+
# 3. Persist the row to DuckDB and embed the README chunks into the
298+
# `github_repos` Qdrant collection.
299+
# 4. Private/unreachable repos surface as `skipped_404` and emit one
300+
# log line — no caller-visible error.
301+
#
302+
# This grows the GitHub RAG index organically as repos are processed,
303+
# so subsequent extractions find them via `search_github_rag` without
304+
# manual CLI ingest runs. A module-level lock serialises DuckDB writes
304+
# within each worker process. NOTE(review): uvicorn workers are separate processes, so confirm cross-worker write safety before relying on `WORKERS=4` or higher.
306+
# V2_GITHUB_RAG_AUTO_INGEST=false
307+
290308

291309
EPFL_GRAPH_USERNAME=
292310
EPFL_GRAPH_PASSWORD=

src/v2/agents/llm/context_summary/agent.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,32 @@
1515
from src.v2.agents.llm.agent_tools.repository_corpus_grep import (
1616
make_repository_corpus_grep_tool,
1717
)
18+
from pydantic_ai.usage import UsageLimits
19+
1820
from src.v2.agents.models import AgentResult, ProviderSet
1921
from src.v2.agents.llm.runtime import LLMRuntimeError, V2LLMRuntime
2022
from src.v2.ingest.cache import ProviderCache
2123
from src.v2.observation.query_log import stamp_current_agent
2224

2325
logger = logging.getLogger(__name__)
2426

27+
# Scout-mode usage budget. The scout prompt explicitly tells the LLM to
28+
# spend ~20 tool calls up-front so per-entity agents don't have to; the
29+
# global `V2_LLM_REQUEST_LIMIT` default of 25 (set in
30+
# `src/v2/agents/llm/runtime.py`) caps it before it finishes and the
31+
# brief comes back empty, leaving downstream agents to re-discover every
32+
# ORCID / ROR / DOI themselves. Override locally so scout gets the
33+
# budget it was designed for. Per-call request count tracked by
34+
# pydantic-ai; tool-call budget gates the actual external lookups.
35+
#
36+
# TODO: lift these into a general agent-tuning config file (alongside
37+
# `V2_LLM_REQUEST_LIMIT` / `V2_LLM_TOOL_CALLS_LIMIT`) so every agent can
38+
# declare its own budget without touching code. Today the two knobs are
39+
# global env vars + this single per-agent override; a YAML/TOML config
40+
# keyed by agent name would scale better as more agents need tuning.
41+
_SCOUT_REQUEST_LIMIT = 60
42+
_SCOUT_TOOL_CALLS_LIMIT = 120
43+
2544
_PROMPTS_PACKAGE = "src.v2.agents.llm.context_summary.prompts"
2645
_SYSTEM_PROMPT = load_prompt(_PROMPTS_PACKAGE, "system_prompt.md")
2746
_USER_PROMPT_TEMPLATE = load_prompt(_PROMPTS_PACKAGE, "user_prompt.md")
@@ -416,16 +435,24 @@ async def run(
416435
# is unchanged (2 tools, original prompt). Scout mode adds the
417436
# RAG providers' search tools so people / orgs / articles get
418437
# recon'd up-front.
438+
usage_limits: UsageLimits | None = None
419439
if scout_mode:
420440
system_prompt = load_prompt(_PROMPTS_PACKAGE, _SCOUT_PROMPT_FILENAME)
421441
tools = _build_scout_tools(
422442
providers=providers,
423443
cache=self._cache,
424444
corpus_documents=corpus_documents,
425445
)
446+
usage_limits = UsageLimits(
447+
request_limit=_SCOUT_REQUEST_LIMIT,
448+
tool_calls_limit=_SCOUT_TOOL_CALLS_LIMIT,
449+
)
426450
logger.info(
427-
"context_summary_agent: scout mode ON (%d tools)",
451+
"context_summary_agent: scout mode ON (%d tools, "
452+
"request_limit=%d, tool_calls_limit=%d)",
428453
len(tools),
454+
_SCOUT_REQUEST_LIMIT,
455+
_SCOUT_TOOL_CALLS_LIMIT,
429456
)
430457
else:
431458
system_prompt = _SYSTEM_PROMPT
@@ -435,11 +462,15 @@ async def run(
435462
]
436463

437464
try:
465+
run_kwargs: dict[str, Any] = {}
466+
if usage_limits is not None:
467+
run_kwargs["usage_limits"] = usage_limits
438468
llm_result = await self._llm_runtime.run_json_prompt(
439469
system_prompt=system_prompt,
440470
user_prompt=user_prompt,
441471
output_type=LLMContextSummaryOutput,
442472
tools=tools,
473+
**run_kwargs,
443474
)
444475
except LLMRuntimeError as exc:
445476
warning = f"context_summary_agent: LLM call failed; proceeding without compiled summary ({exc})"

src/v2/agents/llm/contribution/agent.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,15 @@ async def run(
271271

272272
payload["schema:author"] = target_person_id
273273
payload["pulse:contributionTo"] = target_repository_id
274-
composite_id = f"{target_person_id}_{target_repository_id}"
274+
# Double underscore separator — single `_` is ambiguous because
275+
# GitHub usernames are allowed to contain `_` (and URLs in the
276+
# repo id contain `/`), so `<person_url>_<repo_url>` cannot be
277+
# parsed back unambiguously. `__` never appears in either a
278+
# github.com URL or a ROR identifier, so the composite is
279+
# round-trippable. Migration: any previously persisted edges
280+
# with single-`_` separator are still readable by consumers —
281+
# only newly emitted edges adopt the new shape.
282+
composite_id = f"{target_person_id}__{target_repository_id}"
275283
payload["id"] = composite_id
276284
payload["idSource"] = "pulse:composite"
277285
identifiers = payload.get("identifiers")
@@ -302,6 +310,44 @@ async def run(
302310

303311
force_server_uuid(payload, uuid_value)
304312

313+
# Drop empty edges. Production audit observed 28 contributions
314+
# landing with `pulse:contributionCount=0` and both date fields
315+
# `null` — vacuous edges that bloat the graph without carrying
316+
# any signal. They arise when the LLM emits a Contribution from
317+
# a weak signal (a comment, a watch, a review) but the GitHub
318+
# contributor record never produced a commit count or date.
319+
contribution_count = payload.get("pulse:contributionCount")
320+
first_contribution_date = payload.get("pulse:firstContributionDate")
321+
last_contribution_date = payload.get("pulse:lastContributionDate")
322+
if (
323+
(contribution_count is None or contribution_count == 0)
324+
and not first_contribution_date
325+
and not last_contribution_date
326+
):
327+
warning = (
328+
f"llm_contribution: dropping Contribution {payload.get('id')!r} — "
329+
"no count, no firstContributionDate, no lastContributionDate"
330+
)
331+
logger.info(warning)
332+
return AgentResult(
333+
data={},
334+
warnings=[warning],
335+
raw_output={},
336+
model=llm_result.model,
337+
provider=llm_result.provider,
338+
tokens_prompt=llm_result.tokens_prompt,
339+
tokens_completion=llm_result.tokens_completion,
340+
stats={
341+
"agent_runtime": "llm",
342+
"contributions": [],
343+
"contribution_count": 0,
344+
"derivation": {
345+
"contribution_seed": contribution_seed,
346+
"skipped_reason": "empty_edge",
347+
},
348+
},
349+
)
350+
305351
raw_output = deepcopy(payload)
306352
validation_warnings = _strict_validate(payload)
307353

src/v2/agents/llm/contribution/prompts/system_prompt.md

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Return exactly one JSON object for a `pulse:Contribution` entity conforming to `
55
Output only JSON. No markdown fences. No explanations.
66

77
Required fields:
8-
- `id` (composite `personId_repoId` or UUID fallback)
8+
- `id` (composite `personId__repoId` — DOUBLE underscore — or UUID fallback)
99
- `type` = `"pulse:Contribution"`
1010
- `shacl` = `"pulse:ContributionShape"`
1111
- `identifiers` with `pulse:composite` and `uuid`
@@ -24,9 +24,13 @@ Rules:
2424
`schema:author` to `target_person.id` and `pulse:contributionTo` to
2525
`target_repository.id`.
2626
- Use canonical person and repository IDs from known entities.
27-
- Build the composite id deterministically: `{target_person.id}_{target_repository.id}`,
28-
set `idSource = "pulse:composite"`, and put the same composite into
29-
`identifiers["pulse:composite"]`.
27+
- Build the composite id deterministically with a **double-underscore**
28+
separator: `{target_person.id}__{target_repository.id}`. Single `_`
29+
is ambiguous because GitHub usernames may contain `_` and the
30+
composite cannot be parsed back to its components. Example:
31+
`https://github.com/alice-smith__https://github.com/lis-epfl/vswarm`.
32+
Set `idSource = "pulse:composite"` and put the same composite
33+
string into `identifiers["pulse:composite"]`.
3034
- `contribution_seed` is the repository id (kept for backwards compatibility) —
3135
prefer `target_repository.id` over it when both are present.
3236
- Do not invent unsupported fields.

src/v2/agents/llm/membership/agent.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,102 @@ def _list_of_dicts(value: Any, *, max_items: int = MAX_CONTEXT_ENTITIES) -> list
7272
return collected
7373

7474

75+
def _country_code_from_org(organization: Any) -> str | None:
76+
"""Best-effort 2-letter country code from an org payload.
77+
78+
Looks at common shapes produced by upstream stages: a bare
79+
``country_code``, a nested ``country.country_code`` (ROR's own
80+
serialisation), or an ``addresses[0].country_code`` (also ROR).
81+
Case-normalised; returns ``None`` when no plausible code is found.
82+
"""
83+
if not isinstance(organization, dict):
84+
return None
85+
direct = organization.get("country_code")
86+
if isinstance(direct, str) and len(direct.strip()) == 2:
87+
return direct.strip().upper()
88+
country = organization.get("country")
89+
if isinstance(country, dict):
90+
nested = country.get("country_code") or country.get("code")
91+
if isinstance(nested, str) and len(nested.strip()) == 2:
92+
return nested.strip().upper()
93+
addresses = organization.get("addresses")
94+
if isinstance(addresses, list) and addresses:
95+
first = addresses[0]
96+
if isinstance(first, dict):
97+
code = first.get("country_code") or first.get("code")
98+
if isinstance(code, str) and len(code.strip()) == 2:
99+
return code.strip().upper()
100+
return None
101+
102+
103+
def _collect_allowed_org_ids(
    *,
    target_organizations: list[dict[str, Any]],
    known_organizations: Any,
    organization_derivations: Any,
) -> set[str]:
    """Return the set of org ids the LLM is allowed to point Memberships at.

    The membership agent's job is to express employment/affiliation for a
    target person inside the graph that other agents have already built.
    A Membership target outside this set is, in practice, an LLM-invented
    cross-reference (typically from `query_orcid` returning a name-fuzzy
    hit at an unrelated company).

    Both the ``id`` and the ``@id`` of every organization are recorded.
    The Membership's target reference may be resolved through either key,
    so keeping only one of them would make an org that carries two
    different identifiers (e.g. an internal id plus a ROR URL) look
    unknown and get its Membership dropped.

    Args:
        target_organizations: Organizations explicitly targeted by the run.
        known_organizations: Organizations surfaced upstream; ignored
            unless it is a list.
        organization_derivations: Derived organizations; ignored unless
            it is a list.

    Returns:
        The stripped, non-empty string identifiers found. The set is empty
        when no organizations have been surfaced at all; callers are
        expected to treat an empty set as "no filtering" so deployments
        without an org-detection upstream are not strangled.
    """
    allowed: set[str] = set()

    def _register(org: Any) -> None:
        # Non-dict entries carry no identifiers and are skipped.
        if not isinstance(org, dict):
            return
        for key in ("id", "@id"):
            identifier = org.get(key)
            if isinstance(identifier, str) and identifier.strip():
                allowed.add(identifier.strip())

    for org in target_organizations or ():
        _register(org)

    for collection in (known_organizations, organization_derivations):
        if not isinstance(collection, list):
            continue
        for org in collection:
            _register(org)

    return allowed
135+
136+
137+
def _infer_target_country_code(
138+
*,
139+
target_organizations: list[dict[str, Any]],
140+
repository_context: Any,
141+
pipeline_outputs: Any,
142+
) -> str | None:
143+
"""Pick the country code the LLM should default-bias toward.
144+
145+
Priority: explicit `target_organizations[*].country_code` >
146+
repository's owning-org country (from `repository_context.owning_org`
147+
or `pipeline_outputs.organization.country_code`). Returns ``None``
148+
when nothing is grounded — the LLM then has no bias and falls back
149+
to the prompt's general counter-rules without a CH default.
150+
"""
151+
for org in target_organizations:
152+
code = _country_code_from_org(org)
153+
if code:
154+
return code
155+
if isinstance(repository_context, dict):
156+
owning = repository_context.get("owning_org") or repository_context.get("owner")
157+
code = _country_code_from_org(owning)
158+
if code:
159+
return code
160+
if isinstance(pipeline_outputs, dict):
161+
org_payload = pipeline_outputs.get("organization") or pipeline_outputs.get(
162+
"owning_organization",
163+
)
164+
if isinstance(org_payload, dict):
165+
code = _country_code_from_org(org_payload)
166+
if code:
167+
return code
168+
return None
169+
170+
75171
class LLMMembershipAgentV2:
76172
"""LLM-backed agent that produces an org:Membership entity."""
77173

@@ -136,6 +232,21 @@ async def run(
136232
if isinstance(pipeline_outputs, dict) and pipeline_outputs:
137233
llm_input["pipeline_outputs"] = pipeline_outputs
138234

235+
# Country prior derived from the repo's owning org. Without
236+
# this, the LLM happily stamps Memberships to RaySearch (SE),
237+
# 10X Genomics (SE), Volvo Cars (SE), Spotify (SE), Statistics
238+
# Botswana, etc. for GitHub contributors of EPFL/SDSC repos
239+
# whose usernames merely *resemble* unrelated company slugs.
240+
# The prompt's "Counter-rules" section instructs the model to
241+
# default-reject candidate orgs whose ROR country differs.
242+
target_country_code = _infer_target_country_code(
243+
target_organizations=normalized_target_organizations,
244+
repository_context=repository_context,
245+
pipeline_outputs=pipeline_outputs,
246+
)
247+
if target_country_code:
248+
llm_input["target_country_code"] = target_country_code
249+
139250
context_json = json.dumps(llm_input, ensure_ascii=True, sort_keys=True, default=str)
140251
user_prompt = _USER_PROMPT_TEMPLATE.replace("{context_json}", context_json)
141252
user_prompt = append_runtime_prompt_context(user_prompt, context)
@@ -198,6 +309,56 @@ def _safe_date(value: object) -> str | None:
198309
if beg and end and beg > end:
199310
payload["time:hasBeginning"], payload["time:hasEnd"] = end, beg
200311

312+
# Spurious-membership filter — code-level enforcement of the
313+
# emit-or-skip prompt rules. The LLM was observed ignoring the
314+
# counter-rules in `system_prompt.md`: it still stamps
315+
# Memberships to ROR orgs that only matched via a `query_orcid`
316+
# name fuzzy hit (e.g. `jamalsenouci` (GitHub contributor of an
317+
# EPFL repo) → Spotify (ror.org/00hbd6420) just because some
318+
# *other* "Jamal Senouci" works at Spotify). Reject any
319+
# Membership whose target org isn't reachable from the
320+
# context — the cost of a false negative (legitimate ORCID
321+
# employment dropped because the org wasn't surfaced upstream)
322+
# is far smaller than the false-positive misattribution noise.
323+
target_org_ref = payload.get("org:organization")
324+
if isinstance(target_org_ref, dict):
325+
target_org_id = target_org_ref.get("@id") or target_org_ref.get("id")
326+
else:
327+
target_org_id = target_org_ref
328+
if isinstance(target_org_id, str) and target_org_id.strip():
329+
allowed_org_ids = _collect_allowed_org_ids(
330+
target_organizations=normalized_target_organizations,
331+
known_organizations=llm_input.get("known_organizations", []),
332+
organization_derivations=llm_input.get("organization_derivations", []),
333+
)
334+
if allowed_org_ids and target_org_id.strip() not in allowed_org_ids:
335+
warning = (
336+
f"llm_membership: dropping Membership {payload.get('id')!r} — "
337+
f"target org {target_org_id!r} not present in known_organizations "
338+
"(likely a spurious `query_orcid` name match; the LLM ignored "
339+
"the emit-or-skip rules in the system prompt)."
340+
)
341+
logger.info(warning)
342+
return AgentResult(
343+
data={},
344+
warnings=[warning],
345+
raw_output={},
346+
model=llm_result.model,
347+
provider=llm_result.provider,
348+
tokens_prompt=llm_result.tokens_prompt,
349+
tokens_completion=llm_result.tokens_completion,
350+
stats={
351+
"agent_runtime": "llm",
352+
"memberships": [],
353+
"membership_count": 0,
354+
"derivation": {
355+
"membership_seed": membership_seed,
356+
"skipped_reason": "org_not_in_context",
357+
"target_org_id": target_org_id,
358+
},
359+
},
360+
)
361+
201362
raw_output = deepcopy(payload)
202363
validation_warnings = _strict_validate(payload)
203364

0 commit comments

Comments
 (0)