Imaging-Plaza
diff --git a/‎.env.example‎
Lines changed: 18 additions & 0 deletions b/‎.env.example‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎src/v2/agents/llm/context_summary/agent.py‎
Lines changed: 32 additions & 1 deletion b/‎src/v2/agents/llm/context_summary/agent.py‎
Lines changed: 32 additions & 1 deletion
diff --git a/‎src/v2/agents/llm/contribution/agent.py‎
Lines changed: 47 additions & 1 deletion b/‎src/v2/agents/llm/contribution/agent.py‎
Lines changed: 47 additions & 1 deletion
diff --git a/‎src/v2/agents/llm/contribution/prompts/system_prompt.md‎
Lines changed: 8 additions & 4 deletions b/‎src/v2/agents/llm/contribution/prompts/system_prompt.md‎
Lines changed: 8 additions & 4 deletions
diff --git a/‎src/v2/agents/llm/membership/agent.py‎
Lines changed: 161 additions & 0 deletions b/‎src/v2/agents/llm/membership/agent.py‎
Lines changed: 161 additions & 0 deletions
@@ -287,6 +287,24 @@ OPENROUTER_API_KEY=your-openrouter-key-here
 # reasoning about the repo it is processing.
 # V2_GITHUB_RAG_ENABLED=true
 
+# Auto-ingest every newly seen PUBLIC repository into the GitHub RAG
+# index after a successful /v2/extract call. Off by default so existing
+# deployments don't change behaviour. When `true`, an async background
+# task fires after each completed repository extraction:
+#
+#   1. Skip if the repo is already in the GitHub DuckDB (no refresh).
+#   2. Fetch its metadata + README via the GitHub REST API.
+#   3. Persist the row to DuckDB and embed the README chunks into the
+#      `github_repos` Qdrant collection.
+#   4. Private/unreachable repos surface as `skipped_404` and emit one
+#      log line — no caller-visible error.
+#
+# This grows the GitHub RAG index organically as repos are processed,
+# so subsequent extractions find them via `search_github_rag` without
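+# manual CLI ingest runs. A module-level lock serialises DuckDB writes
+# across uvicorn workers, so it's safe with `WORKERS=4` or higher.
+# NOTE(review): uvicorn workers are separate processes and a plain
+# module-level lock is per-process — confirm the lock is cross-process
+# (e.g. file-based) before relying on this with multiple workers.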
+# V2_GITHUB_RAG_AUTO_INGEST=false
+
 
 EPFL_GRAPH_USERNAME=
 EPFL_GRAPH_PASSWORD=
 
@@ -15,13 +15,32 @@
 from src.v2.agents.llm.agent_tools.repository_corpus_grep import (
     make_repository_corpus_grep_tool,
 )
+from pydantic_ai.usage import UsageLimits
+
 from src.v2.agents.models import AgentResult, ProviderSet
 from src.v2.agents.llm.runtime import LLMRuntimeError, V2LLMRuntime
 from src.v2.ingest.cache import ProviderCache
 from src.v2.observation.query_log import stamp_current_agent
 
 logger = logging.getLogger(__name__)
 
+# Scout-mode usage budget. The scout prompt explicitly tells the LLM to
+# spend ~20 tool calls up-front so per-entity agents don't have to; the
+# global `V2_LLM_REQUEST_LIMIT` default of 25 (set in
+# `src/v2/agents/llm/runtime.py`) caps it before it finishes and the
+# brief comes back empty, leaving downstream agents to re-discover every
+# ORCID / ROR / DOI themselves. Override locally so scout gets the
+# budget it was designed for. The per-call request count is tracked by
+# pydantic-ai; the tool-call budget gates the actual external lookups.
+#
+# TODO: lift these into a general agent-tuning config file (alongside
+# `V2_LLM_REQUEST_LIMIT` / `V2_LLM_TOOL_CALLS_LIMIT`) so every agent can
+# declare its own budget without touching code. Today the two knobs are
+# global env vars + this single per-agent override; a YAML/TOML config
+# keyed by agent name would scale better as more agents need tuning.
+_SCOUT_REQUEST_LIMIT = 60
+_SCOUT_TOOL_CALLS_LIMIT = 120
+
 _PROMPTS_PACKAGE = "src.v2.agents.llm.context_summary.prompts"
 _SYSTEM_PROMPT = load_prompt(_PROMPTS_PACKAGE, "system_prompt.md")
 _USER_PROMPT_TEMPLATE = load_prompt(_PROMPTS_PACKAGE, "user_prompt.md")
@@ -416,16 +435,24 @@ async def run(
         # is unchanged (2 tools, original prompt). Scout mode adds the
         # RAG providers' search tools so people / orgs / articles get
         # recon'd up-front.
+        usage_limits: UsageLimits | None = None
         if scout_mode:
             system_prompt = load_prompt(_PROMPTS_PACKAGE, _SCOUT_PROMPT_FILENAME)
             tools = _build_scout_tools(
                 providers=providers,
                 cache=self._cache,
                 corpus_documents=corpus_documents,
             )
+            usage_limits = UsageLimits(
+                request_limit=_SCOUT_REQUEST_LIMIT,
+                tool_calls_limit=_SCOUT_TOOL_CALLS_LIMIT,
+            )
             logger.info(
-                "context_summary_agent: scout mode ON (%d tools)",
+                "context_summary_agent: scout mode ON (%d tools, "
+                "request_limit=%d, tool_calls_limit=%d)",
                 len(tools),
+                _SCOUT_REQUEST_LIMIT,
+                _SCOUT_TOOL_CALLS_LIMIT,
             )
         else:
             system_prompt = _SYSTEM_PROMPT
@@ -435,11 +462,15 @@ async def run(
             ]
 
         try:
+            run_kwargs: dict[str, Any] = {}
+            if usage_limits is not None:
+                run_kwargs["usage_limits"] = usage_limits
             llm_result = await self._llm_runtime.run_json_prompt(
                 system_prompt=system_prompt,
                 user_prompt=user_prompt,
                 output_type=LLMContextSummaryOutput,
                 tools=tools,
+                **run_kwargs,
             )
         except LLMRuntimeError as exc:
             warning = f"context_summary_agent: LLM call failed; proceeding without compiled summary ({exc})"
 
@@ -271,7 +271,15 @@ async def run(
 
         payload["schema:author"] = target_person_id
         payload["pulse:contributionTo"] = target_repository_id
-        composite_id = f"{target_person_id}_{target_repository_id}"
+        # Double underscore separator — single `_` is ambiguous because
+        # GitHub usernames are allowed to contain `_` (and URLs in the
+        # repo id contain `/`), so `<person_url>_<repo_url>` cannot be
+        # parsed back unambiguously. `__` is not expected inside the
+        # person id (a github.com profile URL or a ROR identifier), so
+        # splitting the composite on its FIRST `__` recovers both parts.
+        # NOTE(review): GitHub repository names may themselves contain
+        # `__` — confirm consumers split on the first occurrence only.
+        # Migration: any previously persisted edges with single-`_`
+        # separator are still readable by consumers — only newly emitted
+        # edges adopt the new shape.
+        composite_id = f"{target_person_id}__{target_repository_id}"
         payload["id"] = composite_id
         payload["idSource"] = "pulse:composite"
         identifiers = payload.get("identifiers")
@@ -302,6 +310,44 @@ async def run(
 
         force_server_uuid(payload, uuid_value)
 
+        # Drop empty edges. Production audit observed 28 contributions
+        # landing with `pulse:contributionCount=0` and both date fields
+        # `null` — vacuous edges that bloat the graph without carrying
+        # any signal. They arise when the LLM emits a Contribution from
+        # a weak signal (a comment, a watch, a review) but the GitHub
+        # contributor record never produced a commit count or date.
+        contribution_count = payload.get("pulse:contributionCount")
+        first_contribution_date = payload.get("pulse:firstContributionDate")
+        last_contribution_date = payload.get("pulse:lastContributionDate")
+        if (
+            (contribution_count is None or contribution_count == 0)
+            and not first_contribution_date
+            and not last_contribution_date
+        ):
+            warning = (
+                f"llm_contribution: dropping Contribution {payload.get('id')!r} — "
+                "no count, no firstContributionDate, no lastContributionDate"
+            )
+            logger.info(warning)
+            return AgentResult(
+                data={},
+                warnings=[warning],
+                raw_output={},
+                model=llm_result.model,
+                provider=llm_result.provider,
+                tokens_prompt=llm_result.tokens_prompt,
+                tokens_completion=llm_result.tokens_completion,
+                stats={
+                    "agent_runtime": "llm",
+                    "contributions": [],
+                    "contribution_count": 0,
+                    "derivation": {
+                        "contribution_seed": contribution_seed,
+                        "skipped_reason": "empty_edge",
+                    },
+                },
+            )
+
         raw_output = deepcopy(payload)
         validation_warnings = _strict_validate(payload)
 
 
@@ -5,7 +5,7 @@ Return exactly one JSON object for a `pulse:Contribution` entity conforming to `
 Output only JSON. No markdown fences. No explanations.
 
 Required fields:
-- `id` (composite `personId_repoId` or UUID fallback)
+- `id` (composite `personId__repoId` — DOUBLE underscore — or UUID fallback)
 - `type` = `"pulse:Contribution"`
 - `shacl` = `"pulse:ContributionShape"`
 - `identifiers` with `pulse:composite` and `uuid`
@@ -24,9 +24,13 @@ Rules:
   `schema:author` to `target_person.id` and `pulse:contributionTo` to
   `target_repository.id`.
 - Use canonical person and repository IDs from known entities.
-- Build the composite id deterministically: `{target_person.id}_{target_repository.id}`,
-  set `idSource = "pulse:composite"`, and put the same composite into
-  `identifiers["pulse:composite"]`.
+- Build the composite id deterministically with a **double-underscore**
+  separator: `{target_person.id}__{target_repository.id}`. Single `_`
+  is ambiguous because GitHub usernames may contain `_` and the
+  composite cannot be parsed back to its components. Example:
+  `https://github.com/alice-smith__https://github.com/lis-epfl/vswarm`.
+  Set `idSource = "pulse:composite"` and put the same composite
+  string into `identifiers["pulse:composite"]`.
 - `contribution_seed` is the repository id (kept for backwards compatibility) —
   prefer `target_repository.id` over it when both are present.
 - Do not invent unsupported fields.
@@ -72,6 +72,102 @@ def _list_of_dicts(value: Any, *, max_items: int = MAX_CONTEXT_ENTITIES) -> list
     return collected
 
 
+def _country_code_from_org(organization: Any) -> str | None:
+    """Best-effort 2-letter country code from an org payload.
+
+    Looks at common shapes produced by upstream stages: a bare
+    ``country_code``, a nested ``country.country_code`` (ROR's own
+    serialisation), or an ``addresses[0].country_code`` (also ROR).
+    Case-normalised; returns ``None`` when no plausible code is found.
+    """
+    if not isinstance(organization, dict):
+        return None
+    direct = organization.get("country_code")
+    if isinstance(direct, str) and len(direct.strip()) == 2:
+        return direct.strip().upper()
+    country = organization.get("country")
+    if isinstance(country, dict):
+        nested = country.get("country_code") or country.get("code")
+        if isinstance(nested, str) and len(nested.strip()) == 2:
+            return nested.strip().upper()
+    addresses = organization.get("addresses")
+    if isinstance(addresses, list) and addresses:
+        first = addresses[0]
+        if isinstance(first, dict):
+            code = first.get("country_code") or first.get("code")
+            if isinstance(code, str) and len(code.strip()) == 2:
+                return code.strip().upper()
+    return None
+
+
+def _collect_allowed_org_ids(
+    *,
+    target_organizations: list[dict[str, Any]],
+    known_organizations: Any,
+    organization_derivations: Any,
+) -> set[str]:
+    """Return the set of org `@id`s the LLM is allowed to point Memberships at.
+
+    The membership agent's job is to express employment/affiliation for a
+    target person inside the graph that other agents have already built.
+    A Membership target outside this set is, in practice, an LLM-invented
+    cross-reference (typically from `query_orcid` returning a name-fuzzy
+    hit at an unrelated company). Returns an empty set when no
+    organizations have been surfaced at all — then the filter degrades
+    open (lets the LLM's choice through) so we don't strangle deployments
+    that genuinely run without an org-detection upstream.
+    """
+    allowed: set[str] = set()
+    for org in target_organizations:
+        if isinstance(org, dict):
+            oid = org.get("id") or org.get("@id")
+            if isinstance(oid, str) and oid.strip():
+                allowed.add(oid.strip())
+    for collection in (known_organizations, organization_derivations):
+        if not isinstance(collection, list):
+            continue
+        for org in collection:
+            if isinstance(org, dict):
+                oid = org.get("id") or org.get("@id")
+                if isinstance(oid, str) and oid.strip():
+                    allowed.add(oid.strip())
+    return allowed
+
+
+def _infer_target_country_code(
+    *,
+    target_organizations: list[dict[str, Any]],
+    repository_context: Any,
+    pipeline_outputs: Any,
+) -> str | None:
+    """Pick the country code the LLM should default-bias toward.
+
+    Priority: explicit `target_organizations[*].country_code` >
+    repository's owning-org country (from `repository_context.owning_org`
+    or `pipeline_outputs.organization.country_code`). Returns ``None``
+    when nothing is grounded — the LLM then has no bias and falls back
+    to the prompt's general counter-rules without a CH default.
+    """
+    for org in target_organizations:
+        code = _country_code_from_org(org)
+        if code:
+            return code
+    if isinstance(repository_context, dict):
+        owning = repository_context.get("owning_org") or repository_context.get("owner")
+        code = _country_code_from_org(owning)
+        if code:
+            return code
+    if isinstance(pipeline_outputs, dict):
+        org_payload = pipeline_outputs.get("organization") or pipeline_outputs.get(
+            "owning_organization",
+        )
+        if isinstance(org_payload, dict):
+            code = _country_code_from_org(org_payload)
+            if code:
+                return code
+    return None
+
+
 class LLMMembershipAgentV2:
     """LLM-backed agent that produces an org:Membership entity."""
 
@@ -136,6 +232,21 @@ async def run(
         if isinstance(pipeline_outputs, dict) and pipeline_outputs:
             llm_input["pipeline_outputs"] = pipeline_outputs
 
+        # Country prior derived from the repo's owning org. Without
+        # this, the LLM happily stamps Memberships to RaySearch (SE),
+        # 10X Genomics (SE), Volvo Cars (SE), Spotify (SE), Statistics
+        # Botswana, etc. for GitHub contributors of EPFL/SDSC repos
+        # whose usernames merely *resemble* unrelated company slugs.
+        # The prompt's "Counter-rules" section instructs the model to
+        # default-reject candidate orgs whose ROR country differs.
+        target_country_code = _infer_target_country_code(
+            target_organizations=normalized_target_organizations,
+            repository_context=repository_context,
+            pipeline_outputs=pipeline_outputs,
+        )
+        if target_country_code:
+            llm_input["target_country_code"] = target_country_code
+
         context_json = json.dumps(llm_input, ensure_ascii=True, sort_keys=True, default=str)
         user_prompt = _USER_PROMPT_TEMPLATE.replace("{context_json}", context_json)
         user_prompt = append_runtime_prompt_context(user_prompt, context)
@@ -198,6 +309,56 @@ def _safe_date(value: object) -> str | None:
         if beg and end and beg > end:
             payload["time:hasBeginning"], payload["time:hasEnd"] = end, beg
 
+        # Spurious-membership filter — code-level enforcement of the
+        # emit-or-skip prompt rules. The LLM was observed ignoring the
+        # counter-rules in `system_prompt.md`: it still stamps
+        # Memberships to ROR orgs that only matched via a `query_orcid`
+        # name fuzzy hit (e.g. `jamalsenouci` (GitHub contributor of an
+        # EPFL repo) → Spotify (ror.org/00hbd6420) just because some
+        # *other* "Jamal Senouci" works at Spotify). Reject any
+        # Membership whose target org isn't reachable from the
+        # context — the cost of a false negative (legitimate ORCID
+        # employment dropped because the org wasn't surfaced upstream)
+        # is far smaller than the false-positive misattribution noise.
+        target_org_ref = payload.get("org:organization")
+        if isinstance(target_org_ref, dict):
+            target_org_id = target_org_ref.get("@id") or target_org_ref.get("id")
+        else:
+            target_org_id = target_org_ref
+        if isinstance(target_org_id, str) and target_org_id.strip():
+            allowed_org_ids = _collect_allowed_org_ids(
+                target_organizations=normalized_target_organizations,
+                known_organizations=llm_input.get("known_organizations", []),
+                organization_derivations=llm_input.get("organization_derivations", []),
+            )
+            if allowed_org_ids and target_org_id.strip() not in allowed_org_ids:
+                warning = (
+                    f"llm_membership: dropping Membership {payload.get('id')!r} — "
+                    f"target org {target_org_id!r} not present in known_organizations "
+                    "(likely a spurious `query_orcid` name match; the LLM ignored "
+                    "the emit-or-skip rules in the system prompt)."
+                )
+                logger.info(warning)
+                return AgentResult(
+                    data={},
+                    warnings=[warning],
+                    raw_output={},
+                    model=llm_result.model,
+                    provider=llm_result.provider,
+                    tokens_prompt=llm_result.tokens_prompt,
+                    tokens_completion=llm_result.tokens_completion,
+                    stats={
+                        "agent_runtime": "llm",
+                        "memberships": [],
+                        "membership_count": 0,
+                        "derivation": {
+                            "membership_seed": membership_seed,
+                            "skipped_reason": "org_not_in_context",
+                            "target_org_id": target_org_id,
+                        },
+                    },
+                )
+
         raw_output = deepcopy(payload)
         validation_warnings = _strict_validate(payload)