Merge pull request #39 from Imaging-Plaza/feat/v2-pipeline-perf

caviri · web-flow · commit da24a0e8b85d · 2026-05-18T21:12:27.000+02:00
perf(v2 pipeline): parallel agents + LLM iteration cap + tighter arti…
diff --git a/src/v2/agents/llm/article/prompts/system_prompt.md b/src/v2/agents/llm/article/prompts/system_prompt.md
@@ -24,6 +24,15 @@ Rules:
 - Do not invent unknown people or organizations when canonical IDs are available in context.
 - Do not emit fields outside the schema.
 - Use `null` only where nullable fields are permitted.
+- **Stop early.** Once you have a concrete DOI (or a concrete
+  `pulse:infoscienceArticleIdentifier`) plus a title, **emit the JSON
+  immediately**. Do not cross-validate with more than two sources, do not
+  re-search the same query, and do not chase tangential leads (READMEs of
+  related repos, contributor pages, alternative venues). Each extra tool
+  call costs ~5s, and a DOI does not change once you have found it.
+- If two tool calls in a row return the same paper but neither yields a
+  DOI or a `pulse:infoscienceArticleIdentifier`, accept that the article
+  is not findable and emit `{}`. Do not loop.
 - **No identifier, no article.** If the input does not let you ground the
   article in a real `schema:identifier` (a real DOI such as
   `10.1038/s41586-024-...`) **or** a real `pulse:infoscienceArticleIdentifier`,
diff --git a/src/v2/agents/llm/runtime.py b/src/v2/agents/llm/runtime.py
@@ -9,6 +9,7 @@
 from typing import Any
 
 from pydantic_ai import Agent
+from pydantic_ai.usage import UsageLimits
 
 from src.v1.llm.model_config import (
     create_pydantic_ai_model,
@@ -94,6 +95,45 @@ def _extract_usage_counts(usage: Any) -> tuple[int | None, int | None]:
     return normalized_requests, normalized_tool_calls
 
 
+_DEFAULT_REQUEST_LIMIT_ENV = "V2_LLM_REQUEST_LIMIT"
+_DEFAULT_TOOL_CALLS_LIMIT_ENV = "V2_LLM_TOOL_CALLS_LIMIT"
+_DEFAULT_REQUEST_LIMIT = 25
+_DEFAULT_TOOL_CALLS_LIMIT = 50
+
+
+def _coerce_positive_int_env(name: str, fallback: int) -> int:
+    raw = os.getenv(name)
+    if raw is None:
+        return fallback
+    try:
+        value = int(raw.strip())
+    except ValueError:
+        return fallback
+    return max(1, value)
+
+
+def _default_usage_limits() -> UsageLimits:
+    """Per-agent caps on LLM roundtrips and tool calls.
+
+    Without an explicit cap, the article agent was observed looping through
+    more than 100 tool calls (12 min wall time) on a paper-heavy repo,
+    exploring every tool every time. The cap turns runaway loops into clean
+    ``UsageLimitExceeded`` errors that the per-stage runner surfaces as a
+    single warning instead of consuming the whole job budget.
+
+    Tunable per deployment via ``V2_LLM_REQUEST_LIMIT`` /
+    ``V2_LLM_TOOL_CALLS_LIMIT``.
+    """
+    return UsageLimits(
+        request_limit=_coerce_positive_int_env(
+            _DEFAULT_REQUEST_LIMIT_ENV, _DEFAULT_REQUEST_LIMIT,
+        ),
+        tool_calls_limit=_coerce_positive_int_env(
+            _DEFAULT_TOOL_CALLS_LIMIT_ENV, _DEFAULT_TOOL_CALLS_LIMIT,
+        ),
+    )
+
+
 def _coerce_output_payload(output: Any) -> dict[str, Any]:
     """Convert model output into a JSON-object dictionary.
 
@@ -160,6 +200,7 @@ async def run_json_prompt(
         user_prompt: str,
         output_type: Any = None,
         tools: list[Any] | None = None,
+        usage_limits: UsageLimits | None = None,
     ) -> LLMRuntimeResult:
         """Execute a prompt and return a structured JSON payload plus metadata.
 
@@ -170,6 +211,12 @@ async def run_json_prompt(
                 schema. Defaults to ``dict[str, Any]`` when not provided.
                 Passing a Pydantic model class constrains the LLM to produce
                 output that conforms to its schema (first validation pass).
+            usage_limits: Optional pydantic-ai ``UsageLimits``. Defaults to
+                ``_default_usage_limits()``: 25 model requests + 50 tool
+                calls per agent invocation, unless overridden via
+                ``V2_LLM_REQUEST_LIMIT`` / ``V2_LLM_TOOL_CALLS_LIMIT``.
+                In profiling, the uncapped article agent looped 100+ tool
+                calls (12 min on one repo); the default bounds wall time.
         """
 
         resolved_output_type: Any = output_type if output_type is not None else dict[str, Any]
@@ -199,11 +246,18 @@ async def run_json_prompt(
             if model_parameters and "model_settings" in run_parameters:
                 run_kwargs["model_settings"] = model_parameters
 
+            effective_limits = usage_limits or _default_usage_limits()
+            if "usage_limits" in run_parameters:
+                run_kwargs["usage_limits"] = effective_limits
+
             logger.info(
-                "LLM runtime start (provider=%s, model=%s, tools=%d)",
+                "LLM runtime start (provider=%s, model=%s, tools=%d, "
+                "request_limit=%s, tool_calls_limit=%s)",
                 provider_name,
                 model_name,
                 len(tools or []),
+                effective_limits.request_limit,
+                effective_limits.tool_calls_limit,
             )
             result = await agent.run(user_prompt, **run_kwargs)
         except LLMRuntimeError:
diff --git a/src/v2/api.py b/src/v2/api.py
@@ -203,20 +203,25 @@ def _extract_path_kind(source_url: str) -> str | None:
 
 
 def _resolve_max_concurrent_agents() -> int:
-    """Read `V2_MAX_CONCURRENT_AGENTS` env var (default 6).
+    """Read `V2_MAX_CONCURRENT_AGENTS` env var (default 8).
 
     Caps how many work items per stage (person agents, contribution agents,
     link-veracity calls, etc.) run in parallel within a single /extract
     request. Higher values speed up wide-fanout repos at the cost of more
     concurrent LLM calls — keep within the LLM provider's rate limit.
+
+    Default raised from 6 to 8 after profiling a 50-person repo
+    (deeplabcut/deeplabcut): person+membership stages were spending ~12
+    minutes waiting on the semaphore. The RCP/LLM stack absorbed 8
+    in-flight calls without thermal throttling in that test.
     """
     raw = os.getenv("V2_MAX_CONCURRENT_AGENTS")
     if raw is None:
-        return 6
+        return 8
     try:
         value = int(raw.strip())
     except ValueError:
-        return 6
+        return 8
     return max(1, value)
 
 
diff --git a/src/v2/pipeline/orchestrator.py b/src/v2/pipeline/orchestrator.py
@@ -203,7 +203,7 @@ def __init__(  # noqa: PLR0913
         retry_max_retries: int = 3,
         retry_backoff_base: float = 0.0,
         retry_sleep_func: SleepCallable | None = None,
-        max_concurrent_agents: int = 3,
+        max_concurrent_agents: int = 8,
         include_upstream_stage_outputs_in_prompt: bool = True,
         user_prompt_appendix: str | None = None,
         cache: ProviderCache | None = None,