Skip to content

Commit ce4eb7b

Browse files
committed
feat(github integration): Enhance GitHub provider with retry logic for transient errors and add GitHub rate limit probing. Update .env.example to include configuration for maximum retry attempts. Modify API health checks to report GitHub rate limit status. Introduce validation for organization handles against GitHub API to ensure accurate entity representation.
1 parent e53c63b commit ce4eb7b

25 files changed

Lines changed: 1045 additions & 39 deletions

.env.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@
2222
# `src/index/github/` indexer (REST metadata + README fetches).
2323
GITHUB_TOKEN=your-github-token-here
2424

25+
# Bound for retrying transient GitHub provider failures (gimie returning
26+
# empty/malformed JSON during 502s, rate-limit blips, etc.). Default 3.
27+
# Set to 0 to disable. Non-transient errors (NOT_FOUND, UnicodeDecodeError)
28+
# never retry — UnicodeDecodeError falls back to a minimal payload instead.
29+
# V2_GITHUB_REPO_MAX_RETRIES=3
30+
2531
# ---------------------------------------------------------------------------
2632
# LLM provider credentials (only the one(s) referenced by your model config
2733
# need to be set — `src/v2/agents/llm/runtime.py` validates this at startup

src/api.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,25 @@ def _resolve_package_version(name: str) -> str:
7575
logger = logging.getLogger(__name__)
7676

7777

78-
async def startup_event():
79-
"""Initialize resources on application startup"""
78+
async def startup_event(app: FastAPI | None = None):
79+
"""Initialize resources on application startup."""
8080
logger.info("🚀 Application startup - initializing resources")
8181

82+
# Pre-warm the v2 provider cache so the SQLite file exists in WAL mode
83+
# before any request hits a worker. Without this, multi-worker uvicorn
84+
# against a cold .cache/ races on first-request to create+init the
85+
# file and the losing workers raise `database is locked`, surfacing
86+
# as 500s for the first 1-2 jobs of a batch run.
87+
if app is not None:
88+
try:
89+
from src.v2.dependencies import _resolve_provider_cache
90+
91+
cache = _resolve_provider_cache(app.state)
92+
if cache is not None:
93+
logger.info("✅ v2 provider cache initialized")
94+
except Exception as exc: # noqa: BLE001 — startup must not crash on cache issues
95+
logger.warning(f"v2 provider cache pre-warm skipped: {exc}")
96+
8297

8398
async def shutdown_event():
8499
"""Cleanup resources on application shutdown"""
@@ -119,8 +134,8 @@ async def shutdown_event():
119134

120135

121136
@asynccontextmanager
122-
async def lifespan(_: FastAPI):
123-
await startup_event()
137+
async def lifespan(app: FastAPI):
138+
await startup_event(app)
124139
try:
125140
yield
126141
finally:

src/v1/data_models/infoscience.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,17 @@ class InfosciencePublication(BaseModel):
2424
description="List of author names",
2525
default_factory=list,
2626
)
27+
author_authorities: List[Optional[str]] = Field(
28+
description=(
29+
"Parallel array to `authors` (same length, same order) holding "
30+
"the DSpace `authority` UUID for each author when available "
31+
"(EPFL-affiliated, authority-controlled). `None` for non-EPFL "
32+
"authors that DSpace stores as bare names. Used by v2 to bind "
33+
"publication authors directly to Infoscience person entities "
34+
"without name-based fuzzy matching."
35+
),
36+
default_factory=list,
37+
)
2738
abstract: Optional[str] = Field(
2839
description="Publication abstract or description",
2940
default=None,

src/v2/agents/llm/refiners/membership/agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def __init__(
5151
self,
5252
*,
5353
llm_runtime: V2LLMRuntime | None = None,
54-
llm_call_timeout_seconds: float = 30.0,
54+
llm_call_timeout_seconds: float = 120.0,
5555
) -> None:
5656
if llm_call_timeout_seconds <= 0:
5757
message = "llm_call_timeout_seconds must be > 0"

src/v2/agents/llm/refiners/organization/agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def __init__(
5858
self,
5959
*,
6060
llm_runtime: V2LLMRuntime | None = None,
61-
llm_call_timeout_seconds: float = 60.0,
61+
llm_call_timeout_seconds: float = 120.0,
6262
) -> None:
6363
if llm_call_timeout_seconds <= 0:
6464
message = "llm_call_timeout_seconds must be > 0"

src/v2/agents/llm/refiners/person/agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def __init__(
4747
self,
4848
*,
4949
llm_runtime: V2LLMRuntime | None = None,
50-
llm_call_timeout_seconds: float = 60.0,
50+
llm_call_timeout_seconds: float = 120.0,
5151
) -> None:
5252
if llm_call_timeout_seconds <= 0:
5353
message = "llm_call_timeout_seconds must be > 0"

src/v2/agents/llm/refiners/repository/agent.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def __init__(
6060
self,
6161
*,
6262
llm_runtime: V2LLMRuntime | None = None,
63-
llm_call_timeout_seconds: float = 60.0,
63+
llm_call_timeout_seconds: float = 120.0,
6464
) -> None:
6565
if llm_call_timeout_seconds <= 0:
6666
message = "llm_call_timeout_seconds must be > 0"

src/v2/agents/rule_based/article_agent.py

Lines changed: 38 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,15 @@ def _build_person_lookup(
274274
persons: list[dict[str, Any]],
275275
*,
276276
person_derivations: Any = None,
277-
) -> dict[str, str]:
277+
) -> tuple[dict[str, str], dict[str, str]]:
278+
"""Build the alias→person_id lookup plus an infoscience-authority→person_id map.
279+
280+
The authority lookup is keyed on raw DSpace authority UUIDs (``pulse:infosciencePersonIdentifier``)
281+
so article authors can be mapped directly without name fuzzy-matching when DSpace returned
282+
an authority on the publication.
283+
"""
278284
lookup: dict[str, str] = {}
285+
authority_lookup: dict[str, str] = {}
279286
derivation_by_id: dict[str, dict[str, Any]] = {}
280287
if isinstance(person_derivations, list):
281288
for derivation in person_derivations:
@@ -296,7 +303,17 @@ def _build_person_lookup(
296303
derivation_by_id=derivation_by_id,
297304
):
298305
_register_lookup_token(lookup, alias_token, person_id)
299-
return lookup
306+
307+
infoscience_id = _as_string(person.get("pulse:infosciencePersonIdentifier"))
308+
if infoscience_id is None:
309+
identifiers = person.get("identifiers")
310+
if isinstance(identifiers, dict):
311+
infoscience_id = _as_string(
312+
identifiers.get("pulse:infosciencePersonIdentifier"),
313+
)
314+
if infoscience_id:
315+
authority_lookup.setdefault(infoscience_id, person_id)
316+
return lookup, authority_lookup
300317

301318

302319
def _build_organization_lookup(organizations: list[dict[str, Any]]) -> dict[str, str]: # noqa: C901
@@ -489,6 +506,7 @@ def _map_author_ids(
489506
publication: dict[str, Any],
490507
*,
491508
person_lookup: dict[str, str],
509+
authority_lookup: dict[str, str],
492510
publication_reference: str,
493511
) -> tuple[list[str], list[str], list[str], int, int]:
494512
warnings: list[str] = []
@@ -497,8 +515,22 @@ def _map_author_ids(
497515
matched_authors = 0
498516
unresolved_count = 0
499517
author_names = _as_string_list(publication.get("authors"))
500-
for author_name in author_names:
501-
resolved_author = _resolve_lookup_token(person_lookup, author_name)
518+
raw_authorities = publication.get("author_authorities")
519+
authority_values: list[str | None]
520+
if isinstance(raw_authorities, list) and len(raw_authorities) == len(author_names):
521+
authority_values = [
522+
item if isinstance(item, str) and item.strip() else None
523+
for item in raw_authorities
524+
]
525+
else:
526+
authority_values = [None] * len(author_names)
527+
528+
for author_name, authority in zip(author_names, authority_values, strict=False):
529+
resolved_author: str | None = None
530+
if authority is not None:
531+
resolved_author = authority_lookup.get(authority)
532+
if resolved_author is None:
533+
resolved_author = _resolve_lookup_token(person_lookup, author_name)
502534
if isinstance(resolved_author, str):
503535
mapped_authors.append(resolved_author)
504536
matched_authors += 1
@@ -749,7 +781,7 @@ async def run( # noqa: C901, PLR0912, PLR0915
749781
stats={"queries": queries, "articles": []},
750782
)
751783

752-
person_lookup = _build_person_lookup(
784+
person_lookup, authority_lookup = _build_person_lookup(
753785
_collect_known_persons(context),
754786
person_derivations=context.get("person_derivations"),
755787
)
@@ -773,6 +805,7 @@ async def run( # noqa: C901, PLR0912, PLR0915
773805
) = _map_author_ids(
774806
candidate.publication,
775807
person_lookup=person_lookup,
808+
authority_lookup=authority_lookup,
776809
publication_reference=publication_reference,
777810
)
778811
for unresolved_author in unresolved_authors:

src/v2/agents/rule_based/contribution_agent.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,26 @@ def _build_contribution_payload( # noqa: PLR0913
237237
}
238238

239239

240+
def _person_github_login(person: Any) -> str | None:
241+
"""Best-effort GitHub login for a person entity.
242+
243+
Mirrors the LLM contribution agent's helper: prefers the explicit
244+
`pulse:githubUsername`, falls back to parsing `id` when shaped like
245+
`https://github.com/{login}`.
246+
"""
247+
if not isinstance(person, dict):
248+
return None
249+
handle = person.get("pulse:githubUsername")
250+
if isinstance(handle, str) and handle.strip():
251+
return handle.strip()
252+
identifier = person.get("id")
253+
if isinstance(identifier, str) and identifier.startswith("https://github.com/"):
254+
candidate = identifier.removeprefix("https://github.com/").strip("/")
255+
if candidate and "/" not in candidate:
256+
return candidate
257+
return None
258+
259+
240260
class ContributionAgentV2:
241261
"""Deterministic contribution agent from repository contributor metadata."""
242262

@@ -252,6 +272,17 @@ async def run( # noqa: C901
252272
organization_lookup = _build_organization_lookup(_collect_known_organizations(context))
253273
repositories = _collect_known_repositories(context)
254274

275+
# The orchestrator fans out one work item per (person, repo) pair and
276+
# passes both `target_person` and `target_repository`. Honour that
277+
# scoping: only emit a contribution for the target person, never warn
278+
# about other contributors that happen to share the repo's signal list
279+
# (they have their own fanout work item).
280+
target_person = context.get("target_person")
281+
target_login = _person_github_login(target_person)
282+
normalized_target_login = (
283+
target_login.casefold() if isinstance(target_login, str) else None
284+
)
285+
255286
contribution_data_by_composite: dict[str, dict[str, Any]] = {}
256287
for repository in repositories:
257288
repository_id = _as_string(
@@ -268,6 +299,12 @@ async def run( # noqa: C901
268299
if normalized_login is None:
269300
continue
270301

302+
if (
303+
normalized_target_login is not None
304+
and normalized_login != normalized_target_login
305+
):
306+
continue
307+
271308
person_id = person_lookup.get(normalized_login)
272309
if person_id is None:
273310
if normalized_login in organization_lookup:

src/v2/api.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@
3838
from src.v2.ingest.cache import ProviderCache
3939
from src.v2.ingest.detection import UnsupportedGitHubURL, classify_github_url
4040
from src.v2.jobs import JobStore
41+
from src.v2.observation.github_rate_limit import (
42+
GitHubRateLimitSummary,
43+
probe_github_rate_limit,
44+
)
4145
from src.v2.observation.query_log import QueryLog, query_log_var
4246
from src.v2.pipeline import PipelineOrchestrator
4347
from src.v2.pipeline.stages import (
@@ -65,6 +69,9 @@
6569
validate_author_classes,
6670
validate_ownership,
6771
)
72+
from src.v2.pipeline.stages.validate_org_github_handles import (
73+
validate_org_github_handles,
74+
)
6875
from src.v2.pipeline.stages.refine_with_llm import (
6976
is_enabled as _hybrid_refiner_is_enabled,
7077
)
@@ -706,6 +713,19 @@ async def extract( # noqa: C901, PLR0911, PLR0912, PLR0913, PLR0915
706713
for warning in repo_author_warnings:
707714
_append_unique_warning(warnings, warning)
708715

716+
# Validate `@handle`-style org names against GitHub before strict
717+
# validation so we either stamp the missing handle or drop the
718+
# hallucinated entity (rather than just losing it to anyOf).
719+
stage_started_at = perf_counter()
720+
reconciled, org_handle_warnings = validate_org_github_handles(reconciled, providers)
721+
logger.info(
722+
"validate_org_github_handles: actions=%d in %.2fs",
723+
len(org_handle_warnings),
724+
perf_counter() - stage_started_at,
725+
)
726+
for warning in org_handle_warnings:
727+
_append_unique_warning(warnings, warning)
728+
709729
stage_started_at = perf_counter()
710730
strict_validation_entities = _iter_reconciled_entities(
711731
reconciled_entities=reconciled.entities,
@@ -1347,9 +1367,18 @@ async def health() -> V2HealthResponse:
13471367
except ValueError:
13481368
component_statuses["config"] = "unhealthy"
13491369

1350-
component_statuses["github_token"] = (
1351-
"healthy" if config and config.GITHUB_TOKEN else "degraded"
1352-
)
1370+
rate_limit_summary: GitHubRateLimitSummary | None = None
1371+
if config and config.GITHUB_TOKEN:
1372+
try:
1373+
rate_limit_summary = probe_github_rate_limit()
1374+
except Exception: # noqa: BLE001 — probe must never crash health
1375+
logger.exception("github rate-limit probe failed")
1376+
rate_limit_summary = None
1377+
component_statuses["github_token"] = (
1378+
rate_limit_summary.status if rate_limit_summary is not None else "degraded"
1379+
)
1380+
else:
1381+
component_statuses["github_token"] = "degraded"
13531382

13541383
overall_status: Literal["healthy", "degraded", "unhealthy"]
13551384
if "unhealthy" in component_statuses.values():
@@ -1363,4 +1392,5 @@ async def health() -> V2HealthResponse:
13631392
status=overall_status,
13641393
components=component_statuses,
13651394
version=PACKAGE_VERSION,
1395+
github_rate_limit=rate_limit_summary,
13661396
)

0 commit comments

Comments
 (0)