Skip to content

Commit 3c7465f

Browse files
committed
fix(v2): demote GitHub-derived props to org units + RCP_TOKEN round-robin
Two independent fixes bundled per the live debugging session. 1. **`demote_github_props_to_units` stage** addressing Imaging-Plaza/git-metadata-extractor issues #29 and #33: When an Organization had both a ROR identifier (the legal entity) and a GitHub presence linked via `org:hasUnit → github_url`, the rule-based agent stamped the GitHub-derived properties (`pulse:githubOrgFollowers`, `pulse:githubOrganizationHandle`) on the ROR parent rather than the unit. Real-world example from the audit: `ror.org/0070nx673` (Okino) was carrying followers=364 that actually belong to its unit `github.com/InteractiveComputerGraphics`. Add a post-pass stage that, only when the ROR parent's handle matches the github-only unit's handle (so the data really is the unit's), moves the follower count to the unit if the unit lacks it and clears the GitHub-derived properties on the parent. The match guard prevents stripping legitimate data when the parent is its own independent GitHub presence with separate units. Wired immediately after `infer_org_units` in the api.py orchestrator so the unit relationships are stamped before this pass runs. 2. **Multi-token `RCP_TOKEN` support** mirroring the existing `GITHUB_TOKEN` pattern. Set `RCP_TOKEN=sk-A,sk-B` and every model instantiation pulls the next key via `itertools.cycle` under a threading lock. Doubles the per-process rate-limit budget when the shared inference endpoint pushes back on bursty extracts. Independent cycles per env var name so distinct providers (`RCP_TOKEN`, `OPENAI_API_KEY`, ...) don't share state. Both changes covered by inline synthetic tests (issue #29 example demote roundtrip + round-robin sequencing + single-token passthrough).
1 parent 2c6e8b1 commit 3c7465f

5 files changed

Lines changed: 166 additions & 2 deletions

File tree

.env.example

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,11 @@ GITHUB_TOKEN=your-github-token-here
3434
# and surfaces a clear error if the configured profile's env var is missing).
3535
# ---------------------------------------------------------------------------
3636

37-
# EPFL RCP inference endpoint token.
37+
# EPFL RCP inference endpoint token. Comma-separated list of tokens is
38+
# supported (`sk-A,sk-B`) — model_config.py round-robins across them at
39+
# every model instantiation, which raises the effective per-process
40+
# rate-limit budget when more than one token is provided. Same pattern
41+
# as `GITHUB_TOKEN`.
3842
RCP_TOKEN=your-rcp-token-here
3943

4044
# OpenAI API key (only if a model profile is configured for OpenAI).

src/v1/llm/model_config.py

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,46 @@
55
Supports OpenAI, OpenRouter, OpenAI-compatible endpoints, and Ollama (local and remote).
66
"""
77

8+
import itertools
89
import json
910
import logging
1011
import os
12+
import threading
1113
from typing import Any, Dict, List
1214

1315
logger = logging.getLogger(__name__)
1416

17+
# One lock guards both caches below for every env var; only the token cycles
# themselves are kept separate per variable.
_API_KEY_LOCK = threading.Lock()
# env var name -> endless iterator over the tokens parsed from that variable.
_API_KEY_CYCLES: Dict[str, "itertools.cycle"] = {}
# env var name -> raw env value the cached cycle was built from, so a changed
# value is detected and the cycle rebuilt.
_API_KEY_SOURCES: Dict[str, str] = {}


def _next_api_key(env_var: str) -> str:
    """Return the next API key for `env_var`, rotating over a comma list.

    Supports the same multi-token pattern as `GITHUB_TOKEN`
    (`ghp_A,ghp_B,...`): set `RCP_TOKEN=sk-A,sk-B` and each call hands out
    the next key in order, wrapping around at the end.

    Args:
        env_var: Name of the environment variable holding one token or a
            comma-separated list of tokens.

    Returns:
        The stripped value when it contains no comma (``""`` when the
        variable is unset or empty); otherwise the next non-empty token of
        the list, or ``""`` when the list holds only separators/whitespace.

    Each env var name keeps its own cycle, so distinct providers
    (`RCP_TOKEN`, `OPENAI_API_KEY`, ...) do not share rotation state. All
    cycles are guarded by one module-level lock. The state lives in module
    globals, so each process rotates independently.

    NOTE(review): an unset variable yields ``""`` rather than ``None`` (what
    a bare ``os.getenv`` would give) -- confirm the provider receiving the
    key treats an empty string the way the caller expects.
    """
    raw = os.getenv(env_var) or ""
    if "," not in raw:
        # Single-token (or unset) value: nothing to rotate, no lock needed.
        return raw.strip()
    with _API_KEY_LOCK:
        stale = _API_KEY_SOURCES.get(env_var) != raw
        if stale or env_var not in _API_KEY_CYCLES:
            # Value is new or changed since the last call: rebuild the cycle
            # from scratch so rotation restarts at the first token.
            tokens = [t for t in (part.strip() for part in raw.split(",")) if t]
            _API_KEY_SOURCES[env_var] = raw
            if not tokens:
                _API_KEY_CYCLES.pop(env_var, None)
                return ""
            _API_KEY_CYCLES[env_var] = itertools.cycle(tokens)
        return next(_API_KEY_CYCLES[env_var])
47+
1548
# Default model configurations
1649
MODEL_CONFIGS = {
1750
"run_llm_analysis": [
@@ -555,7 +588,7 @@ def create_pydantic_ai_model(config: Dict[str, Any]):
555588
model_name,
556589
provider=OpenAIProvider(
557590
base_url=base_url,
558-
api_key=os.getenv(api_key_env),
591+
api_key=_next_api_key(api_key_env),
559592
),
560593
)
561594
elif provider == "ollama":

src/v2/api.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
compute_stats,
9292
guarantee_repo_author,
9393
infer_github_handle_parents,
94+
demote_github_props_to_units,
9495
infer_org_units,
9596
infer_owners,
9697
promote_failed_id_entities,
@@ -1086,6 +1087,16 @@ async def extract( # noqa: C901, PLR0911, PLR0912, PLR0913, PLR0915
10861087
for warning in org_unit_warnings:
10871088
_append_unique_warning(warnings, warning)
10881089

1090+
stage_started_at = perf_counter()
1091+
assembled_output, demote_warnings = demote_github_props_to_units(assembled_output)
1092+
logger.info(
1093+
"demote_github_props_to_units: demoted=%d in %.2fs",
1094+
len(demote_warnings),
1095+
perf_counter() - stage_started_at,
1096+
)
1097+
for warning in demote_warnings:
1098+
_append_unique_warning(warnings, warning)
1099+
10891100
if _concept_tagging_is_enabled() and classification.detected_type.value == "repository":
10901101
repository_context = (
10911102
gathered_context.get("repository")

src/v2/pipeline/stages/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
build_json_output,
4242
)
4343
from src.v2.pipeline.stages.ownership_check import (
44+
demote_github_props_to_units,
4445
guarantee_repo_author,
4546
infer_github_handle_parents,
4647
infer_org_units,
@@ -80,6 +81,7 @@
8081
"compute_stats",
8182
"concept_tagging_is_enabled",
8283
"concept_tagging_resolve_backend",
84+
"demote_github_props_to_units",
8385
"gather_context",
8486
"guarantee_repo_author",
8587
"hybrid_refiner_is_enabled",

src/v2/pipeline/stages/ownership_check.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,7 +1269,121 @@ def _stamp_owner_contribution(person_id: str, repo_id: str) -> None:
12691269
return new_reconciled, warnings
12701270

12711271

1272+
def _strip_github_props(entity: dict[str, Any]) -> list[str]:
    """Null out the GitHub-derived fields of `entity` in place.

    The top-level follower count and organization handle are set to ``None``
    when they currently hold anything other than ``None`` or ``""``. A
    string handle nested under a dict-valued ``identifiers`` entry is set to
    ``None`` as well.

    Returns:
        The names of the fields that were nulled, in the order they were
        processed; the nested handle is reported with an ``identifiers.``
        prefix.
    """
    handle_key = "pulse:githubOrganizationHandle"
    nulled: list[str] = []

    for prop in ("pulse:githubOrgFollowers", handle_key):
        if entity.get(prop) in (None, ""):
            # Absent or already empty: nothing to clear, nothing to report.
            continue
        entity[prop] = None
        nulled.append(prop)

    nested = entity.get("identifiers")
    if isinstance(nested, dict) and isinstance(nested.get(handle_key), str):
        nested[handle_key] = None
        nulled.append("identifiers.pulse:githubOrganizationHandle")

    return nulled
1285+
1286+
1287+
def demote_github_props_to_units(
    assembled: AssembledOutput,
) -> tuple[AssembledOutput, list[str]]:
    """Shift GitHub-derived properties from a ROR parent to its GitHub unit.

    Addresses Imaging-Plaza/git-metadata-extractor issues #29/#33, where an
    organization identified by ROR carried the follower count and handle of
    the GitHub organization listed among its units.

    Works on deep copies of the dict entities in `assembled`; the input's
    entity dicts are not mutated.

    A parent is processed only when all of the following hold:
      - it is an organization whose ``id`` starts with ``https://ror.org/``;
      - it has a GitHub organization handle;
      - its unit list references an organization entity present in the
        output whose ``id`` starts with ``https://github.com/`` and whose
        handle equals the parent's. The first such unit wins.

    For a processed parent:
      - an integer follower count is copied to the unit, but only when the
        unit does not already hold an integer count (never overwritten);
      - the GitHub-derived properties on the parent are set to ``None``.

    Returns:
        A new `AssembledOutput` holding the adjusted entities, plus one
        message per parent that had properties cleared.
    """
    root_copy: dict[str, Any] | None = None
    if isinstance(assembled.root_entity, dict):
        root_copy = deepcopy(assembled.root_entity)
    related_copy: list[Any] = [
        deepcopy(item) if isinstance(item, dict) else item
        for item in assembled.related_entities
    ]

    # Root first, then related entities, keeping only dict-shaped entries.
    entities: list[dict[str, Any]] = [] if root_copy is None else [root_copy]
    entities += [item for item in related_copy if isinstance(item, dict)]
    org_entities = [
        item for item in entities if item.get("type") == ORGANIZATION_TYPE
    ]

    # Lookup of organizations by id; a later duplicate id replaces an earlier one.
    orgs_by_id: dict[str, dict[str, Any]] = {}
    for org in org_entities:
        org_id = org.get("id")
        if isinstance(org_id, str) and org_id:
            orgs_by_id[org_id] = org

    messages: list[str] = []

    for parent in org_entities:
        parent_id = parent.get("id")
        if not (isinstance(parent_id, str) and parent_id.startswith("https://ror.org/")):
            continue
        parent_handle = _entity_github_org_handle(parent)
        if parent_handle is None:
            continue
        unit_refs = parent.get(HAS_UNIT_KEY)
        if not (isinstance(unit_refs, list) and unit_refs):
            continue

        # Find the first GitHub unit sharing the parent's handle; the handle
        # match is what shows the parent's GitHub data belongs to that unit.
        for ref in unit_refs:
            ref_id = ref.get("@id") if isinstance(ref, dict) else ref
            if not isinstance(ref_id, str):
                continue
            unit = orgs_by_id.get(ref_id)
            if unit is None or not ref_id.startswith("https://github.com/"):
                continue
            unit_handle = _entity_github_org_handle(unit)
            if unit_handle is not None and unit_handle == parent_handle:
                break
        else:
            # No unit matched: leave this parent untouched.
            continue

        follower_count = parent.get("pulse:githubOrgFollowers")
        if isinstance(follower_count, int):
            if not isinstance(unit.get("pulse:githubOrgFollowers"), int):
                unit["pulse:githubOrgFollowers"] = follower_count

        cleared = _strip_github_props(parent)
        if cleared:
            messages.append(
                f"Demoted GitHub-derived properties from ROR parent {parent_id} "
                f"to unit {unit.get('id')} (handle '{parent_handle}'): "
                f"{', '.join(cleared)}.",
            )

    result = AssembledOutput(
        root_entity=root_copy if root_copy is not None else assembled.root_entity,
        related_entities=related_copy,
        excluded_entities=list(assembled.excluded_entities),
        warnings=list(assembled.warnings),
    )
    return result, messages
1383+
1384+
12721385
__all__ = [
1386+
"demote_github_props_to_units",
12731387
"guarantee_repo_author",
12741388
"infer_github_handle_parents",
12751389
"infer_org_units",

0 commit comments

Comments
 (0)