@@ -72,6 +72,102 @@ def _list_of_dicts(value: Any, *, max_items: int = MAX_CONTEXT_ENTITIES) -> list
7272 return collected
7373
7474
75+ def _country_code_from_org (organization : Any ) -> str | None :
76+ """Best-effort 2-letter country code from an org payload.
77+
78+ Looks at common shapes produced by upstream stages: a bare
79+ ``country_code``, a nested ``country.country_code`` (ROR's own
80+ serialisation), or an ``addresses[0].country_code`` (also ROR).
81+ Case-normalised; returns ``None`` when no plausible code is found.
82+ """
83+ if not isinstance (organization , dict ):
84+ return None
85+ direct = organization .get ("country_code" )
86+ if isinstance (direct , str ) and len (direct .strip ()) == 2 :
87+ return direct .strip ().upper ()
88+ country = organization .get ("country" )
89+ if isinstance (country , dict ):
90+ nested = country .get ("country_code" ) or country .get ("code" )
91+ if isinstance (nested , str ) and len (nested .strip ()) == 2 :
92+ return nested .strip ().upper ()
93+ addresses = organization .get ("addresses" )
94+ if isinstance (addresses , list ) and addresses :
95+ first = addresses [0 ]
96+ if isinstance (first , dict ):
97+ code = first .get ("country_code" ) or first .get ("code" )
98+ if isinstance (code , str ) and len (code .strip ()) == 2 :
99+ return code .strip ().upper ()
100+ return None
101+
102+
103+ def _collect_allowed_org_ids (
104+ * ,
105+ target_organizations : list [dict [str , Any ]],
106+ known_organizations : Any ,
107+ organization_derivations : Any ,
108+ ) -> set [str ]:
109+ """Return the set of org `@id`s the LLM is allowed to point Memberships at.
110+
111+ The membership agent's job is to express employment/affiliation for a
112+ target person inside the graph that other agents have already built.
113+ A Membership target outside this set is, in practice, an LLM-invented
114+ cross-reference (typically from `query_orcid` returning a name-fuzzy
115+ hit at an unrelated company). Returns an empty set when no
116+ organizations have been surfaced at all — then the filter degrades
117+ open (lets the LLM's choice through) so we don't strangle deployments
118+ that genuinely run without an org-detection upstream.
119+ """
120+ allowed : set [str ] = set ()
121+ for org in target_organizations :
122+ if isinstance (org , dict ):
123+ oid = org .get ("id" ) or org .get ("@id" )
124+ if isinstance (oid , str ) and oid .strip ():
125+ allowed .add (oid .strip ())
126+ for collection in (known_organizations , organization_derivations ):
127+ if not isinstance (collection , list ):
128+ continue
129+ for org in collection :
130+ if isinstance (org , dict ):
131+ oid = org .get ("id" ) or org .get ("@id" )
132+ if isinstance (oid , str ) and oid .strip ():
133+ allowed .add (oid .strip ())
134+ return allowed
135+
136+
def _infer_target_country_code(
    *,
    target_organizations: list[dict[str, Any]],
    repository_context: Any,
    pipeline_outputs: Any,
) -> str | None:
    """Pick the country code the LLM should default-bias toward.

    Org payloads are probed in priority order and the first one that
    yields a country code (via ``_country_code_from_org``) wins:

    1. each entry of ``target_organizations``, in order (``None`` is
       treated as empty);
    2. ``repository_context["owning_org"]``, then
       ``repository_context["owner"]``, when ``repository_context`` is a
       dict;
    3. ``pipeline_outputs["organization"]``, then
       ``pipeline_outputs["owning_organization"]``, when
       ``pipeline_outputs`` is a dict.

    Every candidate is tried on its own, so a payload that is present but
    carries no usable country code does not block the next candidate.

    Returns ``None`` when nothing is grounded — the LLM then has no bias
    and falls back to the prompt's general counter-rules without a
    country default.
    """
    candidates: list[Any] = list(target_organizations or ())

    if isinstance(repository_context, dict):
        candidates.append(repository_context.get("owning_org"))
        candidates.append(repository_context.get("owner"))

    if isinstance(pipeline_outputs, dict):
        candidates.append(pipeline_outputs.get("organization"))
        candidates.append(pipeline_outputs.get("owning_organization"))

    for candidate in candidates:
        # The helper returns None for anything that is not a dict, so
        # missing keys and odd payload shapes are skipped safely here.
        code = _country_code_from_org(candidate)
        if code:
            return code
    return None
169+
170+
75171class LLMMembershipAgentV2 :
76172 """LLM-backed agent that produces an org:Membership entity."""
77173
@@ -136,6 +232,21 @@ async def run(
136232 if isinstance (pipeline_outputs , dict ) and pipeline_outputs :
137233 llm_input ["pipeline_outputs" ] = pipeline_outputs
138234
235+ # Country prior derived from the repo's owning org. Without
236+ # this, the LLM happily stamps Memberships to RaySearch (SE),
237+ # 10X Genomics (SE), Volvo Cars (SE), Spotify (SE), Statistics
238+ # Botswana, etc. for GitHub contributors of EPFL/SDSC repos
239+ # whose usernames merely *resemble* unrelated company slugs.
240+ # The prompt's "Counter-rules" section instructs the model to
241+ # default-reject candidate orgs whose ROR country differs.
242+ target_country_code = _infer_target_country_code (
243+ target_organizations = normalized_target_organizations ,
244+ repository_context = repository_context ,
245+ pipeline_outputs = pipeline_outputs ,
246+ )
247+ if target_country_code :
248+ llm_input ["target_country_code" ] = target_country_code
249+
139250 context_json = json .dumps (llm_input , ensure_ascii = True , sort_keys = True , default = str )
140251 user_prompt = _USER_PROMPT_TEMPLATE .replace ("{context_json}" , context_json )
141252 user_prompt = append_runtime_prompt_context (user_prompt , context )
@@ -198,6 +309,56 @@ def _safe_date(value: object) -> str | None:
198309 if beg and end and beg > end :
199310 payload ["time:hasBeginning" ], payload ["time:hasEnd" ] = end , beg
200311
312+ # Spurious-membership filter — code-level enforcement of the
313+ # emit-or-skip prompt rules. The LLM was observed ignoring the
314+ # counter-rules in `system_prompt.md`: it still stamps
315+ # Memberships to ROR orgs that only matched via a `query_orcid`
316+ # name fuzzy hit (e.g. `jamalsenouci` (GitHub contributor of an
317+ # EPFL repo) → Spotify (ror.org/00hbd6420) just because some
318+ # *other* "Jamal Senouci" works at Spotify). Reject any
319+ # Membership whose target org isn't reachable from the
320+ # context — the cost of a false negative (legitimate ORCID
321+ # employment dropped because the org wasn't surfaced upstream)
322+ # is far smaller than the false-positive misattribution noise.
323+ target_org_ref = payload .get ("org:organization" )
324+ if isinstance (target_org_ref , dict ):
325+ target_org_id = target_org_ref .get ("@id" ) or target_org_ref .get ("id" )
326+ else :
327+ target_org_id = target_org_ref
328+ if isinstance (target_org_id , str ) and target_org_id .strip ():
329+ allowed_org_ids = _collect_allowed_org_ids (
330+ target_organizations = normalized_target_organizations ,
331+ known_organizations = llm_input .get ("known_organizations" , []),
332+ organization_derivations = llm_input .get ("organization_derivations" , []),
333+ )
334+ if allowed_org_ids and target_org_id .strip () not in allowed_org_ids :
335+ warning = (
336+ f"llm_membership: dropping Membership { payload .get ('id' )!r} — "
337+ f"target org { target_org_id !r} not present in known_organizations "
338+ "(likely a spurious `query_orcid` name match; the LLM ignored "
339+ "the emit-or-skip rules in the system prompt)."
340+ )
341+ logger .info (warning )
342+ return AgentResult (
343+ data = {},
344+ warnings = [warning ],
345+ raw_output = {},
346+ model = llm_result .model ,
347+ provider = llm_result .provider ,
348+ tokens_prompt = llm_result .tokens_prompt ,
349+ tokens_completion = llm_result .tokens_completion ,
350+ stats = {
351+ "agent_runtime" : "llm" ,
352+ "memberships" : [],
353+ "membership_count" : 0 ,
354+ "derivation" : {
355+ "membership_seed" : membership_seed ,
356+ "skipped_reason" : "org_not_in_context" ,
357+ "target_org_id" : target_org_id ,
358+ },
359+ },
360+ )
361+
201362 raw_output = deepcopy (payload )
202363 validation_warnings = _strict_validate (payload )
203364
0 commit comments