Skip to content

Commit 2ff9b6f

Browse files
committed
fix(v2 output_assembly): final-pass schema:url self-loop sweep
Empirical batch 4 (post-b3101f8) showed Bug P self-loops STILL materialising on non-EPFL Persons across 4 separate runs (pyffs 1/1, detect-libc 3/8, gimie 3/10, pandoc 14/31). The pattern: zero self-loops on EPFL-affiliated repos, all of them on github-only contributors. Root cause: the LLM person agent emits `payload["id"] = "<urn-or-name>"` and `payload["schema:url"] = "https://github.com/<login>"` separately; my agent-level guard from 97285c1 ran AT THE AGENT, where those two fields didn't match yet. Downstream canonicalisation then resolved `id` to the github URL — and the self-loop materialised AFTER all the agent-level guards had already run. Fix at `output_assembly`, which is the first stage where every entity is in its final canonical form (post-id-resolution, pre-jsonld-build). `_drop_person_url_self_loops()` walks Person entities, unwraps both the bare-string and `{"@id": ...}` shapes of `schema:url`, and pops the field when the value equals the resolved `id`. Applied to both the root entity and `related_entities`. Mutates in place — the upstream payloads have already been deep-copied at this point. Companion: regenerated `tests/v2/golden/extract/*.json` because the octocat user/Person fixture had a self-loop the sweep now drops.
1 parent b3101f8 commit 2ff9b6f

3 files changed

Lines changed: 47 additions & 6 deletions

File tree

src/v2/pipeline/stages/output_assembly.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,20 @@ def assemble_output( # noqa: C901, PLR0912
230230
related_entities, dedup_warnings = _merge_duplicate_entities(related_entities)
231231
warnings.extend(dedup_warnings)
232232

233+
# Person `schema:url` self-loop drop — final canonical-form sweep.
234+
# The LLM person agent's own guard runs BEFORE canonicalization
235+
# has rewritten `id`, so when the LLM emits
236+
# `"id": "<urn>", "schema:url": "https://github.com/X"` the
237+
# equality check fails at agent time. Downstream stages then
238+
# resolve `id` to the github URL and the self-loop materialises in
239+
# the final graph. Re-check here, after every id has been
240+
# canonicalised, and drop the redundant url. Production audit
241+
# observed 14 self-loops in `gabyx/pandoc` alone after the
242+
# agent-level fixes shipped.
243+
_drop_person_url_self_loops(related_entities)
244+
if isinstance(root_entity, dict):
245+
_drop_person_url_self_loops([root_entity])
246+
233247
# Article ↔ Repo linkback. Production audit (Bug V) found
234248
# `schema:citation` literally `null` in 441/441 repos, including
235249
# the 19 repos that successfully extracted ``schema:ScholarlyArticle``
@@ -256,6 +270,38 @@ def assemble_output( # noqa: C901, PLR0912
256270
)
257271

258272

273+
def _drop_person_url_self_loops(entities: list[dict[str, Any]]) -> None:
274+
"""Mutate Persons in-place to drop `schema:url` self-loops.
275+
276+
A self-loop is `schema:url == id` (in either the bare-string or
277+
`{"@id": ...}` form). Mutates the list in place since this stage
278+
already deep-copies upstream payloads.
279+
"""
280+
for entity in entities:
281+
if not isinstance(entity, dict):
282+
continue
283+
type_value = entity.get("type") or entity.get("@type")
284+
if isinstance(type_value, list):
285+
is_person = any("schema:Person" in str(t) for t in type_value)
286+
else:
287+
is_person = isinstance(type_value, str) and "schema:Person" in type_value
288+
if not is_person:
289+
continue
290+
pid = entity.get("id") or entity.get("@id")
291+
if not isinstance(pid, str):
292+
continue
293+
url = entity.get("schema:url")
294+
target: str | None = None
295+
if isinstance(url, str):
296+
target = url.strip()
297+
elif isinstance(url, dict):
298+
inner = url.get("@id") or url.get("id")
299+
if isinstance(inner, str):
300+
target = inner.strip()
301+
if target is not None and target == pid.strip():
302+
entity.pop("schema:url", None)
303+
304+
259305
def _link_articles_to_root_repo(
260306
root_entity: Any,
261307
related_entities: list[dict[str, Any]],

tests/v2/golden/extract/repo_github_com_owner_repo.json

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,6 @@
114114
"@id": "https://github.com/octocat",
115115
"@type": "schema:Person",
116116
"schema:name": "The Octocat",
117-
"schema:url": {
118-
"@id": "https://github.com/octocat"
119-
},
120117
"pulse:githubUsername": "octocat",
121118
"pulse:orcidIdentifier": null,
122119
"pulse:infosciencePersonIdentifier": null,
@@ -178,7 +175,7 @@
178175
"warnings": [],
179176
"stats": {
180177
"entities_count": 3,
181-
"triples_count": 21,
178+
"triples_count": 20,
182179
"stages_completed": [
183180
"context_gather",
184181
"repo_agent",

tests/v2/golden/extract/user_github_com_username.json

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
},
1515
"idSource": "pulse:githubUsername",
1616
"schema:name": "The Octocat",
17-
"schema:url": "https://github.com/octocat",
1817
"pulse:githubUsername": "octocat",
1918
"pulse:orcidIdentifier": null,
2019
"pulse:infosciencePersonIdentifier": null,
@@ -37,7 +36,6 @@
3736
},
3837
"idSource": "pulse:githubUsername",
3938
"schema:name": "The Octocat",
40-
"schema:url": "https://github.com/octocat",
4139
"pulse:githubUsername": "octocat",
4240
"pulse:orcidIdentifier": null,
4341
"pulse:infosciencePersonIdentifier": null,

0 commit comments

Comments
 (0)