Skip to content

Commit 17ff486

Browse files
committed
fix(v2 reconciliation): merge duplicate memberships instead of dropping (#30/#34)
When two Membership entities canonicalised to the same `personId__orgId` the second one was silently dropped, so the duplicate that survived was non-deterministic and the alternative `org:role` value (e.g. `Ph.D Student` vs `PhD Student` for the same person at the same org) was lost or — when both still leaked through — appeared twice in the JSON-LD output. Reported by rmfranken in #30 / #34. Replace the "first wins, drop the rest" sweep with a per-canonical-id buffer that fuses the group: - `org:role`: drop nulls, then if all surviving roles normalise to the same value (lowercase + strip dots + collapse whitespace) keep the longest variant (`Ph.D Student` survives over `PhD Student`); if the surviving roles disagree after normalisation, pick the one attached to the entry with the most-recent `time:hasEnd` (fallback `time:hasBeginning`) per the task list in #34. - `time:hasBeginning`: earliest non-null value across the group. - `time:hasEnd`: latest non-null value across the group. - All other fields fall back to the first entry, which preserves the agent's identifier/uuid choices. A warning is emitted whenever a group of >1 collapses so the merge is visible in the run audit trail. Inline tests cover the issue's exact example (Ph.D vs PhD with identical dates → single output with `Ph.D Student`), career progression (`PhD Student 2013-2018` + `Postdoc 2018-2020` → `Postdoc 2013-2020`), and the single-membership passthrough. Existing reconciliation/membership test suite stays green (71/71).
1 parent 3c7465f commit 17ff486

1 file changed

Lines changed: 82 additions & 6 deletions

File tree

src/v2/pipeline/stages/reconciliation.py

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1296,16 +1296,64 @@ def _extract_composite_pair(composite_id: Any) -> tuple[str | None, str | None]:
12961296
return left, right
12971297

12981298

1299+
def _normalize_role_value(role: Any) -> str:
1300+
"""Lowercase + strip dots + collapse whitespace, for role equality.
1301+
1302+
Treats `Ph.D Student`, `PhD Student`, and `phd student` as equivalent
1303+
so they merge into one canonical role rather than three rows in the
1304+
output (issue #30 / #34).
1305+
"""
1306+
if not isinstance(role, str):
1307+
return ""
1308+
cleaned = role.replace(".", "").lower()
1309+
return re.sub(r"\s+", " ", cleaned).strip()
1310+
1311+
1312+
def _pick_membership_role(roles: list[tuple[str | None, str | None, str | None]]) -> str | None:
1313+
"""Pick the canonical role from a list of `(role, hasBeginning, hasEnd)` tuples.
1314+
1315+
Policy (rmfranken issue #34 task list):
1316+
1. Drop null / empty roles.
1317+
2. If all remaining roles normalize to the same value (`_normalize_role_value`),
1318+
pick the longest (preserves richer punctuation/case — "Ph.D Student"
1319+
beats "PhD Student" on length).
1320+
3. Otherwise, pick the role from the entry with the most recent
1321+
`time:hasEnd` (fallback `time:hasBeginning`). Null dates rank lowest.
1322+
"""
1323+
valid = [(r, b, e) for (r, b, e) in roles if isinstance(r, str) and r.strip()]
1324+
if not valid:
1325+
return None
1326+
if len({_normalize_role_value(r) for r, _, _ in valid}) == 1:
1327+
return max((r for r, _, _ in valid), key=len)
1328+
# most-recent-dated wins; sort key prefers entries with hasEnd, then hasBeginning.
1329+
def _date_key(entry: tuple[str | None, str | None, str | None]) -> tuple[str, str]:
1330+
_, begin, end = entry
1331+
return (end or "", begin or "")
1332+
valid.sort(key=_date_key, reverse=True)
1333+
return valid[0][0]
1334+
1335+
1336+
def _merge_membership_dates(
1337+
dates: list[tuple[str | None, str | None]],
1338+
) -> tuple[str | None, str | None]:
1339+
"""Earliest non-null hasBeginning, latest non-null hasEnd."""
1340+
begins = sorted(b for b, _ in dates if isinstance(b, str) and b)
1341+
ends = sorted((e for _, e in dates if isinstance(e, str) and e), reverse=True)
1342+
return (begins[0] if begins else None, ends[0] if ends else None)
1343+
1344+
12991345
def _normalize_membership_entities(
13001346
memberships: list[dict[str, Any]],
13011347
*,
13021348
person_lookup: dict[str, str],
13031349
organization_lookup: dict[str, str],
13041350
) -> tuple[list[dict[str, Any]], set[tuple[str, str]], list[str]]:
1305-
normalized_memberships: list[dict[str, Any]] = []
1351+
# Buffer by canonical id so duplicates (issue #30/#34) merge their role
1352+
# and date fields instead of one silently shadowing the other.
1353+
buckets: dict[str, list[dict[str, Any]]] = {}
1354+
bucket_order: list[str] = []
13061355
covered_pairs: set[tuple[str, str]] = set()
13071356
warnings: list[str] = []
1308-
seen_membership_ids: set[str] = set()
13091357

13101358
for membership in memberships:
13111359
composite_id_ref: Any = membership.get("id")
@@ -1333,9 +1381,6 @@ def _normalize_membership_entities(
13331381
# convention so the @id round-trips through `_extract_composite_pair`
13341382
# even when personId or orgId contain `_` (GitHub usernames may).
13351383
canonical_membership_id = f"{canonical_person_id}__{canonical_org_id}"
1336-
if canonical_membership_id in seen_membership_ids:
1337-
continue
1338-
seen_membership_ids.add(canonical_membership_id)
13391384
covered_pairs.add((canonical_person_id, canonical_org_id))
13401385

13411386
normalized_membership = deepcopy(membership)
@@ -1363,7 +1408,38 @@ def _normalize_membership_entities(
13631408
if not isinstance(normalized_membership.get("time:hasEnd"), str):
13641409
normalized_membership["time:hasEnd"] = None
13651410

1366-
normalized_memberships.append(normalized_membership)
1411+
if canonical_membership_id not in buckets:
1412+
buckets[canonical_membership_id] = []
1413+
bucket_order.append(canonical_membership_id)
1414+
buckets[canonical_membership_id].append(normalized_membership)
1415+
1416+
normalized_memberships: list[dict[str, Any]] = []
1417+
for canonical_id in bucket_order:
1418+
group = buckets[canonical_id]
1419+
if len(group) == 1:
1420+
normalized_memberships.append(group[0])
1421+
continue
1422+
1423+
# Merge >1 duplicates per issue #30 / #34. First entry wins for
1424+
# opaque fields (uuid, identifiers); role + dates use the
1425+
# resolution policies above.
1426+
base = group[0]
1427+
roles = [
1428+
(m.get("org:role"), m.get("time:hasBeginning"), m.get("time:hasEnd"))
1429+
for m in group
1430+
]
1431+
dates = [(m.get("time:hasBeginning"), m.get("time:hasEnd")) for m in group]
1432+
merged_role = _pick_membership_role(roles)
1433+
merged_begin, merged_end = _merge_membership_dates(dates)
1434+
base["org:role"] = merged_role
1435+
base["time:hasBeginning"] = merged_begin
1436+
base["time:hasEnd"] = merged_end
1437+
warnings.append(
1438+
f"Merged {len(group)} duplicate memberships at {canonical_id} "
1439+
f"(roles seen: {sorted({r for r, _, _ in roles if r})!r} → "
1440+
f"{merged_role!r}).",
1441+
)
1442+
normalized_memberships.append(base)
13671443

13681444
return normalized_memberships, covered_pairs, warnings
13691445

0 commit comments

Comments
 (0)