From f7cc1b1175c86f92923ff6661970d0f0ae47027e Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Wed, 6 May 2026 16:17:52 +0200 Subject: [PATCH 01/23] feat: ObservationRow class --- src/omop_etl/omop/builders/observation.py | 2 ++ src/omop_etl/omop/models/rows.py | 28 +++++++++++++++++++ .../omop/builders/test_observation_builder.py | 0 3 files changed, 30 insertions(+) create mode 100644 src/omop_etl/omop/builders/observation.py create mode 100644 tests/omop/builders/test_observation_builder.py diff --git a/src/omop_etl/omop/builders/observation.py b/src/omop_etl/omop/builders/observation.py new file mode 100644 index 0000000..6d35dfd --- /dev/null +++ b/src/omop_etl/omop/builders/observation.py @@ -0,0 +1,2 @@ +# cdm spec +# diff --git a/src/omop_etl/omop/models/rows.py b/src/omop_etl/omop/models/rows.py index 41af420..cd3bf8c 100644 --- a/src/omop_etl/omop/models/rows.py +++ b/src/omop_etl/omop/models/rows.py @@ -243,3 +243,31 @@ class MeasurementRow: def validate(self): validate_required_fields(self) + + +@pd_dataclass(frozen=True, slots=True) +class ObservationRow: + observation_id: int + person_id: int + observation_concept_id: int + observation_date: dt.date + observation_type_concept_id: int + observation_datetime: dt.datetime | None = None + value_as_number: float | None = None + value_as_string: Annotated[str | None, pd_field(max_length=60)] = None + value_as_concept_id: int | None = None + qualifier_concept_id: int | None = None + unit_concept_id: int | None = None + provider_id: int | None = None + visit_occurrence_id: int | None = None + visit_detail_id: int | None = None + observation_source_value: Annotated[str | None, pd_field(max_length=50)] = None + observation_source_concept_id: int | None = None + unit_source_value: Annotated[str | None, pd_field(max_length=50)] = None + qualifier_source_value: Annotated[str | None, pd_field(max_length=50)] = None + value_source_value: Annotated[str | None, 
pd_field(max_length=50)] = None + observation_event_id: int | None = None + obs_event_field_concept_id: int | None = None + + def validate(self): + validate_required_fields(self) diff --git a/tests/omop/builders/test_observation_builder.py b/tests/omop/builders/test_observation_builder.py new file mode 100644 index 0000000..e69de29 From 7d464b27d4e6c863601abbb1914ad891ccd08413 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Thu, 7 May 2026 12:08:08 +0200 Subject: [PATCH 02/23] feat: device exposure row, observation builder deferred until other domain builder completed --- src/omop_etl/omop/builders/observation.py | 60 ++++++++++++++++++++++- src/omop_etl/omop/models/rows.py | 26 ++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/src/omop_etl/omop/builders/observation.py b/src/omop_etl/omop/builders/observation.py index 6d35dfd..399dd6d 100644 --- a/src/omop_etl/omop/builders/observation.py +++ b/src/omop_etl/omop/builders/observation.py @@ -1,2 +1,60 @@ # cdm spec -# + +# so this table should contain all observations that are not populating: +# measurement, drug exposure, condition, procedure occurrence, device occurrence, condition occurrence etc. +# they *cannot* be of domain: Condition, Procedure, Drug, Specimen, Measurement or Device. + +# so from the patient data this leaves what domain classes / patient scalars, and what fields and domains? +# think optimally do this table last, easier that way. 
+ +# todo: implement condition and device builders first: +# anything that lands in condition, procedure, drug, measurement or device can't go in observation + +# anyways, can use: +# cohort_name, +# evaluable_for_efficacy_analysis +# has_any_adverse_events +# number_of_adverse_events +# number_of_serious_adverse_events +# has_clinical_benefit_at_week_16 +# end_of_treatment_reason +# end_of_treatment_date +# lost_to_followup (bool, date) +# best_overall_response (but need non-measurement concepts) + +# maybe: +# AE outcome, AE was serious, AE related to treatment status, AE turned serious date, +# tumor assessment date of progression, nadir, etc? +# study drugs? + +""" +ObservationRow: + required fields: + observation_id: unique ID for each entry + person_id: person id from service + observation_concept_id: semantic mapped concept id + There is no specified domain that the Concepts in this table must adhere to. + The only rule is that records with Concepts in the Condition, Procedure, Drug, + Measurement, or Device domains MUST go to the corresponding table. 
+ - so we grab + observation_date: date, required + observation_type_concept_id: + + optional fields: + observation_datetime: dt.datetime | None = None + value_as_number: float | None = None + value_as_string: Annotated[str | None, pd_field(max_length=60)] = None + value_as_concept_id: int | None = None + qualifier_concept_id: int | None = None + unit_concept_id: int | None = None + provider_id: int | None = None + visit_occurrence_id: int | None = None + visit_detail_id: int | None = None + observation_source_value: Annotated[str | None, pd_field(max_length=50)] = None + observation_source_concept_id: int | None = None + unit_source_value: Annotated[str | None, pd_field(max_length=50)] = None + qualifier_source_value: Annotated[str | None, pd_field(max_length=50)] = None + value_source_value: Annotated[str | None, pd_field(max_length=50)] = None + observation_event_id: int | None = None + obs_event_field_concept_id: int | None = None +""" diff --git a/src/omop_etl/omop/models/rows.py b/src/omop_etl/omop/models/rows.py index cd3bf8c..a558256 100644 --- a/src/omop_etl/omop/models/rows.py +++ b/src/omop_etl/omop/models/rows.py @@ -271,3 +271,29 @@ class ObservationRow: def validate(self): validate_required_fields(self) + + +@pd_dataclass(frozen=True, slots=True) +class DeviceExposureRow: + device_exposure_id: int + person_id: int + device_concept_id: int + device_exposure_start_date: dt.date + device_type_concept_id: int + device_exposure_start_datetime: dt.datetime | None = None + device_exposure_end_date: dt.date | None = None + device_exposure_end_datetime: dt.datetime | None = None + unique_device_id: Annotated[str | None, pd_field(max_length=255)] = None + production_id: Annotated[str | None, pd_field(max_length=255)] = None + quantity: int | None = None + provider_id: int | None = None + visit_occurrence_id: int | None = None + visit_detail_id: int | None = None + device_source_value: Annotated[str | None, pd_field(max_length=255)] = None + 
device_source_concept_id: int | None = None + unit_concept_id: int | None = None + unit_source_value: int | None = None + unit_source_concept_id: int | None = None + + def validate(self): + validate_required_fields(self) From 1ad87b99c1502cc3c475ef0c90271b5a9fea9776 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Fri, 8 May 2026 17:13:21 +0200 Subject: [PATCH 03/23] feat: sequence_id on AdverseEvent domain --- src/omop_etl/harmonization/harmonizers/impress.py | 3 +++ .../harmonization/models/domain/adverse_event.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/omop_etl/harmonization/harmonizers/impress.py b/src/omop_etl/harmonization/harmonizers/impress.py index bea4852..b86e529 100644 --- a/src/omop_etl/harmonization/harmonizers/impress.py +++ b/src/omop_etl/harmonization/harmonizers/impress.py @@ -1044,6 +1044,7 @@ def _process_adverse_events(self) -> pl.DataFrame | None: "AE_AESERCD", "AE_SAEEXP1CD", "AE_SAEEXP2CD", + "AE_AESPID", "FU_FUPDEDAT", "TR_TRNAME", "TR_TRTNO", @@ -1054,6 +1055,7 @@ def parse_events(frame: pl.DataFrame) -> pl.DataFrame: PolarsParsers.to_optional_date(pl.col("AE_AESTDAT")).alias(cols.START_DATE), PolarsParsers.to_optional_date(pl.col("AE_AEENDAT")).alias(cols.END_DATE), PolarsParsers.to_optional_date(pl.col("AE_SAESTDAT")).alias(cols.TURNED_SERIOUS_DATE), + PolarsParsers.to_optional_int64(pl.col("AE_AESPID")).alias(cols.SEQUENCE_ID), PolarsParsers.int_to_bool( true_int=1, false_int=0, @@ -1137,6 +1139,7 @@ def coerce(frame: pl.DataFrame) -> pl.DataFrame: cols.TREATMENT_2_NAME, cols.WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_1, cols.WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_2, + cols.SEQUENCE_ID, ) ) diff --git a/src/omop_etl/harmonization/models/domain/adverse_event.py b/src/omop_etl/harmonization/models/domain/adverse_event.py index b22100f..1d19f23 100644 --- a/src/omop_etl/harmonization/models/domain/adverse_event.py +++ 
b/src/omop_etl/harmonization/models/domain/adverse_event.py @@ -19,6 +19,7 @@ class Fields: OUTCOME = "outcome" START_DATE = "start_date" END_DATE = "end_date" + SEQUENCE_ID = "sequence_id" WAS_SERIOUS = "was_serious" TURNED_SERIOUS_DATE = "turned_serious_date" RELATED_TO_TREATMENT_1_STATUS = "related_to_treatment_1_status" @@ -37,6 +38,7 @@ def __init__(self, patient_id: str): self._outcome: str | None = None self._start_date: dt.date | None = None self._end_date: dt.date | None = None + self._sequence_id: int | None = None self._was_serious: bool | None = None self._turned_serious_date: dt.date | None = None self._related_to_treatment_1_status: RelatedStatus | None = None @@ -111,6 +113,18 @@ def end_date(self, value: dt.date | None) -> None: validator=StrictValidators.validate_optional_date, ) + @property + def sequence_id(self) -> int | None: + return self._sequence_id + + @sequence_id.setter + def sequence_id(self, value: int | None) -> None: + self._set_validated_prop( + prop=self.__class__.sequence_id, + value=value, + validator=StrictValidators.validate_optional_int, + ) + @property def was_serious(self) -> bool | None: return self._was_serious From 6e803cb2622afb084526c677b7514c14479f3799 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Fri, 8 May 2026 19:49:01 +0200 Subject: [PATCH 04/23] wip: refactoring base domains to use foreign keys, updating invariants, enforce sorting in instantiation, etc --- .../harmonization/harmonizers/impress.py | 12 ++++--- .../models/domain/adverse_event.py | 2 ++ .../harmonization/models/domain/base.py | 24 ++++++++++--- .../models/domain/medical_history.py | 6 +++- .../models/domain/previous_treatments.py | 20 ++++++++++- src/omop_etl/harmonization/models/patient.py | 10 +++--- src/omop_etl/omop/builders/base.py | 2 +- .../omop/builders/condition_occurrence.py | 22 +++++++++--- src/omop_etl/omop/builders/drug_exposure.py | 14 ++++---- .../omop/builders/procedure_occurrence.py 
| 14 ++++---- .../semantic_mapping/core/semantic_config.py | 6 ++-- tests/harmonization/conftest.py | 5 +++ .../harmonization/harmonizers/test_impress.py | 4 +-- .../models/test_schema_validation.py | 9 +++-- .../builders/test_drug_exposure_builder.py | 24 ++++++------- .../builders/test_procedure_occurrence.py | 34 +++++++++---------- tests/omop/test_service.py | 6 ++-- 17 files changed, 138 insertions(+), 76 deletions(-) diff --git a/src/omop_etl/harmonization/harmonizers/impress.py b/src/omop_etl/harmonization/harmonizers/impress.py index b86e529..39b781c 100644 --- a/src/omop_etl/harmonization/harmonizers/impress.py +++ b/src/omop_etl/harmonization/harmonizers/impress.py @@ -14,7 +14,7 @@ from omop_etl.harmonization.models.domain.eq5d import EQ5D from omop_etl.harmonization.models.domain.followup import FollowUp from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment import TumorAssessment @@ -719,9 +719,9 @@ def merge_medical_history(base: pl.DataFrame, processed: pl.DataFrame) -> pl.Dat return merged - @collection(PreviousTreatments, order_by=("start_date",), require_order_by=True) + @collection(PreviousTreatment, order_by=("start_date",), require_order_by=True) def _process_previous_treatments(self) -> pl.DataFrame | None: - cols = PreviousTreatments.Fields + cols = PreviousTreatment.Fields ct_base = self.data.select( "SubjectId", "CT_CTTYPE", @@ -1024,7 +1024,11 @@ def filter_concomitant_data(frame: pl.DataFrame) -> pl.DataFrame: return filtered - @collection(AdverseEvent, order_by=("start_date",), require_order_by=True) + 
@collection( + AdverseEvent, + order_by=("start_date", "sequence_id"), + require_order_by=True, + ) def _process_adverse_events(self) -> pl.DataFrame | None: cols = AdverseEvent.Fields ae_base = self.data.select( diff --git a/src/omop_etl/harmonization/models/domain/adverse_event.py b/src/omop_etl/harmonization/models/domain/adverse_event.py index 1d19f23..7f79b12 100644 --- a/src/omop_etl/harmonization/models/domain/adverse_event.py +++ b/src/omop_etl/harmonization/models/domain/adverse_event.py @@ -30,6 +30,7 @@ class Fields: WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_2 = "was_serious_grade_expected_treatment_2" INVARIANT_FIELDS = (Fields.TERM,) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID, Fields.TERM) def __init__(self, patient_id: str): self._patient_id = patient_id @@ -232,6 +233,7 @@ def __repr__(self, delim=",") -> str: f"outcome={self.outcome!r}{delim} " f"start_date={self.start_date!r}{delim} " f"end_date={self.end_date!r}{delim} " + f"sequence_id={self.sequence_id!r}{delim}" f"was_serious={self.was_serious!r}{delim} " f"turned_serious_date={self.turned_serious_date!r}{delim} " f"related_to_treatment_1_status={self.related_to_treatment_1_status!r}{delim} " diff --git a/src/omop_etl/harmonization/models/domain/base.py b/src/omop_etl/harmonization/models/domain/base.py index 5a07e25..120950d 100644 --- a/src/omop_etl/harmonization/models/domain/base.py +++ b/src/omop_etl/harmonization/models/domain/base.py @@ -9,18 +9,20 @@ class DomainBase(TrackedValidated, ABC): Base class for all domain models with schema contract support. 
Subclasses must define: - - `class Fields:` with string constants for canonical field names (wire schema) + - `class Fields:` with string constants for canonical field names (schema from processor to domain) Subclasses may optionally define: - - INVARIANT_FIELDS` tuple referencing Fields constants for materiality (the domains' invariants) filtering + - `INVARIANT_FIELDS` tuple referencing Fields constants for materiality (the domains' invariants) filtering + - `NATURAL_KEY_FIELDS` tuple referencing Fields that make up the natural key for the domain subclass """ # internal cache, use data_fields() method to access _data_fields: ClassVar[tuple[str, ...] | None] = None _schema_validated: ClassVar[bool] = False - # optional + # collection and singleton subclasses override INVARIANT_FIELDS: ClassVar[tuple[str, ...]] = () + NATURAL_KEY_FIELDS: ClassVar[tuple[str, ...]] = () @abstractmethod def __init__(self, patient_id: str) -> None: # noqa @@ -33,6 +35,12 @@ def __init_subclass__(cls, **kwargs: Any) -> None: cls._data_fields = None cls._schema_validated = False + def natural_key(self) -> tuple: + return tuple(getattr(self, f) for f in self.NATURAL_KEY_FIELDS) + + def invariant_fields(self) -> tuple: + return tuple(getattr(self, f) for f in self.INVARIANT_FIELDS) + @classmethod def _derive_data_fields(cls) -> tuple[str, ...]: """Derive data fields from Fields inner class string constants.""" @@ -63,9 +71,15 @@ def _ensure_schema(cls) -> None: if len(fields) != len(set(fields)): raise ValueError(f"{cls.__name__}.data_fields has duplicates") + field_set = set(fields) + invariant = set(cls.INVARIANT_FIELDS) - if invariant and not invariant.issubset(set(fields)): - raise ValueError(f"{cls.__name__}.INVARIANT_FIELDS not subset of data_fields: {invariant - set(fields)}") + if invariant and not invariant.issubset(field_set): + raise ValueError(f"{cls.__name__}.INVARIANT_FIELDS not a subset of data_fields: {invariant - field_set}") + + natural_key = set(cls.NATURAL_KEY_FIELDS) + 
if natural_key and not natural_key.issubset(field_set): + raise ValueError(f"{cls.__name__}.NATURAL_KEY_FIELDS not a subset of data_fields: {natural_key - field_set}") # validate every Fields value matches an actual property on the class fields_cls = getattr(cls, "Fields", None) diff --git a/src/omop_etl/harmonization/models/domain/medical_history.py b/src/omop_etl/harmonization/models/domain/medical_history.py index 3350c09..c16a8b6 100644 --- a/src/omop_etl/harmonization/models/domain/medical_history.py +++ b/src/omop_etl/harmonization/models/domain/medical_history.py @@ -17,13 +17,16 @@ class Fields: def __init__(self, patient_id: str): self._patient_id = patient_id self._term: str | None = None - self._sequence_id: int | None = None self._start_date: dt.date | None = None self._end_date: dt.date | None = None + self._sequence_id: int | None = None self._status: str | None = None self._status_code: int | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.TERM,) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID, Fields.TERM) + @property def patient_id(self) -> str: return self._patient_id @@ -107,6 +110,7 @@ def __repr__(self, delim=",") -> str: f"seq={self.sequence_id!r}{delim} " f"start={self.start_date!r}{delim} " f"end={self.end_date!r}{delim} " + f"sequence_id={self.sequence_id!r}{delim} " f"status={self.status!r}{delim} " f"code={self.status_code!r})" ) diff --git a/src/omop_etl/harmonization/models/domain/previous_treatments.py b/src/omop_etl/harmonization/models/domain/previous_treatments.py index c9131f4..f6e657f 100644 --- a/src/omop_etl/harmonization/models/domain/previous_treatments.py +++ b/src/omop_etl/harmonization/models/domain/previous_treatments.py @@ -5,7 +5,7 @@ from omop_etl.harmonization.models.domain.base import DomainBase -class PreviousTreatments(DomainBase): +class PreviousTreatment(DomainBase): class Fields: TREATMENT = "treatment" TREATMENT_CODE = "treatment_code" @@ -13,6 +13,7 @@ class Fields: 
START_DATE = "start_date" END_DATE = "end_date" ADDITIONAL_TREATMENT = "additional_treatment" + SEQUENCE_ID = "sequence_id" def __init__(self, patient_id: str): self._patient_id = patient_id @@ -22,8 +23,12 @@ def __init__(self, patient_id: str): self._start_date: dt.date | None = None self._end_date: dt.date | None = None self._additional_treatment: str | None = None + self._sequence_id: int | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.TREATMENT,) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.TREATMENT, Fields.SEQUENCE_ID) + @property def patient_id(self) -> str: return self._patient_id @@ -88,6 +93,18 @@ def end_date(self, value: dt.date | None) -> None: validator=StrictValidators.validate_optional_date, ) + @property + def sequence_id(self) -> int | None: + return self._sequence_id + + @sequence_id.setter + def sequence_id(self, value: int | None) -> None: + self._set_validated_prop( + prop=self.__class__.sequence_id, + value=value, + validator=StrictValidators.validate_optional_int, + ) + @property def additional_treatment(self) -> str | None: return self._additional_treatment @@ -108,5 +125,6 @@ def __repr__(self, delim=",") -> str: f" treatment_sequence_number={self.treatment_sequence_number!r}{delim}" f" start_date={self.start_date!r}{delim}" f" end_date={self.end_date!r}{delim}" + f" sequence_id={self.sequence_id!r}{delim}" f" additional_treatment={self.additional_treatment!r})" ) diff --git a/src/omop_etl/harmonization/models/patient.py b/src/omop_etl/harmonization/models/patient.py index 1034bbf..6374bbf 100644 --- a/src/omop_etl/harmonization/models/patient.py +++ b/src/omop_etl/harmonization/models/patient.py @@ -14,7 +14,7 @@ from omop_etl.harmonization.models.domain.eq5d import EQ5D from omop_etl.harmonization.models.domain.followup import FollowUp from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import 
PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment import TumorAssessment @@ -99,7 +99,7 @@ def __init__(self, patient_id: str, trial_id: str): # collections self._medical_histories: tuple[MedicalHistory, ...] = () - self._previous_treatments: tuple[PreviousTreatments, ...] = () + self._previous_treatments: tuple[PreviousTreatment, ...] = () self._treatment_cycles: tuple[TreatmentCycleComponent, ...] = () self._concomitant_medications: tuple[ConcomitantMedication, ...] = () self._adverse_events: tuple[AdverseEvent, ...] = () @@ -412,13 +412,13 @@ def medical_histories(self, value: Sequence[MedicalHistory] | None) -> None: self.updated_fields.add(setter_name(self.__class__.medical_histories)) @property - def previous_treatments(self) -> tuple[PreviousTreatments, ...]: + def previous_treatments(self) -> tuple[PreviousTreatment, ...]: return self._previous_treatments @previous_treatments.setter - def previous_treatments(self, value: Sequence[PreviousTreatments] | None) -> None: + def previous_treatments(self, value: Sequence[PreviousTreatment] | None) -> None: self._previous_treatments = self.validate_collection( - value, item_type=PreviousTreatments, patient_id=self._patient_id, field_name=setter_name(self.__class__.previous_treatments) + value, item_type=PreviousTreatment, patient_id=self._patient_id, field_name=setter_name(self.__class__.previous_treatments) ) self.updated_fields.add(setter_name(self.__class__.previous_treatments)) diff --git a/src/omop_etl/omop/builders/base.py b/src/omop_etl/omop/builders/base.py index c38a1b0..1e67a4c 100644 --- a/src/omop_etl/omop/builders/base.py +++ b/src/omop_etl/omop/builders/base.py @@ -67,7 +67,7 @@ def build_and_populate(self, ctx: 
BuildContext) -> list[T]: self.populate_context(rows, ctx) return rows - def generate_row_id(self, *key_parts: str | None) -> int: + def generate_row_id(self, *key_parts: int | str | float | dt.date | None) -> int: """ Deterministic row ID from key parts, using SHA1 hashing with builder's namespace to create a reproducible 63-bit integer ID. diff --git a/src/omop_etl/omop/builders/condition_occurrence.py b/src/omop_etl/omop/builders/condition_occurrence.py index 0119399..7134a81 100644 --- a/src/omop_etl/omop/builders/condition_occurrence.py +++ b/src/omop_etl/omop/builders/condition_occurrence.py @@ -107,6 +107,11 @@ def _build_medical_history_rows( log.warning("Skipping medical history %d for %s: missing start_date", index, patient.patient_id) return [] + sequence_id = mh.sequence_id if mh.sequence_id else None + if not sequence_id: + log.warning("Skipping medical history for %s: missing sequence_id", patient.patient_id) + return [] + matches = self.concepts.lookup_semantic( patient.patient_id, (Patient.Collections.MEDICAL_HISTORIES, MedicalHistory.Fields.TERM), @@ -121,11 +126,12 @@ def _build_medical_history_rows( condition_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.MEDICAL_HISTORIES, - str(mh.sequence_id), - str(concept.concept_id), + start_date, + sequence_id, + concept.concept_id, ), person_id=person_id, - condition_concept_id=int(concept.concept_id), + condition_concept_id=concept.concept_id, condition_start_date=start_date, condition_end_date=mh.end_date, condition_type_concept_id=condition_type_concept_id, @@ -144,10 +150,15 @@ def _build_adverse_event_rows( ) -> list[ConditionOccurrenceRow]: start_date = ae.start_date term = ae.term + if start_date is None: log.warning("Skipping adverse event %d for %s: missing start_date", index, patient.patient_id) return [] + sequence_id = ae.sequence_id if ae.sequence_id else None + if not sequence_id: + log.warning("medical history for %s is missing sequence_id", patient.patient_id) 
+ matches = self.concepts.lookup_semantic( patient.patient_id, (Patient.Collections.ADVERSE_EVENTS, AdverseEvent.Fields.TERM), @@ -163,8 +174,9 @@ def _build_adverse_event_rows( patient.patient_id, Patient.Collections.ADVERSE_EVENTS, term, - start_date.strftime(format="%Y%m%d"), - str(concept.concept_id), + start_date, + sequence_id, + concept.concept_id, ), person_id=person_id, condition_concept_id=int(concept.concept_id), diff --git a/src/omop_etl/omop/builders/drug_exposure.py b/src/omop_etl/omop/builders/drug_exposure.py index 3786deb..8d43411 100644 --- a/src/omop_etl/omop/builders/drug_exposure.py +++ b/src/omop_etl/omop/builders/drug_exposure.py @@ -3,7 +3,7 @@ from omop_etl.harmonization.models.patient import Patient from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.omop.builders.base import OmopBuilder, BuildContext from omop_etl.omop.models.rows import DrugExposureRow @@ -149,7 +149,7 @@ def _build_previous_treatment_main_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, drug_type_concept_id: int, ) -> list[DrugExposureRow]: @@ -160,7 +160,7 @@ def _build_previous_treatment_main_rows( end_date = prev.end_date or start_date matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), index, domains={OmopDomain.DRUG}, ) @@ -173,7 +173,7 @@ def _build_previous_treatment_main_rows( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, str(prev.treatment_sequence_number), - 
PreviousTreatments.Fields.TREATMENT, + PreviousTreatment.Fields.TREATMENT, str(concept.concept_id), ), person_id=person_id, @@ -190,7 +190,7 @@ def _build_previous_treatment_additional_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, drug_type_concept_id: int, ) -> list[DrugExposureRow]: @@ -201,7 +201,7 @@ def _build_previous_treatment_additional_rows( end_date = prev.end_date or start_date matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), index, domains={OmopDomain.DRUG}, ) @@ -214,7 +214,7 @@ def _build_previous_treatment_additional_rows( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, str(prev.treatment_sequence_number), - PreviousTreatments.Fields.ADDITIONAL_TREATMENT, + PreviousTreatment.Fields.ADDITIONAL_TREATMENT, str(concept.concept_id), ), person_id=person_id, diff --git a/src/omop_etl/omop/builders/procedure_occurrence.py b/src/omop_etl/omop/builders/procedure_occurrence.py index 396d947..ff9274f 100644 --- a/src/omop_etl/omop/builders/procedure_occurrence.py +++ b/src/omop_etl/omop/builders/procedure_occurrence.py @@ -2,7 +2,7 @@ from logging import getLogger from omop_etl.harmonization.models.patient import Patient -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.medical_history import MedicalHistory from omop_etl.omop.builders.base import OmopBuilder, BuildContext from omop_etl.omop.models.rows import ProcedureOccurrenceRow @@ -46,7 +46,7 @@ def _build_previous_treatment_main_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, procedure_type_concept_id: 
int, ) -> list[ProcedureOccurrenceRow]: @@ -56,7 +56,7 @@ def _build_previous_treatment_main_rows( matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), index, domains={OmopDomain.PROCEDURE}, ) @@ -69,7 +69,7 @@ def _build_previous_treatment_main_rows( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, str(prev.treatment_sequence_number), - PreviousTreatments.Fields.TREATMENT, + PreviousTreatment.Fields.TREATMENT, str(concept.concept_id), ), person_id=person_id, @@ -86,7 +86,7 @@ def _build_previous_treatment_additional_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, procedure_type_concept_id: int, ) -> list[ProcedureOccurrenceRow]: @@ -96,7 +96,7 @@ def _build_previous_treatment_additional_rows( matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), index, domains={OmopDomain.PROCEDURE}, ) @@ -109,7 +109,7 @@ def _build_previous_treatment_additional_rows( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, str(prev.treatment_sequence_number), - PreviousTreatments.Fields.ADDITIONAL_TREATMENT, + PreviousTreatment.Fields.ADDITIONAL_TREATMENT, str(concept.concept_id), ), person_id=person_id, diff --git a/src/omop_etl/semantic_mapping/core/semantic_config.py b/src/omop_etl/semantic_mapping/core/semantic_config.py index d66ebcc..b35880a 100644 --- a/src/omop_etl/semantic_mapping/core/semantic_config.py +++ b/src/omop_etl/semantic_mapping/core/semantic_config.py @@ -4,7 +4,7 @@ from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from omop_etl.harmonization.models.domain.concomitant_medication import 
ConcomitantMedication from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_type import TumorType @@ -33,13 +33,13 @@ # previous treatments FieldConfig( name="previous_treatments.treatment", - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), target=QueryTarget(domains={OmopDomain.PROCEDURE, OmopDomain.DRUG}), tags={"previous_treatments", "term"}, ), FieldConfig( name="previous_treatments.additional_treatment", - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), target=QueryTarget(domains={OmopDomain.PROCEDURE, OmopDomain.DRUG}), tags={"previous_treatments", "additional_term"}, ), diff --git a/tests/harmonization/conftest.py b/tests/harmonization/conftest.py index b9a85ec..83e83fa 100644 --- a/tests/harmonization/conftest.py +++ b/tests/harmonization/conftest.py @@ -1459,6 +1459,7 @@ class AdverseEventRow: FU_FUPDEDAT: str | None = None TR_TRNAME: str | None = None TR_TRTNO: int | None = None + AE_AESPID: int | None = None @pytest.fixture @@ -1484,6 +1485,7 @@ def adverse_events_fixture() -> pl.DataFrame: AE_AETRT2="Drug B", TR_TRNAME="Regimen X", TR_TRTNO="1", # type: ignore + AE_AESPID=1, ), AdverseEventRow( "serious_fill_end_from_death", @@ -1499,6 +1501,7 @@ def adverse_events_fixture() -> pl.DataFrame: FU_FUPDEDAT="1900-02-01", TR_TRNAME="Regimen Y", TR_TRTNO=2, + 
AE_AESPID=1, ), AdverseEventRow( "multi", @@ -1509,6 +1512,7 @@ def adverse_events_fixture() -> pl.DataFrame: AE_AESERCD=0, AE_AEREL1CD=2, AE_AEREL2CD=4, + AE_AESPID=1, ), AdverseEventRow( "multi", @@ -1519,6 +1523,7 @@ def adverse_events_fixture() -> pl.DataFrame: AE_SAEEXP1CD=2, AE_SAEEXP2CD=1, AE_AEREL2CD=1, + AE_AESPID=2, ), ] diff --git a/tests/harmonization/harmonizers/test_impress.py b/tests/harmonization/harmonizers/test_impress.py index ce94787..c14fd85 100644 --- a/tests/harmonization/harmonizers/test_impress.py +++ b/tests/harmonization/harmonizers/test_impress.py @@ -750,9 +750,9 @@ def test_returns_expected_columns(self, previous_treatment_fixture): df = h._process_previous_treatments() assert df is not None - from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments + from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment - expected_cols = {"SubjectId"} | set(PreviousTreatments.data_fields()) + expected_cols = {"SubjectId"} | set(PreviousTreatment.data_fields()) assert set(df.columns) == expected_cols def test_extracts_treatment_values(self, previous_treatment_fixture): diff --git a/tests/harmonization/models/test_schema_validation.py b/tests/harmonization/models/test_schema_validation.py index d3b824e..5567c04 100644 --- a/tests/harmonization/models/test_schema_validation.py +++ b/tests/harmonization/models/test_schema_validation.py @@ -14,7 +14,7 @@ from omop_etl.harmonization.models.domain.eq5d import EQ5D from omop_etl.harmonization.models.domain.followup import FollowUp from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import 
TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment import TumorAssessment @@ -34,10 +34,13 @@ def _get_all_constant_values(*sections) -> set[str]: return values +_DOMAIN_BASE_PROPERTIES = {name for name, attr in vars(DomainBase).items() if isinstance(attr, property)} + + def _get_data_properties(cls) -> set[str]: props = set() for name in dir(cls): - if name.startswith("_"): + if name.startswith("_") or name in _DOMAIN_BASE_PROPERTIES: continue attr = getattr(cls, name, None) if isinstance(attr, property): @@ -73,7 +76,7 @@ def test_no_extra_constants(self): EQ5D, FollowUp, MedicalHistory, - PreviousTreatments, + PreviousTreatment, StudyDrugs, TreatmentCycleComponent, TumorAssessment, diff --git a/tests/omop/builders/test_drug_exposure_builder.py b/tests/omop/builders/test_drug_exposure_builder.py index 14f5f49..963925b 100644 --- a/tests/omop/builders/test_drug_exposure_builder.py +++ b/tests/omop/builders/test_drug_exposure_builder.py @@ -2,7 +2,7 @@ from omop_etl.concept_mapping.service import ConceptLookupService from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.patient import Patient from omop_etl.omop.builders.drug_exposure import DrugExposureBuilder @@ -337,7 +337,7 @@ def test_additional_treatment_maps_to_drug(self, static_index, structural_index) semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=1524674, name="zoledronic acid", @@ 
-346,7 +346,7 @@ def test_additional_treatment_maps_to_drug(self, static_index, structural_index) ) ) patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Chemotherapy" prev.additional_treatment = "Zometa" prev.start_date = dt.date(2022, 6, 1) @@ -366,7 +366,7 @@ def test_main_treatment_maps_to_drug(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=1304850, name="letrozole", @@ -375,7 +375,7 @@ def test_main_treatment_maps_to_drug(self, static_index, structural_index): ) ) patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Letrozole" prev.additional_treatment = "Additional" prev.start_date = dt.date(2022, 1, 1) @@ -391,7 +391,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=100, name="drug a", @@ -400,7 +400,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i ), SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=200, name="drug b", @@ -409,7 +409,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i ), ) patient = create_patient(PID, "test") - prev 
= PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Drug A" prev.additional_treatment = "Drug B" prev.start_date = dt.date(2022, 1, 1) @@ -425,7 +425,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i def test_no_drug_mapping_produces_no_row(self, static_index, structural_index): """Treatment fields without drug domain mappings produce no rows.""" patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "paracetamol" prev.start_date = dt.date(2022, 6, 1) patient.previous_treatments = [prev] @@ -436,7 +436,7 @@ def test_no_drug_mapping_produces_no_row(self, static_index, structural_index): def test_missing_start_date_skips(self, static_index, structural_index): patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Chemotherapy" prev.additional_treatment = "Zometa" patient.previous_treatments = [prev] @@ -556,7 +556,7 @@ def test_ids_unique_across_sources(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=1524674, name="zoledronic acid", @@ -571,7 +571,7 @@ def test_ids_unique_across_sources(self, static_index, structural_index): cycle.start_date = dt.date(2023, 1, 1) patient.treatment_cycles = [cycle] - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Chemotherapy" prev.additional_treatment = "Zometa" prev.start_date = dt.date(2022, 1, 1) diff --git a/tests/omop/builders/test_procedure_occurrence.py b/tests/omop/builders/test_procedure_occurrence.py index 0274613..9db8d2c 100644 --- 
a/tests/omop/builders/test_procedure_occurrence.py +++ b/tests/omop/builders/test_procedure_occurrence.py @@ -2,7 +2,7 @@ from omop_etl.concept_mapping.service import ConceptLookupService from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.patient import Patient from omop_etl.omop.builders.procedure_occurrence import ProcedureOccurrenceBuilder from omop_etl.omop.core.id_generator import sha1_bigint @@ -37,7 +37,7 @@ def test_all_fields(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -46,7 +46,7 @@ def test_all_fields(self, static_index, structural_index): ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) prev.end_date = dt.date(2021, 3, 1) @@ -66,7 +66,7 @@ def test_all_fields(self, static_index, structural_index): def test_no_procedure_match_skips(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] @@ -78,7 +78,7 @@ def test_no_procedure_match_skips(self, static_index, structural_index): def test_missing_start_date_skips(self, static_index, 
structural_index): concepts = ConceptLookupService(static_index, structural_index) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" patient.previous_treatments = [prev] @@ -90,7 +90,7 @@ def test_end_date_can_be_none(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -99,7 +99,7 @@ def test_end_date_can_be_none(self, static_index, structural_index): ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] @@ -115,7 +115,7 @@ def test_additional_treatment_produces_row(self, static_index, structural_index) semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=4061650, name="hormone therapy", @@ -124,7 +124,7 @@ def test_additional_treatment_produces_row(self, static_index, structural_index) ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Other" prev.additional_treatment = "Hormone therapy" prev.start_date = dt.date(2021, 5, 1) @@ -139,7 +139,7 @@ def test_additional_treatment_produces_row(self, static_index, structural_index) 
def test_no_match_skips(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Other" prev.additional_treatment = "Something unmapped" prev.start_date = dt.date(2021, 5, 1) @@ -154,7 +154,7 @@ def test_both_fields_produce_separate_rows(self, static_index, structural_index) semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -162,7 +162,7 @@ def test_both_fields_produce_separate_rows(self, static_index, structural_index) ), SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=4061650, name="hormone therapy", @@ -171,7 +171,7 @@ def test_both_fields_produce_separate_rows(self, static_index, structural_index) ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.additional_treatment = "Hormone therapy" prev.start_date = dt.date(2021, 3, 1) @@ -247,7 +247,7 @@ def test_all_sources_combined(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -265,7 
+265,7 @@ def test_all_sources_combined(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] @@ -286,7 +286,7 @@ def test_row_ids_are_deterministic(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -295,7 +295,7 @@ def test_row_ids_are_deterministic(self, static_index, structural_index): ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] diff --git a/tests/omop/test_service.py b/tests/omop/test_service.py index 86e5845..69ee0a7 100644 --- a/tests/omop/test_service.py +++ b/tests/omop/test_service.py @@ -4,7 +4,7 @@ from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment_baseline import TumorAssessmentBaseline from 
omop_etl.harmonization.models.domain.tumor_type import TumorType @@ -83,7 +83,7 @@ def test_all_builders_produce_output(self, static_index, structural_index): ), SemanticEntry( "p1", - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), 0, 4301351, "surgery", @@ -127,7 +127,7 @@ def test_all_builders_produce_output(self, static_index, structural_index): concom.sequence_id = 1 patient.concomitant_medications = [concom] - prev = PreviousTreatments(patient_id="p1") + prev = PreviousTreatment(patient_id="p1") prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] From 8b583784e38a767fb2d65f55425e58ce24c953fa Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Mon, 11 May 2026 14:05:13 +0200 Subject: [PATCH 05/23] feat: natural key in collection domain models --- .../models/domain/adverse_event.py | 2 +- src/omop_etl/harmonization/models/domain/c30.py | 2 ++ .../models/domain/concomitant_medication.py | 3 +++ .../harmonization/models/domain/eq5d.py | 2 ++ .../models/domain/medical_history.py | 2 +- .../models/domain/previous_treatments.py | 17 +---------------- .../models/domain/treatment_cycle_component.py | 1 + .../models/domain/tumor_assessment.py | 3 +++ 8 files changed, 14 insertions(+), 18 deletions(-) diff --git a/src/omop_etl/harmonization/models/domain/adverse_event.py b/src/omop_etl/harmonization/models/domain/adverse_event.py index 7f79b12..edaac5a 100644 --- a/src/omop_etl/harmonization/models/domain/adverse_event.py +++ b/src/omop_etl/harmonization/models/domain/adverse_event.py @@ -30,7 +30,7 @@ class Fields: WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_2 = "was_serious_grade_expected_treatment_2" INVARIANT_FIELDS = (Fields.TERM,) - NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID, Fields.TERM) + NATURAL_KEY_FIELDS = (Fields.TERM, Fields.START_DATE, 
Fields.SEQUENCE_ID) def __init__(self, patient_id: str): self._patient_id = patient_id diff --git a/src/omop_etl/harmonization/models/domain/c30.py b/src/omop_etl/harmonization/models/domain/c30.py index e7d1747..7c28bb3 100644 --- a/src/omop_etl/harmonization/models/domain/c30.py +++ b/src/omop_etl/harmonization/models/domain/c30.py @@ -80,6 +80,8 @@ def __init__(self, patient_id: str): self._event_name: str | None = None # question fields default to None + NATURAL_KEY_FIELDS = (Fields.EVENT_NAME, Fields.DATE) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/concomitant_medication.py b/src/omop_etl/harmonization/models/domain/concomitant_medication.py index 9f2077d..6d61d8d 100644 --- a/src/omop_etl/harmonization/models/domain/concomitant_medication.py +++ b/src/omop_etl/harmonization/models/domain/concomitant_medication.py @@ -28,6 +28,9 @@ def __init__(self, patient_id: str): self._sequence_id: int | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.MEDICATION_NAME,) + NATURAL_KEY_FIELDS = (Fields.MEDICATION_NAME, Fields.START_DATE, Fields.SEQUENCE_ID) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/eq5d.py b/src/omop_etl/harmonization/models/domain/eq5d.py index 65f69f1..3c674e3 100644 --- a/src/omop_etl/harmonization/models/domain/eq5d.py +++ b/src/omop_etl/harmonization/models/domain/eq5d.py @@ -31,6 +31,8 @@ def __init__(self, patient_id: str): self._event_name: str | None = None self._qol_metric: int | None = None + NATURAL_KEY_FIELDS = (Fields.EVENT_NAME, Fields.DATE) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/medical_history.py b/src/omop_etl/harmonization/models/domain/medical_history.py index c16a8b6..6b58148 100644 --- a/src/omop_etl/harmonization/models/domain/medical_history.py +++ 
b/src/omop_etl/harmonization/models/domain/medical_history.py @@ -25,7 +25,7 @@ def __init__(self, patient_id: str): self.updated_fields: Set[str] = set() INVARIANT_FIELDS = (Fields.TERM,) - NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID, Fields.TERM) + NATURAL_KEY_FIELDS = (Fields.TERM, Fields.START_DATE, Fields.SEQUENCE_ID) @property def patient_id(self) -> str: diff --git a/src/omop_etl/harmonization/models/domain/previous_treatments.py b/src/omop_etl/harmonization/models/domain/previous_treatments.py index f6e657f..c97b99e 100644 --- a/src/omop_etl/harmonization/models/domain/previous_treatments.py +++ b/src/omop_etl/harmonization/models/domain/previous_treatments.py @@ -13,7 +13,6 @@ class Fields: START_DATE = "start_date" END_DATE = "end_date" ADDITIONAL_TREATMENT = "additional_treatment" - SEQUENCE_ID = "sequence_id" def __init__(self, patient_id: str): self._patient_id = patient_id @@ -23,11 +22,10 @@ def __init__(self, patient_id: str): self._start_date: dt.date | None = None self._end_date: dt.date | None = None self._additional_treatment: str | None = None - self._sequence_id: int | None = None self.updated_fields: Set[str] = set() INVARIANT_FIELDS = (Fields.TREATMENT,) - NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.TREATMENT, Fields.SEQUENCE_ID) + NATURAL_KEY_FIELDS = (Fields.TREATMENT, Fields.START_DATE, Fields.TREATMENT_SEQUENCE_NUMBER) @property def patient_id(self) -> str: @@ -93,18 +91,6 @@ def end_date(self, value: dt.date | None) -> None: validator=StrictValidators.validate_optional_date, ) - @property - def sequence_id(self) -> int | None: - return self._sequence_id - - @sequence_id.setter - def sequence_id(self, value: int | None) -> None: - self._set_validated_prop( - prop=self.__class__.sequence_id, - value=value, - validator=StrictValidators.validate_optional_int, - ) - @property def additional_treatment(self) -> str | None: return self._additional_treatment @@ -125,6 +111,5 @@ def __repr__(self, delim=",") -> str: f" 
treatment_sequence_number={self.treatment_sequence_number!r}{delim}" f" start_date={self.start_date!r}{delim}" f" end_date={self.end_date!r}{delim}" - f" sequence_id={self.sequence_id!r}{delim}" f" additional_treatment={self.additional_treatment!r})" ) diff --git a/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py b/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py index f34e49b..fac2db2 100644 --- a/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py +++ b/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py @@ -29,6 +29,7 @@ class Fields: WAS_TABLET_TAKEN_TO_PRESCRIPTION_IN_PREVIOUS_CYCLE = "was_tablet_taken_to_prescription_in_previous_cycle" INVARIANT_FIELDS = (Fields.SOURCE_TREATMENT_NAME,) + NATURAL_KEY_FIELDS = (Fields.SOURCE_TREATMENT_NAME, Fields.START_DATE, Fields.TREATMENT_NUMBER, Fields.CYCLE_NUMBER, Fields.COMPONENT_INDEX) def __init__(self, patient_id: str): # core diff --git a/src/omop_etl/harmonization/models/domain/tumor_assessment.py b/src/omop_etl/harmonization/models/domain/tumor_assessment.py index 9fe046b..1c9b208 100644 --- a/src/omop_etl/harmonization/models/domain/tumor_assessment.py +++ b/src/omop_etl/harmonization/models/domain/tumor_assessment.py @@ -36,6 +36,9 @@ def __init__(self, patient_id: str): self._event_id: str | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.ASSESSMENT_TYPE,) + NATURAL_KEY_FIELDS = (Fields.ASSESSMENT_TYPE, Fields.DATE, Fields.EVENT_ID) + @property def patient_id(self) -> str: return self._patient_id From 548b5be4022c133e12f9d369d68b63cdf9a08a31 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Mon, 11 May 2026 14:35:19 +0200 Subject: [PATCH 06/23] feat: natural keys for singleton domains --- .../harmonization/harmonizers/impress.py | 8 +++++- .../models/domain/best_overall_response.py | 5 ++-- .../harmonization/models/domain/biomarkers.py | 2 ++ 
.../models/domain/ecog_baseline.py | 2 ++ .../harmonization/models/domain/followup.py | 2 ++ .../models/domain/study_drugs.py | 25 ++++++++++++++++--- .../domain/tumor_assessment_baseline.py | 2 ++ .../harmonization/models/domain/tumor_type.py | 2 ++ 8 files changed, 42 insertions(+), 6 deletions(-) diff --git a/src/omop_etl/harmonization/harmonizers/impress.py b/src/omop_etl/harmonization/harmonizers/impress.py index 39b781c..cebb81b 100644 --- a/src/omop_etl/harmonization/harmonizers/impress.py +++ b/src/omop_etl/harmonization/harmonizers/impress.py @@ -534,6 +534,7 @@ def _process_study_drugs(self) -> pl.DataFrame: s2cd=PolarsParsers.to_optional_int64(pl.col("COH_COHALLO2__2CD")), s3=PolarsParsers.to_optional_utf8(pl.col("COH_COHALLO2__3")).str.strip_chars(), s3cd=PolarsParsers.to_optional_int64(pl.col("COH_COHALLO2__3CD")), + date=PolarsParsers.to_optional_date(pl.col("COH_EventDate")), ) # require at least one present .filter( @@ -584,7 +585,12 @@ def _process_study_drugs(self) -> pl.DataFrame: .sort("_row") .unique(subset=["SubjectId"], keep="last") .select( - "SubjectId", cols.PRIMARY_TREATMENT_DRUG, cols.PRIMARY_TREATMENT_DRUG_CODE, cols.SECONDARY_TREATMENT_DRUG, cols.SECONDARY_TREATMENT_DRUG_CODE + "SubjectId", + cols.PRIMARY_TREATMENT_DRUG, + cols.PRIMARY_TREATMENT_DRUG_CODE, + cols.SECONDARY_TREATMENT_DRUG, + cols.SECONDARY_TREATMENT_DRUG_CODE, + cols.DATE, ) ) diff --git a/src/omop_etl/harmonization/models/domain/best_overall_response.py b/src/omop_etl/harmonization/models/domain/best_overall_response.py index 63ae62f..520fc9f 100644 --- a/src/omop_etl/harmonization/models/domain/best_overall_response.py +++ b/src/omop_etl/harmonization/models/domain/best_overall_response.py @@ -11,8 +11,6 @@ class Fields: CODE = "code" DATE = "date" - INVARIANT_FIELDS = (Fields.RESPONSE,) - def __init__(self, patient_id: str): self._patient_id = patient_id self._response: str | None = None @@ -20,6 +18,9 @@ def __init__(self, patient_id: str): self._date: dt.date | 
None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.RESPONSE,) + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/biomarkers.py b/src/omop_etl/harmonization/models/domain/biomarkers.py index ca8ec50..0e212cd 100644 --- a/src/omop_etl/harmonization/models/domain/biomarkers.py +++ b/src/omop_etl/harmonization/models/domain/biomarkers.py @@ -22,6 +22,8 @@ def __init__(self, patient_id: str): self._date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def gene_and_mutation(self) -> str | None: return self._gene_and_mutation diff --git a/src/omop_etl/harmonization/models/domain/ecog_baseline.py b/src/omop_etl/harmonization/models/domain/ecog_baseline.py index 759e35b..1ef089e 100644 --- a/src/omop_etl/harmonization/models/domain/ecog_baseline.py +++ b/src/omop_etl/harmonization/models/domain/ecog_baseline.py @@ -18,6 +18,8 @@ def __init__(self, patient_id: str): self._date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/followup.py b/src/omop_etl/harmonization/models/domain/followup.py index 7ba1aea..2e25d8c 100644 --- a/src/omop_etl/harmonization/models/domain/followup.py +++ b/src/omop_etl/harmonization/models/domain/followup.py @@ -16,6 +16,8 @@ def __init__(self, patient_id: str): self._date_lost_to_followup: dt.datetime | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE_LOST_TO_FOLLOWUP,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/study_drugs.py b/src/omop_etl/harmonization/models/domain/study_drugs.py index 3d1159b..1b31434 100644 --- a/src/omop_etl/harmonization/models/domain/study_drugs.py +++ 
b/src/omop_etl/harmonization/models/domain/study_drugs.py @@ -1,4 +1,5 @@ from typing import Set +import datetime as dt from omop_etl.harmonization.core.validators import StrictValidators from omop_etl.harmonization.models.domain.base import DomainBase @@ -10,6 +11,7 @@ class Fields: PRIMARY_TREATMENT_DRUG_CODE = "primary_treatment_drug_code" SECONDARY_TREATMENT_DRUG = "secondary_treatment_drug" SECONDARY_TREATMENT_DRUG_CODE = "secondary_treatment_drug_code" + DATE = "date" def __init__(self, patient_id: str): self._patient_id = patient_id @@ -17,8 +19,11 @@ def __init__(self, patient_id: str): self._primary_treatment_drug_code: int | None = None self._secondary_treatment_drug: str | None = None self._secondary_treatment_drug_code: int | None = None + self._date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def primary_treatment_drug(self) -> str | None: return self._primary_treatment_drug @@ -67,11 +72,25 @@ def secondary_treatment_drug_code(self, value: int | None) -> None: validator=StrictValidators.validate_optional_int, ) + @property + def date(self) -> dt.date | None: + return self._date + + @date.setter + def date(self, value: dt.date | None) -> None: + self._set_validated_prop( + prop=self.__class__.date, + value=value, + validator=StrictValidators.validate_optional_date, + ) + def __repr__(self, delim=","): return ( f"{self.__class__.__name__}(" f"primary_treatment_drug={self.primary_treatment_drug!r}{delim} " - f" primary_treatment_drug_code={self.primary_treatment_drug_code!r}{delim} " - f" secondary_treatment_drug={self.secondary_treatment_drug!r}{delim} " - f" secondary_treatment_drug_code={self.secondary_treatment_drug_code!r})" + f"primary_treatment_drug_code={self.primary_treatment_drug_code!r}{delim} " + f"secondary_treatment_drug={self.secondary_treatment_drug!r}{delim} " + f"secondary_treatment_drug_code={self.secondary_treatment_drug_code!r}{delim}" + f"date={self.date!r}" + f")" ) 
diff --git a/src/omop_etl/harmonization/models/domain/tumor_assessment_baseline.py b/src/omop_etl/harmonization/models/domain/tumor_assessment_baseline.py index 0070bdd..2c56034 100644 --- a/src/omop_etl/harmonization/models/domain/tumor_assessment_baseline.py +++ b/src/omop_etl/harmonization/models/domain/tumor_assessment_baseline.py @@ -26,6 +26,8 @@ def __init__(self, patient_id: str): self._off_target_lesion_measurement_date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.ASSESSMENT_DATE,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/tumor_type.py b/src/omop_etl/harmonization/models/domain/tumor_type.py index 6606237..d74378e 100644 --- a/src/omop_etl/harmonization/models/domain/tumor_type.py +++ b/src/omop_etl/harmonization/models/domain/tumor_type.py @@ -27,6 +27,8 @@ def __init__(self, patient_id: str): self._date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def icd10_code(self) -> str | None: return self._icd10_code From 98eda726e0d11c588929610ee525170913334c7f Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Mon, 11 May 2026 14:44:37 +0200 Subject: [PATCH 07/23] feat: natural keys done --- src/omop_etl/harmonization/models/domain/adverse_event.py | 2 +- .../harmonization/models/domain/concomitant_medication.py | 2 +- src/omop_etl/harmonization/models/domain/medical_history.py | 2 +- src/omop_etl/harmonization/models/domain/previous_treatments.py | 2 +- .../harmonization/models/domain/treatment_cycle_component.py | 2 +- src/omop_etl/harmonization/models/domain/tumor_assessment.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/omop_etl/harmonization/models/domain/adverse_event.py b/src/omop_etl/harmonization/models/domain/adverse_event.py index edaac5a..8ee9e05 100644 --- 
a/src/omop_etl/harmonization/models/domain/adverse_event.py +++ b/src/omop_etl/harmonization/models/domain/adverse_event.py @@ -30,7 +30,7 @@ class Fields: WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_2 = "was_serious_grade_expected_treatment_2" INVARIANT_FIELDS = (Fields.TERM,) - NATURAL_KEY_FIELDS = (Fields.TERM, Fields.START_DATE, Fields.SEQUENCE_ID) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID) def __init__(self, patient_id: str): self._patient_id = patient_id diff --git a/src/omop_etl/harmonization/models/domain/concomitant_medication.py b/src/omop_etl/harmonization/models/domain/concomitant_medication.py index 6d61d8d..f515e0f 100644 --- a/src/omop_etl/harmonization/models/domain/concomitant_medication.py +++ b/src/omop_etl/harmonization/models/domain/concomitant_medication.py @@ -29,7 +29,7 @@ def __init__(self, patient_id: str): self.updated_fields: Set[str] = set() INVARIANT_FIELDS = (Fields.MEDICATION_NAME,) - NATURAL_KEY_FIELDS = (Fields.MEDICATION_NAME, Fields.START_DATE, Fields.SEQUENCE_ID) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID) @property def patient_id(self) -> str: diff --git a/src/omop_etl/harmonization/models/domain/medical_history.py b/src/omop_etl/harmonization/models/domain/medical_history.py index 6b58148..ce98550 100644 --- a/src/omop_etl/harmonization/models/domain/medical_history.py +++ b/src/omop_etl/harmonization/models/domain/medical_history.py @@ -25,7 +25,7 @@ def __init__(self, patient_id: str): self.updated_fields: Set[str] = set() INVARIANT_FIELDS = (Fields.TERM,) - NATURAL_KEY_FIELDS = (Fields.TERM, Fields.START_DATE, Fields.SEQUENCE_ID) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID) @property def patient_id(self) -> str: diff --git a/src/omop_etl/harmonization/models/domain/previous_treatments.py b/src/omop_etl/harmonization/models/domain/previous_treatments.py index c97b99e..fb6958c 100644 --- a/src/omop_etl/harmonization/models/domain/previous_treatments.py +++ 
b/src/omop_etl/harmonization/models/domain/previous_treatments.py @@ -25,7 +25,7 @@ def __init__(self, patient_id: str): self.updated_fields: Set[str] = set() INVARIANT_FIELDS = (Fields.TREATMENT,) - NATURAL_KEY_FIELDS = (Fields.TREATMENT, Fields.START_DATE, Fields.TREATMENT_SEQUENCE_NUMBER) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.TREATMENT_SEQUENCE_NUMBER) @property def patient_id(self) -> str: diff --git a/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py b/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py index fac2db2..5556465 100644 --- a/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py +++ b/src/omop_etl/harmonization/models/domain/treatment_cycle_component.py @@ -29,7 +29,7 @@ class Fields: WAS_TABLET_TAKEN_TO_PRESCRIPTION_IN_PREVIOUS_CYCLE = "was_tablet_taken_to_prescription_in_previous_cycle" INVARIANT_FIELDS = (Fields.SOURCE_TREATMENT_NAME,) - NATURAL_KEY_FIELDS = (Fields.SOURCE_TREATMENT_NAME, Fields.START_DATE, Fields.TREATMENT_NUMBER, Fields.CYCLE_NUMBER, Fields.COMPONENT_INDEX) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.TREATMENT_NUMBER, Fields.CYCLE_NUMBER, Fields.COMPONENT_INDEX) def __init__(self, patient_id: str): # core diff --git a/src/omop_etl/harmonization/models/domain/tumor_assessment.py b/src/omop_etl/harmonization/models/domain/tumor_assessment.py index 1c9b208..f255f69 100644 --- a/src/omop_etl/harmonization/models/domain/tumor_assessment.py +++ b/src/omop_etl/harmonization/models/domain/tumor_assessment.py @@ -37,7 +37,7 @@ def __init__(self, patient_id: str): self.updated_fields: Set[str] = set() INVARIANT_FIELDS = (Fields.ASSESSMENT_TYPE,) - NATURAL_KEY_FIELDS = (Fields.ASSESSMENT_TYPE, Fields.DATE, Fields.EVENT_ID) + NATURAL_KEY_FIELDS = (Fields.DATE, Fields.EVENT_ID) @property def patient_id(self) -> str: From bfa311c49afda46dc376827e65083eeca7d851e0 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Mon, 11 May 
2026 15:18:25 +0200 Subject: [PATCH 08/23] feat: deterministic sorting of collection domains by natural key before hydration, processors no longer have to care --- src/omop_etl/harmonization/models/domain/base.py | 4 ++++ src/omop_etl/harmonization/models/patient.py | 5 ++++- tests/harmonization/conftest.py | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/omop_etl/harmonization/models/domain/base.py b/src/omop_etl/harmonization/models/domain/base.py index 120950d..ce35317 100644 --- a/src/omop_etl/harmonization/models/domain/base.py +++ b/src/omop_etl/harmonization/models/domain/base.py @@ -41,6 +41,10 @@ def natural_key(self) -> tuple: def invariant_fields(self) -> tuple: return tuple(getattr(self, f) for f in self.INVARIANT_FIELDS) + def sort_key(self) -> tuple: + """None-safe sort key derived from natural_key, None values sort last.""" + return tuple((v is None, v) for v in self.natural_key()) + @classmethod def _derive_data_fields(cls) -> tuple[str, ...]: """Derive data fields from Fields inner class string constants.""" diff --git a/src/omop_etl/harmonization/models/patient.py b/src/omop_etl/harmonization/models/patient.py index 6374bbf..dc46890 100644 --- a/src/omop_etl/harmonization/models/patient.py +++ b/src/omop_etl/harmonization/models/patient.py @@ -5,6 +5,7 @@ from omop_etl.harmonization.core.validators import StrictValidators from omop_etl.harmonization.core.track_validated import TrackedValidated, setter_name +from omop_etl.harmonization.models.domain.base import DomainBase from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent from omop_etl.harmonization.models.domain.best_overall_response import BestOverallResponse from omop_etl.harmonization.models.domain.biomarkers import Biomarkers @@ -22,7 +23,7 @@ from omop_etl.harmonization.models.domain.tumor_type import TumorType log = getLogger(__name__) -T = TypeVar("T") +T = TypeVar("T", bound=DomainBase) class Patient(TrackedValidated): @@ -565,6 +566,8 
@@ def validate_collection( if existing != patient_id: raise ValueError(f"{field_name}: mismatched patient_id {existing!r} != {patient_id!r}") + # sort by domain natural_key so collections are deterministically ordered on assignment + items.sort(key=lambda x: x.sort_key()) return tuple(items) @classmethod diff --git a/tests/harmonization/conftest.py b/tests/harmonization/conftest.py index 83e83fa..946da2d 100644 --- a/tests/harmonization/conftest.py +++ b/tests/harmonization/conftest.py @@ -212,6 +212,7 @@ class StudyDrugsRow: COH_COHALLO2__2CD: str | None = None COH_COHALLO2__3: str | None = None COH_COHALLO2__3CD: str | None = None + COH_EventDate: str | None = None @pytest.fixture @@ -223,6 +224,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1__2CD="31", COH_COHALLO2__2="Tafinlar", COH_COHALLO2__2CD="10", + COH_EventDate="2021-06-01", ), StudyDrugsRow( "sd1_match_sd2_match", @@ -230,6 +232,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1CD="99", COH_COHALLO2="some drug 2", COH_COHALLO2CD="1", + COH_EventDate="2022-06-01", ), StudyDrugsRow( "sd1_mismatch1_sd2_mismatch1_2", @@ -237,6 +240,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1CD="10", COH_COHALLO2__2="mismatch_1_2", COH_COHALLO2__2CD="12", + COH_EventDate="2023-06-01", ), StudyDrugsRow( "sd1_mismatch2_sd2_mismatch2_1", @@ -244,6 +248,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1__2CD="50", COH_COHALLO2="mismatch_2_1", COH_COHALLO2CD="60", + COH_EventDate="2024-06-01", ), StudyDrugsRow( "sd_collision", @@ -253,6 +258,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO2__2CD="5", COH_COHALLO2__3="some_drug_3_2", COH_COHALLO2__3CD="999", + COH_EventDate="2025-06-01", ), ] From 364de062eeda505cf703a019211918f39382d245 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Mon, 11 May 2026 15:18:53 +0200 Subject: [PATCH 09/23] feat: collection domain sorting test --- .../models/test_collection_sorting.py | 101 
++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 tests/harmonization/models/test_collection_sorting.py diff --git a/tests/harmonization/models/test_collection_sorting.py b/tests/harmonization/models/test_collection_sorting.py new file mode 100644 index 0000000..64f0dfc --- /dev/null +++ b/tests/harmonization/models/test_collection_sorting.py @@ -0,0 +1,101 @@ +import datetime as dt + +from omop_etl.harmonization.models.patient import Patient +from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent +from omop_etl.harmonization.models.domain.medical_history import MedicalHistory + + +PATIENT_ID = "P001" + + +def _ae(start_date: dt.date | None, sequence_id: int | None, term: str = "nausea") -> AdverseEvent: + e = AdverseEvent(PATIENT_ID) + e.term = term + e.start_date = start_date + e.sequence_id = sequence_id + return e + + +class TestCollectionSorting: + def test_sorted_by_natural_key_on_assignment(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + unsorted = [ + _ae(dt.date(2024, 3, 1), 2), + _ae(dt.date(2024, 1, 1), 1), + _ae(dt.date(2024, 2, 1), 5), + ] + p.adverse_events = unsorted + + ordered = [(e.start_date, e.sequence_id) for e in p.adverse_events] + assert ordered == [ + (dt.date(2024, 1, 1), 1), + (dt.date(2024, 2, 1), 5), + (dt.date(2024, 3, 1), 2), + ] + + def test_tiebreak_by_secondary_natural_key_field(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + same_date = dt.date(2024, 1, 1) + p.adverse_events = [ + _ae(same_date, 3), + _ae(same_date, 1), + _ae(same_date, 2), + ] + assert [e.sequence_id for e in p.adverse_events] == [1, 2, 3] + + def test_none_values_sort_last(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + p.adverse_events = [ + _ae(None, 1), + _ae(dt.date(2024, 2, 1), 2), + _ae(dt.date(2024, 1, 1), 3), + ] + ordered = [(e.start_date, e.sequence_id) for e in p.adverse_events] + assert ordered == [ + (dt.date(2024, 1, 1), 3), + (dt.date(2024, 2, 1), 2), + (None, 1), + 
] + + def test_none_in_secondary_field_sorts_last_within_group(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + same_date = dt.date(2024, 1, 1) + p.adverse_events = [ + _ae(same_date, None), + _ae(same_date, 2), + _ae(same_date, 1), + ] + assert [e.sequence_id for e in p.adverse_events] == [1, 2, None] + + def test_all_none_natural_key_is_stable(self): + """All-None keys produce equal sort keys, the order is stable.""" + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + a = _ae(None, None, term="A") + b = _ae(None, None, term="B") + c = _ae(None, None, term="C") + p.adverse_events = [a, b, c] + assert [e.term for e in p.adverse_events] == ["A", "B", "C"] + + def test_empty_collection(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + p.adverse_events = [] + assert p.adverse_events == () + + def test_works_across_domain_types(self): + """Same mechanism works on a different collection with the same NK shape.""" + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + mh1 = MedicalHistory(PATIENT_ID) + mh1.term = "hypertension" + mh1.start_date = dt.date(2024, 3, 1) + mh1.sequence_id = 1 + + mh2 = MedicalHistory(PATIENT_ID) + mh2.term = "diabetes" + mh2.start_date = dt.date(2024, 1, 1) + mh2.sequence_id = 2 + + p.medical_histories = [mh1, mh2] + assert [m.start_date for m in p.medical_histories] == [ + dt.date(2024, 1, 1), + dt.date(2024, 3, 1), + ] From 96ebb6b45855cbf84a6bb44b872a70f96828e334 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Mon, 11 May 2026 16:06:46 +0200 Subject: [PATCH 10/23] feat: updated builder callsites to use natural keys in id generation --- .../omop/builders/condition_occurrence.py | 16 ++--- src/omop_etl/omop/builders/drug_exposure.py | 34 ++++------ src/omop_etl/omop/builders/measurement.py | 64 +++++++++---------- .../omop/builders/procedure_occurrence.py | 18 +++--- .../omop/builders/visit_occurrence.py | 4 +- tests/omop/builders/test_measurement.py | 2 +- 6 files 
changed, 65 insertions(+), 73 deletions(-) diff --git a/src/omop_etl/omop/builders/condition_occurrence.py b/src/omop_etl/omop/builders/condition_occurrence.py index 7134a81..3257baa 100644 --- a/src/omop_etl/omop/builders/condition_occurrence.py +++ b/src/omop_etl/omop/builders/condition_occurrence.py @@ -58,6 +58,7 @@ def _build_tumor_type_rows( domains={OmopDomain.CONDITION}, ) source_value = tumor.icd10_code + elif tumor.main_tumor_type: matches = self.concepts.lookup_semantic( patient.patient_id, @@ -66,6 +67,7 @@ def _build_tumor_type_rows( domains={OmopDomain.CONDITION}, ) source_value = tumor.main_tumor_type + else: log.warning("Skipping tumor type for %s: no icd10_code or main_tumor_type", patient.patient_id) return [] @@ -83,10 +85,11 @@ def _build_tumor_type_rows( condition_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Singletons.TUMOR_TYPE, - str(concept.concept_id), + *tumor.natural_key(), + concept.concept_id, ), person_id=person_id, - condition_concept_id=int(concept.concept_id), + condition_concept_id=concept.concept_id, condition_start_date=date, condition_type_concept_id=condition_type_concept_id, condition_source_value=source_value, @@ -126,8 +129,7 @@ def _build_medical_history_rows( condition_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.MEDICAL_HISTORIES, - start_date, - sequence_id, + *mh.natural_key(), concept.concept_id, ), person_id=person_id, @@ -173,13 +175,11 @@ def _build_adverse_event_rows( condition_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.ADVERSE_EVENTS, - term, - start_date, - sequence_id, + *ae.natural_key(), concept.concept_id, ), person_id=person_id, - condition_concept_id=int(concept.concept_id), + condition_concept_id=concept.concept_id, condition_start_date=start_date, condition_end_date=ae.end_date, condition_type_concept_id=condition_type_concept_id, diff --git a/src/omop_etl/omop/builders/drug_exposure.py 
b/src/omop_etl/omop/builders/drug_exposure.py index 8d43411..0339daf 100644 --- a/src/omop_etl/omop/builders/drug_exposure.py +++ b/src/omop_etl/omop/builders/drug_exposure.py @@ -93,20 +93,14 @@ def _build_treatment_cycle_rows( quantity = cycle.iv_dose_prescribed dose_unit = cycle.iv_dose_prescribed_unit iv_route = self.concepts.lookup_structural("iv", domains={"Route"}) - route_concept_id = int(iv_route.concept_id) if iv_route else None + route_concept_id = iv_route.concept_id if iv_route else None elif cycle.cycle_type and cycle.cycle_type == "oral": quantity = cycle.oral_dose_prescribed_per_day dose_unit = cycle.oral_dose_unit oral_route = self.concepts.lookup_structural("oral", domains={"Route"}) - route_concept_id = int(oral_route.concept_id) if oral_route else None + route_concept_id = oral_route.concept_id if oral_route else None - base_row_id_parts = ( - patient.patient_id, - Patient.Collections.TREATMENT_CYCLES, - str(cycle.cycle_number), - str(cycle.treatment_number), - str(cycle.component_index), - ) + base_row_id_parts = (patient.patient_id, Patient.Collections.TREATMENT_CYCLES, *cycle.natural_key()) end_date_or_start = end_date or start_date drug_source_value = cycle.source_treatment_name or cycle.ingredient_name @@ -130,9 +124,9 @@ def _build_treatment_cycle_rows( return [ DrugExposureRow( - drug_exposure_id=self.generate_row_id(*base_row_id_parts, str(concept.concept_id)), + drug_exposure_id=self.generate_row_id(*base_row_id_parts, concept.concept_id), person_id=person_id, - drug_concept_id=int(concept.concept_id), + drug_concept_id=concept.concept_id, drug_exposure_start_date=start_date, drug_exposure_end_date=end_date_or_start, drug_type_concept_id=drug_type_concept_id, @@ -172,12 +166,12 @@ def _build_previous_treatment_main_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - str(prev.treatment_sequence_number), + *prev.natural_key(), PreviousTreatment.Fields.TREATMENT, - 
str(concept.concept_id), + concept.concept_id, ), person_id=person_id, - drug_concept_id=int(concept.concept_id), + drug_concept_id=concept.concept_id, drug_exposure_start_date=start_date, drug_exposure_end_date=end_date, drug_type_concept_id=drug_type_concept_id, @@ -213,9 +207,9 @@ def _build_previous_treatment_additional_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - str(prev.treatment_sequence_number), + *prev.natural_key(), PreviousTreatment.Fields.ADDITIONAL_TREATMENT, - str(concept.concept_id), + concept.concept_id, ), person_id=person_id, drug_concept_id=int(concept.concept_id), @@ -260,7 +254,7 @@ def _build_concomitant_medication_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.CONCOMITANT_MEDICATIONS, - str(concom.sequence_id), + *concom.natural_key(), ), person_id=person_id, drug_concept_id=0, @@ -276,11 +270,11 @@ def _build_concomitant_medication_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.CONCOMITANT_MEDICATIONS, - str(concom.sequence_id), - str(concept.concept_id), + *concom.natural_key(), + concept.concept_id, ), person_id=person_id, - drug_concept_id=int(concept.concept_id), + drug_concept_id=concept.concept_id, drug_exposure_start_date=start_date, drug_exposure_end_date=end_date_or_start, drug_type_concept_id=drug_type_concept_id, diff --git a/src/omop_etl/omop/builders/measurement.py b/src/omop_etl/omop/builders/measurement.py index 7559796..e8a07cb 100644 --- a/src/omop_etl/omop/builders/measurement.py +++ b/src/omop_etl/omop/builders/measurement.py @@ -187,13 +187,13 @@ def _build_ecog_rows( row_id = self.generate_row_id( patient.patient_id, Patient.Singletons.ECOG_BASELINE, - date.strftime(format="%Y%m%d"), + *ecog_baseline.natural_key(), ) return [ MeasurementRow( measurement_id=row_id, person_id=person_id, - measurement_concept_id=int(ecog_test.concept_id), + measurement_concept_id=ecog_test.concept_id, 
measurement_date=date, measurement_type_concept_id=ecrf_concept, measurement_datetime=dt.datetime(date.year, date.month, date.day), @@ -255,10 +255,11 @@ def _build_biomarker_rows( patient.patient_id, Patient.Singletons.BIOMARKERS, field_name, - str(concept.concept_id), + *biomarkers.natural_key(), + concept.concept_id, ), person_id=person_id, - measurement_concept_id=int(concept.concept_id), + measurement_concept_id=concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -304,12 +305,13 @@ def _build_tumor_assessment_baseline_rows( patient.patient_id, Patient.Singletons.TUMOR_ASSESSMENT_BASELINE, TumorAssessmentBaseline.Fields.TARGET_LESION_SIZE, + *baseline.natural_key(), ) return [ MeasurementRow( measurement_id=row_id, person_id=person_id, - measurement_concept_id=int(lesion.concept_id), + measurement_concept_id=lesion.concept_id, measurement_date=date, measurement_datetime=dt.datetime(date.year, date.month, date.day), measurement_type_concept_id=ecrf_concept, @@ -360,16 +362,15 @@ def _build_tumor_assessment_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - date.strftime(format="%Y%m%d"), TumorAssessment.Fields.TARGET_LESION_SIZE, + *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(lesion.concept_id), + measurement_concept_id=lesion.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, - value_as_number=float(size), + value_as_number=size, visit_occurrence_id=visit_occurrence_id, measurement_source_value=str(size)[:50], ) @@ -387,9 +388,8 @@ def _build_tumor_assessment_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - date.strftime(format="%Y%m%d"), TumorAssessment.Fields.RECIST_RESPONSE, + 
*tumor_assessments.natural_key(), ), person_id=person_id, measurement_concept_id=int(concept.concept_id), @@ -412,9 +412,8 @@ def _build_tumor_assessment_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - date.strftime(format="%Y%m%d"), TumorAssessment.Fields.IRECIST_RESPONSE, + *tumor_assessments.natural_key(), ), person_id=person_id, measurement_concept_id=int(concept.concept_id), @@ -437,9 +436,8 @@ def _build_tumor_assessment_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - date.strftime(format="%Y%m%d"), TumorAssessment.Fields.RANO_RESPONSE, + *tumor_assessments.natural_key(), ), person_id=person_id, measurement_concept_id=int(concept.concept_id), @@ -508,12 +506,12 @@ def _build_c30_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.C30_COLLECTION, - str(c30.event_name), - date.strftime(format="%Y%m%d"), + test_concept.concept_id, + *c30.natural_key(), f"q{n}", ), person_id=person_id, - measurement_concept_id=int(test_concept.concept_id), + measurement_concept_id=test_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -573,12 +571,11 @@ def _build_eq5d_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.EQ5D_COLLECTION, - str(eq5d.event_name), - date.strftime(format="%Y%m%d"), + *eq5d.natural_key(), f"q{n}", ), person_id=person_id, - measurement_concept_id=int(answer_concept.concept_id), + measurement_concept_id=answer_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -598,12 +595,12 @@ def _build_eq5d_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.EQ5D_COLLECTION, - str(eq5d.event_name), - date.strftime(format="%Y%m%d"), + 
*eq5d.natural_key(), + vas_concept.concept_id, EQ5D.Fields.QOL_METRIC, ), person_id=person_id, - measurement_concept_id=int(vas_concept.concept_id), + measurement_concept_id=vas_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -680,15 +677,16 @@ def _build_medical_history_rows( return [ MeasurementRow( + # same concept id produces multiple rows, so need concept_id and q_id in UID measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.MEDICAL_HISTORIES, - str(mh.sequence_id), - str(m_concept.concept_id), - str(q_id), + *mh.natural_key(), + q_id, + m_concept.concept_id, ), person_id=person_id, - measurement_concept_id=int(m_concept.concept_id), + measurement_concept_id=m_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -768,16 +766,16 @@ def _build_adverse_event_rows( return [ MeasurementRow( + # same concept id produces multiple rows, so need concept_id and q_id in UID measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.ADVERSE_EVENTS, - term, - date.strftime(format="%Y%m%d"), - str(m_concept.concept_id), - str(q_id), + *ae.natural_key(), + q_id, + m_concept.concept_id, ), person_id=person_id, - measurement_concept_id=int(m_concept.concept_id), + measurement_concept_id=m_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, diff --git a/src/omop_etl/omop/builders/procedure_occurrence.py b/src/omop_etl/omop/builders/procedure_occurrence.py index ff9274f..cccd855 100644 --- a/src/omop_etl/omop/builders/procedure_occurrence.py +++ b/src/omop_etl/omop/builders/procedure_occurrence.py @@ -68,12 +68,12 @@ def _build_previous_treatment_main_rows( procedure_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - str(prev.treatment_sequence_number), 
PreviousTreatment.Fields.TREATMENT, - str(concept.concept_id), + *prev.natural_key(), + concept.concept_id, ), person_id=person_id, - procedure_concept_id=int(concept.concept_id), + procedure_concept_id=concept.concept_id, procedure_date=start_date, procedure_end_date=prev.end_date, procedure_type_concept_id=procedure_type_concept_id, @@ -108,12 +108,12 @@ def _build_previous_treatment_additional_rows( procedure_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - str(prev.treatment_sequence_number), + *prev.natural_key(), PreviousTreatment.Fields.ADDITIONAL_TREATMENT, - str(concept.concept_id), + concept.concept_id, ), person_id=person_id, - procedure_concept_id=int(concept.concept_id), + procedure_concept_id=concept.concept_id, procedure_date=start_date, procedure_end_date=prev.end_date, procedure_type_concept_id=procedure_type_concept_id, @@ -148,11 +148,11 @@ def _build_medical_history_rows( procedure_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.MEDICAL_HISTORIES, - str(mh.sequence_id), - str(concept.concept_id), + *mh.natural_key(), + concept.concept_id, ), person_id=person_id, - procedure_concept_id=int(concept.concept_id), + procedure_concept_id=concept.concept_id, procedure_date=start_date, procedure_end_date=mh.end_date, procedure_type_concept_id=procedure_type_concept_id, diff --git a/src/omop_etl/omop/builders/visit_occurrence.py b/src/omop_etl/omop/builders/visit_occurrence.py index b89628b..9c8742e 100644 --- a/src/omop_etl/omop/builders/visit_occurrence.py +++ b/src/omop_etl/omop/builders/visit_occurrence.py @@ -93,7 +93,7 @@ def _build_baseline_row( row_id = self.generate_row_id( patient.patient_id, Patient.Singletons.TUMOR_ASSESSMENT_BASELINE, - date.strftime(format="%Y%m%d"), + *baseline.natural_key(), ) return VisitOccurrenceRow( @@ -120,7 +120,7 @@ def _build_assessment_row( row_id = self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - 
date.strftime(format="%Y%m%d"), + *assessment.natural_key(), ) return VisitOccurrenceRow( diff --git a/tests/omop/builders/test_measurement.py b/tests/omop/builders/test_measurement.py index f77e20c..2462f67 100644 --- a/tests/omop/builders/test_measurement.py +++ b/tests/omop/builders/test_measurement.py @@ -589,7 +589,7 @@ def test_dimension_uses_precoordinated_concept(self, static_index, structural_in assert row_1.person_id == PERSON_ID assert row_1.measurement_date == dt.date(2040, 5, 1) assert row_1.measurement_datetime == dt.datetime(2040, 5, 1) - assert row_1.measurement_id == 3952701007853139582 + assert row_1.measurement_id == 5607913108096982206 # q2 level 5 row_2 = rows[1] From f334d809e8e6e01c35aa225d00bc55b84e84798e Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Mon, 11 May 2026 16:26:42 +0200 Subject: [PATCH 11/23] feat: checking natural key colissions in hydration --- .../harmonization/harmonizers/base.py | 47 ++++++++++++ tests/harmonization/harmonizers/test_base.py | 74 +++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/src/omop_etl/harmonization/harmonizers/base.py b/src/omop_etl/harmonization/harmonizers/base.py index b524066..07ac2d8 100644 --- a/src/omop_etl/harmonization/harmonizers/base.py +++ b/src/omop_etl/harmonization/harmonizers/base.py @@ -66,6 +66,7 @@ class CollectionSpec(SpecBase): order_by: tuple[str, ...] = () require_order_by: bool = False items_col: str = "items" + on_natural_key_conflict: Literal["error", "warn"] = "warn" # union type for all specs @@ -84,6 +85,37 @@ def _derived_name(fn: Callable[..., Any]) -> str: return name.removeprefix("_process_") +def _check_natural_key_conflicts( + objs: list[DomainBase], + *, + patient_id: str, + item_type: type[DomainBase], + policy: Literal["error", "warn"], +) -> None: + """ + Detect natural-key collisions where the rows have differing data. 
+ + Identical duplicates (same NK, same data) are assumed to be deduplicated + upstream by the collection processor, so this only flags conflicts. + Keeps the first occurrence. + """ + seen: dict[tuple, DomainBase] = {} + fields = item_type.data_fields() + for obj in objs: + nk = obj.natural_key() + prior = seen.get(nk) + if prior is None: + seen[nk] = obj + continue + if all(getattr(prior, f) == getattr(obj, f) for f in fields): + continue + diffs = {f: (getattr(prior, f), getattr(obj, f)) for f in fields if getattr(prior, f) != getattr(obj, f)} + msg = f"{item_type.__name__} natural-key conflict for patient {patient_id}: NK={nk} has conflicting values: {diffs}" + if policy == "error": + raise ValueError(msg) + log.warning(msg) + + def scalar( *, name: str | None = None, @@ -166,6 +198,7 @@ def collection( skip_missing_patients: bool = False, subject_col: str = "SubjectId", strict_schema: bool | None = None, + on_natural_key_conflict: Literal["error", "warn"] = "warn", ) -> Callable[[_F], _F]: """ Decorator: register a method as a collection-domain processor. @@ -187,6 +220,7 @@ def decorator(fn: _F) -> _F: skip_missing_patients=skip_missing_patients, subject_col=subject_col, strict_schema=strict_schema, + on_natural_key_conflict=on_natural_key_conflict, ) setattr(fn, _SPEC_ATTR, spec) return fn @@ -418,6 +452,7 @@ def _run_spec(self, spec: ProcessorSpec) -> None: items_col=spec.items_col, skip_missing_patients=spec.skip_missing_patients, mode=spec.mode, + on_natural_key_conflict=spec.on_natural_key_conflict, ) elif isinstance(spec, SingletonSpec): @@ -595,6 +630,7 @@ def hydrate_collection_field( item_type: type[DomainBase], patients: dict[str, Patient], mode: Literal["replace", "extend"] = "replace", + on_natural_key_conflict: Literal["error", "warn"] = "warn", ) -> None: """ Instantiate collection domain models onto Patient after schema validation. 
@@ -611,6 +647,9 @@ def hydrate_collection_field( item_type: Target domain class (used to resolve Patient attribute). patients: Map of patient_id to Patient instance. mode: "replace" overwrites, "extend" appends to existing collection. + on_natural_key_conflict: "warn" logs a warning when two instances share a natural key + but differ in other field values; "error" raises ValueError. Identical duplicates + (same NK, same data) are assumed to be deduplicated upstream. """ target_attr = Patient.get_attr_for_type(item_type) build = builder or item_type.from_row @@ -627,6 +666,14 @@ def hydrate_collection_field( except Exception as e: raise ValueError(f"{item_type.__name__} collection hydration failed for {sid=}") from e + if item_type.NATURAL_KEY_FIELDS: + _check_natural_key_conflicts( + objs, + patient_id=sid, + item_type=item_type, + policy=on_natural_key_conflict, + ) + if mode == "extend": existing = getattr(patient, target_attr, ()) or () objs = list(existing) + objs diff --git a/tests/harmonization/harmonizers/test_base.py b/tests/harmonization/harmonizers/test_base.py index 067be89..8006e1b 100644 --- a/tests/harmonization/harmonizers/test_base.py +++ b/tests/harmonization/harmonizers/test_base.py @@ -491,6 +491,80 @@ def builder(pid, row): assert [i.name for i in all_built] == ["a", "b"] +class TestNaturalKeyConflictDetection: + """Uses MedicalHistory (NATURAL_KEY_FIELDS = (start_date, sequence_id)) as the test domain.""" + + def _mh_row(self, *, start_date, sequence_id, term="hypertension", end_date=None, status=None, status_code=None): # noqa + return { + "term": term, + "sequence_id": sequence_id, + "start_date": start_date, + "end_date": end_date, + "status": status, + "status_code": status_code, + } + + def _packed(self, items): # noqa + return pl.DataFrame({"SubjectId": ["p1"], "items": [items]}) + + def test_identical_duplicates_pass_silently(self, caplog): + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [ + 
self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1), + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1), + ] + with caplog.at_level("WARNING", logger="omop_etl.harmonization.harmonizers.base"): + BaseHarmonizer.hydrate_collection_field( + self._packed(items), + item_type=MedicalHistory, + patients=patients, + ) + assert not any("natural-key conflict" in r.message for r in caplog.records) + + def test_conflicting_data_logs_warning_by_default(self, caplog): + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [ + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="hypertension"), + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="diabetes"), + ] + with caplog.at_level("WARNING", logger="omop_etl.harmonization.harmonizers.base"): + BaseHarmonizer.hydrate_collection_field( + self._packed(items), + item_type=MedicalHistory, + patients=patients, + ) + warnings = [r for r in caplog.records if "natural-key conflict" in r.message] + assert len(warnings) == 1 + assert "p1" in warnings[0].message + assert "term" in warnings[0].message + + def test_conflicting_data_raises_under_error_policy(self): + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [ + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="hypertension"), + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="diabetes"), + ] + with pytest.raises(ValueError, match="natural-key conflict"): + BaseHarmonizer.hydrate_collection_field( + self._packed(items), + item_type=MedicalHistory, + patients=patients, + on_natural_key_conflict="error", + ) + + def test_empty_natural_key_skips_check(self, mock_simple_domain_attr, caplog): + """Domains without NATURAL_KEY_FIELDS bypass the conflict check entirely.""" + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [{"name": "a", "value": 1}, {"name": "a", "value": 2}] + with caplog.at_level("WARNING", 
logger="omop_etl.harmonization.harmonizers.base"): + BaseHarmonizer.hydrate_collection_field( + pl.DataFrame({"SubjectId": ["p1"], "items": [items]}), + item_type=SimpleDomain, + patients=patients, + ) + assert not any("natural-key conflict" in r.message for r in caplog.records) + + class TestHydrateScalar: def test_sets_scalar_attribute(self): """Scalar value should be set on patient.""" From 3debb722753beca37053e589cbcad1ccf08dc78c Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Tue, 12 May 2026 12:44:55 +0200 Subject: [PATCH 12/23] feat: Not Evaluable tumor assessments included to measurement table --- main.py | 4 +- src/omop_etl/omop/builders/measurement.py | 105 ++++++++++++++++-- .../static_mapped/static_mapping.csv | 10 +- .../static_mapped/structural_mapping.csv | 3 + tests/omop/builders/test_measurement.py | 43 ++++++- tests/omop/conftest.py | 14 ++- 6 files changed, 161 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 4323338..4faa26c 100644 --- a/main.py +++ b/main.py @@ -62,7 +62,7 @@ def run_pipeline(preprocessing_input: Path, base_root: Path, trial: str = "IMPRE meta=_meta, ) - print(f"Harmonized: {harmonized_result.patients[0:10]}") + # print(f"Harmonized: {harmonized_result.patients[0:10]}") # run semantic mapping semantic_mapper = SemanticService(outdir=base_root, layout=Layout.TRIAL_TIMESTAMP_RUN) @@ -89,7 +89,7 @@ def run_pipeline(preprocessing_input: Path, base_root: Path, trial: str = "IMPRE tables: OmopTables = omop_service.build(harmonized_result.patients) # todo: remove - print(f"built tables: {tables}") + # print(f"built tables: {tables}") # export concept lookup tracking (missed lookups, coverage stats) concept_service.export(formats="csv") diff --git a/src/omop_etl/omop/builders/measurement.py b/src/omop_etl/omop/builders/measurement.py index e8a07cb..8927f0a 100644 --- a/src/omop_etl/omop/builders/measurement.py +++ b/src/omop_etl/omop/builders/measurement.py @@ -379,10 
+379,13 @@ def _build_tumor_assessment_rows( # tumor assessment response rows recist = tumor_assessments.recist_response if recist is not None: - concept = self.concepts.lookup_static("response_recist", recist, domains={OmopDomain.MEASUREMENTS}) - if concept is None: + recist_response_concept = self.concepts.lookup_static("response_recist", recist, domains={OmopDomain.MEASUREMENTS}) + recist_not_evaluable_concept = self.concepts.lookup_static("response_recist", recist, domains={OmopDomain.MEAS_VALUE}) + + if recist_response_concept is None and recist_not_evaluable_concept is None: log.warning("No response_recist mapping for %r (patient %s)", recist, patient.patient_id) - else: + + if recist_response_concept: rows.append( MeasurementRow( measurement_id=self.generate_row_id( @@ -392,7 +395,7 @@ def _build_tumor_assessment_rows( *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(concept.concept_id), + measurement_concept_id=recist_response_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -401,12 +404,39 @@ def _build_tumor_assessment_rows( ) ) + if recist_not_evaluable_concept: + recist_concept = self.concepts.lookup_structural("response_recist") + if recist_concept is None: + log.warning("No structural concept found for response_recist") + else: + rows.append( + MeasurementRow( + measurement_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.TUMOR_ASSESSMENTS, + TumorAssessment.Fields.RECIST_RESPONSE, + *tumor_assessments.natural_key(), + ), + person_id=person_id, + measurement_concept_id=recist_concept.concept_id, + value_as_concept_id=recist_not_evaluable_concept.concept_id, + measurement_date=date, + measurement_datetime=datetime_value, + measurement_type_concept_id=ecrf_concept, + visit_occurrence_id=visit_occurrence_id, + measurement_source_value=recist[:50], + ) + ) + irecist = tumor_assessments.irecist_response if irecist is not 
None: - concept = self.concepts.lookup_static("response_irecist", irecist, domains={OmopDomain.MEASUREMENTS}) - if concept is None: + irecist_response_concept = self.concepts.lookup_static("response_irecist", irecist, domains={OmopDomain.MEASUREMENTS}) + irecist_not_evaluable_concept = self.concepts.lookup_static("response_irecist", irecist, domains={OmopDomain.MEAS_VALUE}) + + if irecist_response_concept is None and irecist_not_evaluable_concept is None: log.warning("No response_irecist mapping for %r (patient %s)", irecist, patient.patient_id) - else: + + if irecist_response_concept: rows.append( MeasurementRow( measurement_id=self.generate_row_id( @@ -416,7 +446,7 @@ def _build_tumor_assessment_rows( *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(concept.concept_id), + measurement_concept_id=irecist_response_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -425,12 +455,39 @@ def _build_tumor_assessment_rows( ) ) + if irecist_not_evaluable_concept: + irecist_concept = self.concepts.lookup_structural("response_irecist") + if irecist_concept is None: + log.warning("No structural concept found for response_irecist") + else: + rows.append( + MeasurementRow( + measurement_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.TUMOR_ASSESSMENTS, + TumorAssessment.Fields.IRECIST_RESPONSE, + *tumor_assessments.natural_key(), + ), + person_id=person_id, + measurement_concept_id=irecist_concept.concept_id, + value_as_concept_id=irecist_not_evaluable_concept.concept_id, + measurement_date=date, + measurement_datetime=datetime_value, + measurement_type_concept_id=ecrf_concept, + visit_occurrence_id=visit_occurrence_id, + measurement_source_value=irecist[:50], + ) + ) + rano = tumor_assessments.rano_response if rano is not None: - concept = self.concepts.lookup_static("response_rano", rano, domains={OmopDomain.MEASUREMENTS}) - if concept is None: + 
rano_response_concept = self.concepts.lookup_static("response_rano", rano, domains={OmopDomain.MEASUREMENTS}) + rano_not_evaluable_concept = self.concepts.lookup_static("response_rano", rano, domains={OmopDomain.MEAS_VALUE}) + + if rano_response_concept is None and rano_not_evaluable_concept is None: log.warning("No response_rano mapping for %r (patient %s)", rano, patient.patient_id) - else: + + if rano_response_concept: rows.append( MeasurementRow( measurement_id=self.generate_row_id( @@ -440,7 +497,7 @@ def _build_tumor_assessment_rows( *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(concept.concept_id), + measurement_concept_id=rano_response_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -449,6 +506,30 @@ def _build_tumor_assessment_rows( ) ) + if rano_not_evaluable_concept: + rano_concept = self.concepts.lookup_structural("response_rano") + if rano_concept is None: + log.warning("No structural concept found for response_rano") + else: + rows.append( + MeasurementRow( + measurement_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.TUMOR_ASSESSMENTS, + TumorAssessment.Fields.RANO_RESPONSE, + *tumor_assessments.natural_key(), + ), + person_id=person_id, + measurement_concept_id=rano_concept.concept_id, + value_as_concept_id=rano_not_evaluable_concept.concept_id, + measurement_date=date, + measurement_datetime=datetime_value, + measurement_type_concept_id=ecrf_concept, + visit_occurrence_id=visit_occurrence_id, + measurement_source_value=rano[:50], + ) + ) + return rows def _build_c30_rows( diff --git a/src/omop_etl/resources/static_mapped/static_mapping.csv b/src/omop_etl/resources/static_mapped/static_mapping.csv index 25c37ec..56ab27f 100644 --- a/src/omop_etl/resources/static_mapped/static_mapping.csv +++ b/src/omop_etl/resources/static_mapped/static_mapping.csv @@ -21,15 +21,21 @@ 
ecog_code,5,45880868,LA9627-6,Dead,Answer,Standard,Valid,Meas Value,LOINC tumor_assessment_baseline,iRecist 1.1,734318,iRECIST,iRECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier tumor_assessment_baseline,RECIST 1.1,734317,RECIST,RECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier tumor_assessment_baseline,RANO (for Glioblastoma),734345,RANO,RANO finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier +response_recist,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_recist,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_recist,Stable Disease (SD),1634680,1.1_RECIST-SD,RECIST 1.1: stable disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Progressive Disease (PD),1633597,1.1_RECIST-PD,RECIST 1.1: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Complete Response (CR),1634772,1.1_RECIST-CR,RECIST 1.1: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Partial Response (PR),1633368,1.1_RECIST-PR,RECIST 1.1: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier -response_irecist,iComplete Response (CR),1633954,iRECIST-CR,iRECIST: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier -response_irecist,iPartial Response (PR),1635284,iRECIST-PR,iRECIST: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier +response_irecist,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_irecist,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_irecist,iComplete Response,1633954,iRECIST-CR,iRECIST: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier +response_irecist,iPartial 
Response,1635284,iRECIST-PR,iRECIST: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iStable disease,1635887,iRECIST-SD,iRECIST: stable disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iConfirmed progressive disease,1633423,iRECIST-PD,iRECIST: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iUnconfirmed progressive disease,1633423,iRECIST-PD,iRECIST: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier +response_rano,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_rano,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_rano,Complete Response (CR),1634853,RANO-CR,RANO: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_rano,Partial Response (PR),1634574,RANO-PR,RANO: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_rano,Stable Disease (SD),1633447,RANO-SD,RANO: stable disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier diff --git a/src/omop_etl/resources/static_mapped/structural_mapping.csv b/src/omop_etl/resources/static_mapped/structural_mapping.csv index db3bb2b..1207c80 100644 --- a/src/omop_etl/resources/static_mapped/structural_mapping.csv +++ b/src/omop_etl/resources/static_mapped/structural_mapping.csv @@ -2,6 +2,9 @@ value_set,omop_concept_id,omop_concept_code,omop_concept_name,omop_concept_class ecog,36305384,89247-1,ECOG Performance Status score,Clinical Observation,Standard,Valid,Measurement,LOINC lesion_size,4084390,246116008,Lesion size,Observable Entity,Standard,Valid,Measurement,SNOMED number_of_lesions,4085855,246206008,Number of lesions,Observable Entity,Standard,Valid,Observation,SNOMED +response_recist,734317,RECIST,RECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier 
+response_irecist,734318,iRECIST,iRECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier +response_rano,734345,RANO,RANO finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier c30_q1,701340,OMOP5117524,"EORTC Quality of Life Questionnaire: Do you have any trouble doing strenuous activities, like carrying a heavy shopping bag or a suitcase?",Staging / Scales,Standard,Valid,Measurement,OMOP Extension c30_q2,701341,OMOP5117525,EORTC Quality of Life Questionnaire: Do you have any trouble taking a long walk?,Staging / Scales,Standard,Valid,Measurement,OMOP Extension c30_q3,701342,OMOP5117526,EORTC Quality of Life Questionnaire: Do you have any trouble taking a short walk outside of the house?,Staging / Scales,Standard,Valid,Measurement,OMOP Extension diff --git a/tests/omop/builders/test_measurement.py b/tests/omop/builders/test_measurement.py index 2462f67..61f016d 100644 --- a/tests/omop/builders/test_measurement.py +++ b/tests/omop/builders/test_measurement.py @@ -302,7 +302,7 @@ def test_irecist_with_divergent_source_string(self, static_index, structural_ind def test_unmapped_response_is_skipped(self, static_index, structural_index): patient = create_patient(PID, TRIAL) patient.tumor_assessments = [ - _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="Not Evaluable (NE)"), + _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="invalid"), ] rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(create_build_context(patient, PERSON_ID)) @@ -312,6 +312,47 @@ def test_unmapped_response_is_skipped(self, static_index, structural_index): assert rows[0].measurement_concept_id == 4084390 assert rows[0].value_as_number == 28.987 + def test_only_not_evaluable(self, static_index, structural_index): + patient = create_patient(PID, TRIAL) + patient.tumor_assessments = [ + _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="Not Evaluable 
(NE)"), + ] + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(create_build_context(patient, PERSON_ID)) + + # size row and Not Evaluable row + assert len(rows) == 2 + assert rows[0].measurement_concept_id == 4084390 # lesion size + assert rows[0].value_as_number == 28.987 + assert rows[1].measurement_concept_id == 734317 # RECIST structural + assert rows[1].value_as_concept_id == 45878793 # NE qualifier + assert rows[1].value_as_number is None + + def test_not_evaluable_and_evaluable_produce_four_rows(self, static_index, structural_index): + patient = create_patient(PID, TRIAL) + patient.tumor_assessments = [ + _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="Stable Disease (SD)"), + _make_tumor_assessments(dt.date(2040, 12, 22), "V06", size=300.0, recist="Not Evaluable"), + ] + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(create_build_context(patient, PERSON_ID)) + + # each TumorAssessment produces its own size row + its recist row: 4 total + assert len(rows) == 4 + + # V05: size & precoordinated SD response + assert rows[0].measurement_concept_id == 4084390 # lesion size + assert rows[0].value_as_number == 28.987 + assert rows[1].measurement_concept_id == 1634680 # RECIST SD precoordinated + assert rows[1].value_as_concept_id is None + + # V06: size & NE (structural RECIST and NE qualifier) + assert rows[2].measurement_concept_id == 4084390 + assert rows[2].value_as_number == 300.0 + assert rows[3].measurement_concept_id == 734317 + assert rows[3].value_as_concept_id == 45878793 + assert rows[3].value_as_number is None + def test_missing_date_returns_empty_for_instance(self, static_index, structural_index): patient = create_patient(PID, TRIAL) ta = TumorAssessment(PID) diff --git a/tests/omop/conftest.py b/tests/omop/conftest.py index cf04d13..1fbb208 100644 --- a/tests/omop/conftest.py +++ b/tests/omop/conftest.py @@ -131,6 +131,9 @@ def 
structural_index() -> dict[str, StructuralConcept]: "c30_q29": _structural("c30_q29", 701367, "measurement"), # EQ5D VAS "eq5d_qol_score": _structural("eq5d_qol_score", 42537274, "measurement"), + "response_recist": _structural("response_recist", 734317, "measurement"), + "response_irecist": _structural("response_irecist", 734318, "measurement"), + "response_ranop": _structural("response_rano", 734345, "measurement"), } @@ -165,16 +168,25 @@ def static_index() -> dict[tuple[str, str], StaticConcept]: ("eq5d_q2_answer_code", "3"): _static("eq5d_q2_answer_code", "3", 742353, "measurement"), ("eq5d_q2_answer_code", "4"): _static("eq5d_q2_answer_code", "4", 742354, "measurement"), ("eq5d_q2_answer_code", "5"): _static("eq5d_q2_answer_code", "5", 742355, "measurement"), - # tumor-response scales + # tumor-response scales: + # recist + ("response_recist", "not evaluable"): _static("response_recist", "Not evaluable", 45878793, "Meas value"), + ("response_recist", "not evaluable (ne)"): _static("response_recist", "Not evaluable (NE)", 45878793, "Meas value"), ("response_recist", "complete response (cr)"): _static("response_recist", "complete response (cr)", 1634772, "measurement"), ("response_recist", "partial response (pr)"): _static("response_recist", "partial response (pr)", 1633368, "measurement"), ("response_recist", "stable disease (sd)"): _static("response_recist", "stable disease (sd)", 1634680, "measurement"), ("response_recist", "progressive disease (pd)"): _static("response_recist", "progressive disease (pd)", 1633597, "measurement"), + # irecist + ("response_irecist", "not evaluable"): _static("response_irecist", "Not evaluable", 45878793, "Meas value"), + ("response_irecist", "not evaluable (ne)"): _static("response_irecist", "Not evaluable (NE)", 45878793, "Meas value"), ("response_irecist", "icomplete response (cr)"): _static("response_irecist", "icomplete response (cr)", 1633954, "measurement"), ("response_irecist", "ipartial response (pr)"): 
_static("response_irecist", "ipartial response (pr)", 1635284, "measurement"), ("response_irecist", "istable disease"): _static("response_irecist", "istable disease", 1635887, "measurement"), ("response_irecist", "iconfirmed progressive disease"): _static("response_irecist", "iconfirmed progressive disease", 1633423, "measurement"), ("response_irecist", "iunconfirmed progressive disease"): _static("response_irecist", "iunconfirmed progressive disease", 1633423, "measurement"), + # rano + ("response_rano", "not evaluable"): _static("response_rano", "Not evaluable", 45878793, "Meas value"), + ("response_rano", "not evaluable (ne)"): _static("response_rano", "Not evaluable (NE)", 45878793, "Meas value"), ("response_rano", "complete response (cr)"): _static("response_rano", "complete response (cr)", 1634853, "measurement"), ("response_rano", "partial response (pr)"): _static("response_rano", "partial response (pr)", 1634574, "measurement"), ("response_rano", "stable disease (sd)"): _static("response_rano", "stable disease (sd)", 1633447, "measurement"), From ba32870908b5793a5c16c76c562c0e5972c2d356 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Tue, 12 May 2026 12:47:04 +0200 Subject: [PATCH 13/23] chore: docstring update --- src/omop_etl/omop/builders/measurement.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/omop_etl/omop/builders/measurement.py b/src/omop_etl/omop/builders/measurement.py index 8927f0a..39b7f88 100644 --- a/src/omop_etl/omop/builders/measurement.py +++ b/src/omop_etl/omop/builders/measurement.py @@ -341,6 +341,10 @@ def _build_tumor_assessment_rows( measurement_concept_id stores both scale and answer (same pattern as EQ5D), value_as_concept_id stays NULL. 
+ If response scale is Not Evaluable, use separate branch with structural lookup + for measurement concept id and value as concept id is then the NE Meas Value response concept, + so any Meas Value concept for this lookup key means assessment was Not Evaluable. + If date is missing the instance is skipped entirely and no rows are emitted. """ date = tumor_assessments.date From f1bfca89aebfdb705818f191887811efeea827c9 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Tue, 12 May 2026 14:04:37 +0200 Subject: [PATCH 14/23] feat: updated static mapping and structural and static lookups no longer log filter-rejects as missed concepts --- src/omop_etl/concept_mapping/core/models.py | 2 +- src/omop_etl/concept_mapping/service.py | 4 ++-- .../static_mapped/static_mapping.csv | 3 +++ tests/concept_mapping/test_service.py | 21 ++++++++++++++----- 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/omop_etl/concept_mapping/core/models.py b/src/omop_etl/concept_mapping/core/models.py index 32f4fc5..4caa323 100644 --- a/src/omop_etl/concept_mapping/core/models.py +++ b/src/omop_etl/concept_mapping/core/models.py @@ -6,7 +6,7 @@ def _norm(v: str | None) -> str: - """Lowercase + strip a CSV value, defaulting None to empty string.""" + """Lowercase and strip a CSV value, defaulting None to empty string.""" return (v or "").lower().strip() diff --git a/src/omop_etl/concept_mapping/service.py b/src/omop_etl/concept_mapping/service.py index ccba401..9db0307 100644 --- a/src/omop_etl/concept_mapping/service.py +++ b/src/omop_etl/concept_mapping/service.py @@ -157,7 +157,7 @@ def lookup_static( validity=c.validity, ) if not _concept_matches_filter(concept, domains, vocabs, validity): - self._result.record_miss("static", value_set, local_value) + # concept mapped but rejected by filter return None self._result.record_match("static", value_set, local_value, concept) @@ -198,7 +198,7 @@ def lookup_structural( 
validity=c.validity, ) if not _concept_matches_filter(concept, domains, vocabs, validity): - self._result.record_miss("structural", value_set, "") + # concept mapped but rejected by filter return None self._result.record_match("structural", value_set, "", concept) diff --git a/src/omop_etl/resources/static_mapped/static_mapping.csv b/src/omop_etl/resources/static_mapped/static_mapping.csv index 56ab27f..eb458cc 100644 --- a/src/omop_etl/resources/static_mapped/static_mapping.csv +++ b/src/omop_etl/resources/static_mapped/static_mapping.csv @@ -22,12 +22,14 @@ tumor_assessment_baseline,iRecist 1.1,734318,iRECIST,iRECIST finding,Staging/Gra tumor_assessment_baseline,RECIST 1.1,734317,RECIST,RECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier tumor_assessment_baseline,RANO (for Glioblastoma),734345,RANO,RANO finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier response_recist,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_recist,Not evaluated,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_recist,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_recist,Stable Disease (SD),1634680,1.1_RECIST-SD,RECIST 1.1: stable disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Progressive Disease (PD),1633597,1.1_RECIST-PD,RECIST 1.1: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Complete Response (CR),1634772,1.1_RECIST-CR,RECIST 1.1: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Partial Response (PR),1633368,1.1_RECIST-PR,RECIST 1.1: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_irecist,Not 
evaluated,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_irecist,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_irecist,iComplete Response,1633954,iRECIST-CR,iRECIST: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iPartial Response,1635284,iRECIST-PR,iRECIST: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier @@ -35,6 +37,7 @@ response_irecist,iStable disease,1635887,iRECIST-SD,iRECIST: stable disease,Stag response_irecist,iConfirmed progressive disease,1633423,iRECIST-PD,iRECIST: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iUnconfirmed progressive disease,1633423,iRECIST-PD,iRECIST: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_rano,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_rano,Not evaluated,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_rano,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_rano,Complete Response (CR),1634853,RANO-CR,RANO: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_rano,Partial Response (PR),1634574,RANO-PR,RANO: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier diff --git a/tests/concept_mapping/test_service.py b/tests/concept_mapping/test_service.py index 94ad9e4..5a8d272 100644 --- a/tests/concept_mapping/test_service.py +++ b/tests/concept_mapping/test_service.py @@ -116,14 +116,25 @@ def test_domain_filter_is_case_insensitive(self, static_index): result = service.lookup_static("sex", "M", domains={domain_filter}) assert result is not None, f"filter {domain_filter!r} should match" - def test_domain_filter_miss_is_case_insensitive(self, static_index): - 
"""Wrong-domain filter misses regardless of case.""" + def test_domain_filter_rejects_regardless_of_case(self, static_index): + """Wrong-domain filter returns None regardless of case.""" service = ConceptLookupService(static_index=static_index) - result = service.lookup_static("sex", "M", domains={"Procedure"}) + for domain_filter in ("procedure", "PROCEDURE", "Procedure"): + result = service.lookup_static("sex", "M", domains={domain_filter}) + assert result is None, f"filter {domain_filter!r} should reject" + + def test_filter_reject_is_not_recorded_as_miss(self, static_index): + """ + An entry exists in the index but the requested filter rejects it. + This is a caller-side flow event, a builder is asking the wrong domain question, + and not a data-quality gap, so it must not be recorded in the missed-lookup log. + """ + service = ConceptLookupService(static_index=static_index) - assert result is None - assert len(service.result.missed["static"]) == 1 + service.lookup_static("sex", "M", domains={"Procedure"}) + + assert len(service.result.missed["static"]) == 0 def test_vocab_filter_matches(self, static_index): service = ConceptLookupService(static_index=static_index) From 9789cabb2f95f7287cbefbe324dc21810b58c246 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Tue, 12 May 2026 14:26:09 +0200 Subject: [PATCH 15/23] fix: eot_reason treatment complete mapping updated to observation domain --- src/omop_etl/resources/static_mapped/static_mapping.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omop_etl/resources/static_mapped/static_mapping.csv b/src/omop_etl/resources/static_mapped/static_mapping.csv index eb458cc..41f758b 100644 --- a/src/omop_etl/resources/static_mapped/static_mapping.csv +++ b/src/omop_etl/resources/static_mapped/static_mapping.csv @@ -131,14 +131,14 @@ eq5d_q5_answer_code,3,742368,OMOP5181578,EuroQol five dimension five level anxie 
eq5d_q5_answer_code,4,742369,OMOP5181579,EuroQol five dimension five level anxiety depression score: 4 (I am severely anxious or depressed),Precoordinated pair,Standard,Valid,Measurement,OMOP Extension eq5d_q5_answer_code,5,742370,OMOP5181580,EuroQol five dimension five level anxiety depression score: 5 (I am extremely anxious or depressed),Precoordinated pair,Standard,Valid,Measurement,OMOP Extension eot_reason,Disease progression,1617595,97509-4,Cancer disease progression,Clinical Observation,Standard,Valid,Observation,LOINC -eot_reason,Normal completion according to cohort-specific manual,45884335,LA4511-7,Treatment Completed,Answer,Standard,Valid,Meas Value,LOINC +eot_reason,Normal completion according to cohort-specific manual,44788181,15501000000100,Completed successfully,Qualifier Value,Standard,Valid,Observation,SNOMED eot_reason,Other,35821954,100418-5,Other reason,Answer,Standard,Valid,Observation,UK Biobank eot_reason,Adverse event/Toxicity,441207,62014003,Adverse reaction to drug,Disorder,Standard,Valid,Observation,SNOMED eot_reason,Symptomatic deterioration,4111347,285384003,General health deterioration,Clinical Finding,Standard,Valid,Observation,SNOMED eot_reason,Patient refusal,45773084,703427001,Refusal of treatment by patient against dental advice,Context-dependent,Standard,Valid,Observation,SNOMED eot_reason,Withdrawn by PI,44810920,871401000000109,Withdrawn from research study,Clinical Finding,Standard,Valid,Observation,SNOMED eot_reason_code,2,1617595,97509-4,Cancer disease progression,Clinical Observation,Standard,Valid,Observation,LOINC -eot_reason_code,7,45884335,LA4511-7,Treatment Completed,Answer,Standard,Valid,Meas Value,LOINC +eot_reason_code,7,44788181,15501000000100,Completed successfully,Qualifier Value,Standard,Valid,Observation,SNOMED eot_reason_code,88,35821954,100418-5,Other reason,Answer,Standard,Valid,Observation,UK Biobank eot_reason_code,1,441207,62014003,Adverse reaction to drug,Disorder,Standard,Valid,Observation,SNOMED 
eot_reason_code,6,4111347,285384003,General health deterioration,Clinical Finding,Standard,Valid,Observation,SNOMED From 1c1e2ebf0413f292749ae62df44e6e0b773cdf14 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Tue, 12 May 2026 17:40:34 +0200 Subject: [PATCH 16/23] feat: fk linkage from condition occurrance, mapping update, fk linkage tests --- src/omop_etl/omop/builders/base.py | 1 + .../omop/builders/condition_occurrence.py | 33 +++- .../semantic_mapped/braf_non-v600_mapped.csv | 2 +- .../builders/test_condition_occurrence.py | 167 ++++++++++++++++++ tests/omop/builders/test_measurement.py | 2 +- 5 files changed, 199 insertions(+), 6 deletions(-) diff --git a/src/omop_etl/omop/builders/base.py b/src/omop_etl/omop/builders/base.py index 1e67a4c..f1c1594 100644 --- a/src/omop_etl/omop/builders/base.py +++ b/src/omop_etl/omop/builders/base.py @@ -20,6 +20,7 @@ class BuildContext: patient: Patient person_id: int visit_id_by_date: dict[dt.date, int] = field(default_factory=dict) + condition_id_by_ae_sequence_id: dict[int, int] = field(default_factory=dict) class OmopBuilder(ABC, Generic[T]): diff --git a/src/omop_etl/omop/builders/condition_occurrence.py b/src/omop_etl/omop/builders/condition_occurrence.py index 3257baa..8d259e2 100644 --- a/src/omop_etl/omop/builders/condition_occurrence.py +++ b/src/omop_etl/omop/builders/condition_occurrence.py @@ -1,6 +1,7 @@ from typing import ClassVar from logging import getLogger +from omop_etl.concept_mapping.service import ConceptLookupService from omop_etl.harmonization.models.patient import Patient from omop_etl.harmonization.models.domain.tumor_type import TumorType from omop_etl.harmonization.models.domain.medical_history import MedicalHistory @@ -22,7 +23,12 @@ class ConditionOccurrenceBuilder(OmopBuilder[ConditionOccurrenceRow]): table_name: ClassVar[str] = "condition_occurrence" + def __init__(self, concepts: ConceptLookupService): + super().__init__(concepts) + 
self._ae_to_condition_id: dict[int, int] = {} + def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: + self._ae_to_condition_id = {} patient = ctx.patient person_id = ctx.person_id rows: list[ConditionOccurrenceRow] = [] @@ -40,6 +46,13 @@ def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: return rows + def populate_context(self, rows: list[ConditionOccurrenceRow], ctx: BuildContext) -> None: + """ + Publish AE.sequence_id to condition_occurrence_id from accumulated mapping from build, + so other builders (e.g. Observation) can set field concept/event ids from AEs (e.g. was_serious, turned_serious_data). + """ + ctx.condition_id_by_ae_sequence_id.update(self._ae_to_condition_id) + def _build_tumor_type_rows( self, patient: Patient, @@ -157,9 +170,13 @@ def _build_adverse_event_rows( log.warning("Skipping adverse event %d for %s: missing start_date", index, patient.patient_id) return [] - sequence_id = ae.sequence_id if ae.sequence_id else None - if not sequence_id: - log.warning("medical history for %s is missing sequence_id", patient.patient_id) + sequence_id = ae.sequence_id + if sequence_id is None: + log.warning( + "Adverse event %d for %s is missing sequence_id, emitting row but no FK link will be published for observation_event_id", + index, + patient.patient_id, + ) matches = self.concepts.lookup_semantic( patient.patient_id, @@ -170,7 +187,7 @@ def _build_adverse_event_rows( if not matches: return [] - return [ + ae_rows = [ ConditionOccurrenceRow( condition_occurrence_id=self.generate_row_id( patient.patient_id, @@ -187,3 +204,11 @@ def _build_adverse_event_rows( ) for concept in matches ] + + # accumulate AE.sequence_id to first emitted condition_occurrence_id + # multi-concept AE links to the first row deterministically + # AEs without sequence_id are warned above and are emitted without linkage + if sequence_id is not None: + self._ae_to_condition_id[sequence_id] = ae_rows[0].condition_occurrence_id + + return ae_rows 
diff --git a/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv b/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv index 922ff2b..cc24ec3 100644 --- a/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv +++ b/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv @@ -173,7 +173,7 @@ fc4de2f6-c822-542e-8392-53cf03842961,CM_CMTRT,Cimetidin,1,997276,2541,cimetidine 9261aa9a-b652-5d50-b109-b5c92b058ae2,COH_COHCTN,BRAF Non-V600 activating mutations,12,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC a683d456-51bb-52f1-8dcb-b88cfe79add6,COH_COHCTN,BRAF Non-V600activating mutations,1,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC 8d9cba99-2d8f-5254-8226-cc57d315adc7,COH_COHTMN,BRAF Non-V600 activating mutations,13,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC -4bf9b6d6-5f7e-5c8e-b424-16b4885158df,COH_GENMUT1,BRAF activating mutations,11,13,3039156,53844-7,"BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test",Standard,Valid,Measurement,LOINC +4bf9b6d6-5f7e-5c8e-b424-16b4885158df,COH_GENMUT1,BRAF activating mutations,13,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC 2e020752-c463-5744-9859-8dd3e864d528,COH_GENMUT1,Other,2,4205432,55446002,Genetic mutation,Clinical Finding,Standard,Valid,Condition,SNOMED 16634bc9-9c93-5cd8-b149-34d0894a2e4b,COH_COHTT,Pancreatic cancer,3,4180793,363418001,Malignant tumor of pancreas,Disorder,Standard,Valid,Condition,SNOMED 88b3fa87-2b35-5bee-adf1-39ed0e10c828,COH_COHTT,Cholangiocarcinoma,3,4208660,312104005,Cholangiocarcinoma of biliary 
tract,Disorder,Standard,Valid,Condition,SNOMED diff --git a/tests/omop/builders/test_condition_occurrence.py b/tests/omop/builders/test_condition_occurrence.py index 97fe929..ce2c714 100644 --- a/tests/omop/builders/test_condition_occurrence.py +++ b/tests/omop/builders/test_condition_occurrence.py @@ -426,3 +426,170 @@ def test_row_ids_are_deterministic(self, static_index, structural_index): rows_b = ConditionOccurrenceBuilder(concepts).build(create_build_context(patient, PERSON_ID)) assert rows_a[0].condition_occurrence_id == rows_b[0].condition_occurrence_id + + +class TestAdverseEventFKLinkage: + """ + CDM 5.4 observation_event_id linkage: ConditionOccurrenceBuilder publishes + each AE's sequence_id, condition_occurrence_id into BuildContext so + ObservationBuilder can attribute was_serious & turned_serious_date back to + the AE's condition row. + """ + + def _ae_semantic(self, leaf_index: int, concept_id: int, name: str) -> SemanticEntry: # noqa + return SemanticEntry( + patient_id=PID, + field_path=(Patient.Collections.ADVERSE_EVENTS, AdverseEvent.Fields.TERM), + leaf_index=leaf_index, + concept_id=concept_id, + name=name, + domain="condition", + ) + + def test_publishes_link_when_sequence_id_set(self, static_index, structural_index): + semantic = create_semantic_index(self._ae_semantic(0, 437663, "fever")) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 42 + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 1 + assert ctx.condition_id_by_ae_sequence_id == {42: rows[0].condition_occurrence_id} + + def test_no_link_when_sequence_id_missing_but_row_still_emitted(self, static_index, structural_index, caplog): + """AE without sequence_id: row is emitted, but 
produces no FK entry and warns.""" + import logging + + semantic = create_semantic_index(self._ae_semantic(0, 437663, "fever")) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + with caplog.at_level(logging.WARNING): + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 1, "AE row must still be emitted when sequence_id is missing" + assert ctx.condition_id_by_ae_sequence_id == {} + assert any("missing sequence_id" in rec.message for rec in caplog.records) + + def test_no_link_when_no_semantic_match(self, static_index, structural_index): + """AE with sequence_id but no semantic match emits no row and no FK entry.""" + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "UnmappedTerm" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 7 + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert rows == [] + assert ctx.condition_id_by_ae_sequence_id == {} + + def test_multi_ae_each_linked_by_sequence_id(self, static_index, structural_index): + """Multiple AEs get their own FK entry keyed by their sequence_id.""" + semantic = create_semantic_index( + self._ae_semantic(0, 437663, "fever"), + self._ae_semantic(1, 4329847, "nausea"), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + + ae1 = AdverseEvent(patient_id=PID) + ae1.term = "Fever" + ae1.start_date = dt.date(2023, 3, 1) + ae1.sequence_id = 1 + + ae2 = AdverseEvent(patient_id=PID) + ae2.term = "Nausea" + ae2.start_date = dt.date(2023, 4, 1) + ae2.sequence_id = 
2 + + patient.adverse_events = [ae1, ae2] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 2 + # both sequence_ids present and pointing to existing row ids + emitted_ids = {r.condition_occurrence_id for r in rows} + assert set(ctx.condition_id_by_ae_sequence_id.keys()) == {1, 2} + assert set(ctx.condition_id_by_ae_sequence_id.values()).issubset(emitted_ids) + + def test_mixed_seq_id_present_and_missing(self, static_index, structural_index): + """One AE with sequence_id and one without: only the first is linked, both emit rows.""" + semantic = create_semantic_index( + self._ae_semantic(0, 437663, "fever"), + self._ae_semantic(1, 4329847, "nausea"), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + + ae1 = AdverseEvent(patient_id=PID) + ae1.term = "Fever" + ae1.start_date = dt.date(2023, 3, 1) + ae1.sequence_id = 1 + + ae2 = AdverseEvent(patient_id=PID) + ae2.term = "Nausea" + ae2.start_date = dt.date(2023, 4, 1) + + patient.adverse_events = [ae1, ae2] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 2 + assert set(ctx.condition_id_by_ae_sequence_id.keys()) == {1} + + def test_multi_concept_ae_links_to_first_row(self, static_index, structural_index): + """When one AE term maps to multiple condition concepts, FK links to the first emitted row.""" + semantic = create_semantic_index( + self._ae_semantic(0, 437663, "fever"), + self._ae_semantic(0, 999999, "alternative fever concept"), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 99 + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + rows = 
ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 2 + # one FK entry: pointing to the first emitted row + assert ctx.condition_id_by_ae_sequence_id == {99: rows[0].condition_occurrence_id} + + def test_fk_publication_deterministic(self, static_index, structural_index): + """Two independent builds of the same patient produce identical FK state.""" + semantic = create_semantic_index(self._ae_semantic(0, 437663, "fever")) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 5 + patient.adverse_events = [ae] + + ctx_a = create_build_context(patient, PERSON_ID) + ctx_b = create_build_context(patient, PERSON_ID) + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_a) + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_b) + + assert ctx_a.condition_id_by_ae_sequence_id == ctx_b.condition_id_by_ae_sequence_id + assert ctx_a.condition_id_by_ae_sequence_id != {} diff --git a/tests/omop/builders/test_measurement.py b/tests/omop/builders/test_measurement.py index 61f016d..65da53f 100644 --- a/tests/omop/builders/test_measurement.py +++ b/tests/omop/builders/test_measurement.py @@ -630,7 +630,7 @@ def test_dimension_uses_precoordinated_concept(self, static_index, structural_in assert row_1.person_id == PERSON_ID assert row_1.measurement_date == dt.date(2040, 5, 1) assert row_1.measurement_datetime == dt.datetime(2040, 5, 1) - assert row_1.measurement_id == 5607913108096982206 + assert row_1.measurement_id == 5607913108096982206 # fixme: assert on expected hash from collection's natural key instead # q2 level 5 row_2 = rows[1] From 19ae7ab1e917b9b147d07b9db7878edabda428a4 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Tue, 19 May 2026 17:16:36 +0200 Subject: [PATCH 17/23] observation builder --- 
.../harmonization/harmonizers/impress.py | 68 +- src/omop_etl/harmonization/models/patient.py | 14 + src/omop_etl/omop/builders/observation.py | 495 +++++++++++-- src/omop_etl/omop/models/rows.py | 26 - src/omop_etl/omop/service.py | 2 + .../static_mapped/static_mapping.csv | 13 +- .../static_mapped/structural_mapping.csv | 3 + .../omop/builders/test_observation_builder.py | 653 ++++++++++++++++++ 8 files changed, 1127 insertions(+), 147 deletions(-) diff --git a/src/omop_etl/harmonization/harmonizers/impress.py b/src/omop_etl/harmonization/harmonizers/impress.py index cebb81b..15c2cc5 100644 --- a/src/omop_etl/harmonization/harmonizers/impress.py +++ b/src/omop_etl/harmonization/harmonizers/impress.py @@ -1,5 +1,4 @@ import re -from deprecated import deprecated import polars as pl from logging import getLogger @@ -180,14 +179,18 @@ def _process_has_clinical_benefit_at_week_16(self) -> pl.DataFrame | None: colname = Patient.Scalars.HAS_CLINICAL_BENEFIT_AT_WEEK_16 timepoint = "V03" + # todo: consider emitting date of the benefit record + benefit = ( self.data.select( "SubjectId", "RA_RATIMRESCD", "RA_RAiMODCD", + "RA_EventId", + "RA_EventDate", "RNRSP_RNRSPCLCD", "RNRSP_EventId", - "RA_EventId", + "RNRSP_EventDate", ) .filter(pl.any_horizontal(pl.all().exclude("SubjectId").is_not_null())) .filter((pl.col("RA_EventId") == timepoint) | (pl.col("RNRSP_EventId") == timepoint)) @@ -221,24 +224,18 @@ def _process_end_of_treatment_reason(self) -> pl.DataFrame | None: def _process_evaluable_for_efficacy_analysis(self) -> pl.DataFrame | None: """ Filtering criteria: - Any patient having valid treatment for sufficient length (21 days IV, 28 days oral). - For IV cycles, the cycle end is modeled as the day before the next cycles start. - Inclusive length = next_start − start days. Length ≥ 21 qualifies. - For oral cycles, length = stop − start days; ≥ 28 qualifies. + Any patient having valid treatment for sufficient length (21 days IV, 28 days oral). 
+ For IV cycles, the cycle end is modeled as the day before the next cycles start. + Inclusive length = next_start − start days. Length ≥ 21 qualifies. + For oral cycles, length = stop − start days; ≥ 28 qualifies. For subjects with oral drugs, the start and end date per cycle is checked directly. - If a subject has any cycle lasting 28 days or more they are marked as having sufficient treatment length + If a subject has any cycle lasting 28 days or more they are marked as having sufficient treatment length For subjects without oral drugs, cycle stop date is set to start date of next cycle and needs to last 21 days or more. - Note: this means subjects with just one cycle are marked as non-evaluable since cycle end cannot be determined. - each cycle is grouped by treatment number, any treatment having a cycle with sufficient length marks subject as evaluable. - assumes no malformed dates, because imputing would change the length. - - Old filteing criteria: - Patients marked as evaluable for efficacy analysis needs to have: - - sufficient treatment length for any cycle (21 days for IV, 28 days for oral) and *either one of*: - - tumor assessment after week 4 (patient has any tumor assessment with EventId==V04 in RA, RCNT, RTNTMNT, RNRSP) - - clinical assessment (patient has stopped treatment: EventDate from EOT sheet) + Note: this means subjects with just one cycle are marked as non-evaluable since cycle end cannot be determined. + each cycle is grouped by treatment number, any treatment having a cycle with sufficient length marks subject as evaluable. + assumes no malformed dates, because imputing would change the length. 
""" colname = Patient.Scalars.EVALUABLE_FOR_EFFICACY_ANALYSIS evaluability_data = self.data.select( @@ -248,16 +245,6 @@ def _process_evaluable_for_efficacy_analysis(self) -> pl.DataFrame | None: "TR_TRTNO", "TR_TRC1_DT", "TR_TRCYNCD", - # not currently used: - # "RA_EventDate", - # "RA_EventId", - # "RNRSP_EventDate", - # "RNRSP_EventId", - # "RCNT_EventDate", - # "RCNT_EventId", - # "RNTMNT_EventDate", - # "RNTMNT_EventId", - # "EOT_EventDate", ) def oral_treatment_lengths() -> pl.DataFrame: @@ -306,35 +293,6 @@ def iv_treatment_lengths() -> pl.DataFrame: return iv_sufficient_treatment_length - @deprecated - def eot_filter() -> pl.DataFrame: - has_ended_treatment = evaluability_data.group_by("SubjectId").agg( - pl.any_horizontal(PolarsParsers.to_optional_utf8(pl.col(["EOT_EventDate"])).str.len_bytes() > 0).any().alias("has_clinical_assessment"), - ) - return has_ended_treatment - - @deprecated - def tumor_assessment() -> pl.DataFrame: - # need to add V04 filter (if this is to be used again) - has_tumor_assessment_week_4 = evaluability_data.group_by("SubjectId").agg( - pl.any_horizontal( - PolarsParsers.to_optional_utf8( - pl.col( - [ - "RA_EventDate", - "RNRSP_EventDate", - "RCNT_EventDate", - "RNTMNT_EventDate", - ], - ), - ).str.len_bytes() - > 0, - ) - .any() - .alias("has_tumor_assessment"), - ) - return has_tumor_assessment_week_4 - def _merge_evaluability() -> pl.DataFrame: base = evaluability_data.select("SubjectId").unique() _merged_df: pl.DataFrame = ( diff --git a/src/omop_etl/harmonization/models/patient.py b/src/omop_etl/harmonization/models/patient.py index dc46890..9df124b 100644 --- a/src/omop_etl/harmonization/models/patient.py +++ b/src/omop_etl/harmonization/models/patient.py @@ -46,6 +46,7 @@ class Scalars: NUMBER_OF_ADVERSE_EVENTS = "number_of_adverse_events" NUMBER_OF_SERIOUS_ADVERSE_EVENTS = "number_of_serious_adverse_events" HAS_CLINICAL_BENEFIT_AT_WEEK_16 = "has_clinical_benefit_at_week_16" + CLINICAL_BENEFIT_AT_WEEK_16_DATE = 
"clinical_benefit_at_week_16_date" END_OF_TREATMENT_REASON = "end_of_treatment_reason" END_OF_TREATMENT_DATE = "end_of_treatment_date" @@ -86,6 +87,7 @@ def __init__(self, patient_id: str, trial_id: str): self._number_of_adverse_events: int | None = None self._number_of_serious_adverse_events: int | None = None self._has_clinical_benefit_at_week_16: bool | None = None + self._clinical_benefit_at_week_16_date: dt.date | None = None self._end_of_treatment_reason: str | None = None self._end_of_treatment_date: dt.date | None = None @@ -277,6 +279,18 @@ def has_clinical_benefit_at_week_16(self, value: bool | None) -> None: validator=StrictValidators.validate_optional_bool, ) + @property + def clinical_benefit_at_week_16_date(self) -> dt.date | None: + return self._clinical_benefit_at_week_16_date + + @clinical_benefit_at_week_16_date.setter + def clinical_benefit_at_week_16_date(self, value: dt.date | None) -> None: + self._set_validated_prop( + prop=self.__class__.clinical_benefit_at_week_16_date, + value=value, + validator=StrictValidators.validate_optional_date, + ) + @property def end_of_treatment_reason(self) -> str | None: return self._end_of_treatment_reason diff --git a/src/omop_etl/omop/builders/observation.py b/src/omop_etl/omop/builders/observation.py index 399dd6d..bea3953 100644 --- a/src/omop_etl/omop/builders/observation.py +++ b/src/omop_etl/omop/builders/observation.py @@ -1,60 +1,435 @@ -# cdm spec - -# so this table should contain all observations that are not populating: -# measurement, drug exposure, condition, procedure occurrence, device occurrence, condition occurrence etc. -# they *cannot* be of domain: Condition, Procedure, Drug, Specimen, Measurement or Device. - -# so from the patient data this leaves what domain classes / patient scalars, and what fields and domains? -# think optimally do this table last, easier that way. 
- -# todo: implement condition and device builders first: -# anything that lands in condition, procedure, drug, measurement or device can't go in observation - -# anyways, can use: -# cohort_name, -# evaluable_for_efficacy_analysis -# has_any_adverse_events -# number_of_adverse_events -# number_of_serious_adverse_events -# has_clinical_benefit_at_week_16 -# end_of_treatment_reason -# end_of_treatment_date -# lost_to_followup (bool, date) -# best_overall_response (but need non-measurement concepts) - -# maybe: -# AE outcome, AE was serious, AE related to treatment status, AE turned serious date, -# tumor assessment date of progression, nadir, etc? -# study drugs? - -""" -ObservationRow: - required fields: - observation_id: unique ID for each entry - person_id: person id from service - observation_concept_id: semantic mapped concept id - There is no specified domain that the Concepts in this table must adhere to. - The only rule is that records with Concepts in the Condition, Procedure, Drug, - Measurement, or Device domains MUST go to the corresponding table. 
- - so we grab - observation_date: date, required - observation_type_concept_id: - - optional fields: - observation_datetime: dt.datetime | None = None - value_as_number: float | None = None - value_as_string: Annotated[str | None, pd_field(max_length=60)] = None - value_as_concept_id: int | None = None - qualifier_concept_id: int | None = None - unit_concept_id: int | None = None - provider_id: int | None = None - visit_occurrence_id: int | None = None - visit_detail_id: int | None = None - observation_source_value: Annotated[str | None, pd_field(max_length=50)] = None - observation_source_concept_id: int | None = None - unit_source_value: Annotated[str | None, pd_field(max_length=50)] = None - qualifier_source_value: Annotated[str | None, pd_field(max_length=50)] = None - value_source_value: Annotated[str | None, pd_field(max_length=50)] = None - observation_event_id: int | None = None - obs_event_field_concept_id: int | None = None -""" +import datetime as dt +from logging import getLogger +from typing import ClassVar + +from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent +from omop_etl.harmonization.models.domain.followup import FollowUp +from omop_etl.harmonization.models.patient import Patient +from omop_etl.omop.builders.base import BuildContext, OmopBuilder +from omop_etl.omop.models.rows import ObservationRow +from omop_etl.semantic_mapping.core.models import OmopDomain + +log = getLogger(__name__) + +_WEEK_16 = dt.timedelta(weeks=16) + + +class ObservationBuilder(OmopBuilder[ObservationRow]): + """ + Builds observation rows from patient scalars, the lost-to-followup singleton, + and adverse-event-derived facts (outcome, was_serious, turned_serious_date). + All observation_concept_id domains must NOT be Condition, Procedure, Drug, + Specimen, Measurement, or Device. + + Three row shapes: + + 1. 
Unmapped source attribute (no topic concept available for the field): + observation_concept_id = 0, observation_source_value = source field name, + value_as_concept_id / value_as_string / value_source_value carry the + normalized + raw source value. Used for evaluable_for_efficacy_analysis, + has_clinical_benefit_at_week_16, end_of_treatment_reason. + + 2. Mapped observation topic (a Standard concept names the topic): + observation_concept_id = topic concept, observation_source_value = + source field name, value_as_concept_id carries the answer/result. + Used for lost_to_followup ("Lost to follow-up" topic). + + 3. AE-derived (shapes 1 or 2, plus FK linkage): + observation_event_id + obs_event_field_concept_id link back to the + condition_occurrence row produced by ConditionOccurrenceBuilder via + BuildContext.condition_id_by_ae_sequence_id. Used for AE outcome, + AE was_serious, AE turned_serious_date. + + Emit policy: a row is only skipped when the source value or a required + date is missing. When a concept lookup misses (topic OR value), the row + is still emitted with concept_id=0 — CDM convention for "result present + in source but unmapped" — and the raw literal is preserved in + value_source_value / observation_source_value so the fact stays + queryable. Yes/No is resolved via the `yes` / `no` structural Meas + Value concepts. 
+ """ + + table_name: ClassVar[str] = "observation" + + def build(self, ctx: BuildContext) -> list[ObservationRow]: + patient = ctx.patient + person_id = ctx.person_id + + ecrf = self.concepts.lookup_structural("ecrf", domains={"Type Concept"}) + observation_type_concept_id = ecrf.concept_id if ecrf else 0 + + rows: list[ObservationRow] = [] + rows.extend(self._build_evaluable(patient, person_id, observation_type_concept_id)) + rows.extend(self._build_clinical_benefit(patient, person_id, observation_type_concept_id)) + rows.extend(self._build_eot_reason(patient, person_id, observation_type_concept_id)) + rows.extend(self._build_lost_to_followup(patient, person_id, observation_type_concept_id)) + + for idx, ae in enumerate(patient.adverse_events): + rows.extend(self._build_ae_outcome(patient, person_id, observation_type_concept_id, ae, idx, ctx)) + rows.extend(self._build_ae_was_serious(patient, person_id, observation_type_concept_id, ae, idx, ctx)) + rows.extend(self._build_ae_turned_serious(patient, person_id, observation_type_concept_id, ae, idx, ctx)) + + return rows + + def _yes_no_concept_id(self, value: bool) -> int: + """ + Resolve True to Yes / False to No via the structural Meas Value + concepts. Returns 0 when the mapping is missing. + """ + concept = self.concepts.lookup_structural("yes" if value else "no", domains={OmopDomain.MEAS_VALUE}) + return concept.concept_id if concept else 0 + + def _bool_observation( + self, + *, + observation_id: int, + person_id: int, + field_name: str, + value: bool, + date: dt.date, + observation_type_concept_id: int, + observation_concept_id: int = 0, + observation_event_id: int | None = None, + obs_event_field_concept_id: int | None = None, + ) -> ObservationRow: + """ + Compose a boolean observation row. Standardizes the source/value + encoding for all boolean fields (evaluable, clinical_benefit, + lost_to_followup, AE was_serious) so the columns can't drift + between sites. 
+ """ + return ObservationRow( + observation_id=observation_id, + person_id=person_id, + observation_concept_id=observation_concept_id, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=self._yes_no_concept_id(value), + observation_source_value=field_name, + observation_source_concept_id=0, + value_source_value=str(value).lower(), + observation_event_id=observation_event_id, + obs_event_field_concept_id=obs_event_field_concept_id, + ) + + def _build_evaluable( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + value = patient.evaluable_for_efficacy_analysis + date = patient.treatment_start_date + if value is None: + return [] + if date is None: + log.warning( + "Skipping evaluable_for_efficacy_analysis for %s: missing treatment_start_date", + patient.patient_id, + ) + return [] + + return [ + self._bool_observation( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Scalars.EVALUABLE_FOR_EFFICACY_ANALYSIS, + ), + person_id=person_id, + field_name=Patient.Scalars.EVALUABLE_FOR_EFFICACY_ANALYSIS, + value=value, + date=date, + observation_type_concept_id=observation_type_concept_id, + ) + ] + + def _build_clinical_benefit( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + """ + Clinical benefit at W16. Uses the dedicated + `clinical_benefit_at_week_16_date` scalar if set, otherwise falls back + to `treatment_start_date + 16 weeks`. + todo: Switch to a ClinicalBenefit singleton when extending to other timepoints. 
+ """ + value = patient.has_clinical_benefit_at_week_16 + if value is None: + return [] + + date = patient.clinical_benefit_at_week_16_date + if date is None: + start = patient.treatment_start_date + if start is None: + log.warning( + "Skipping has_clinical_benefit_at_week_16 for %s: no clinical_benefit_at_week_16_date and no treatment_start_date", + patient.patient_id, + ) + return [] + date = start + _WEEK_16 + + return [ + self._bool_observation( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Scalars.HAS_CLINICAL_BENEFIT_AT_WEEK_16, + ), + person_id=person_id, + field_name=Patient.Scalars.HAS_CLINICAL_BENEFIT_AT_WEEK_16, + value=value, + date=date, + observation_type_concept_id=observation_type_concept_id, + ) + ] + + def _build_eot_reason( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + """ + Shape 1 (unmapped source attribute): observation_concept_id = 0, + observation_source_value = field name, value_as_concept_id = mapped + reason (or 0 if unmapped), value_as_string + value_source_value + preserve the raw reason text. 
+ """ + reason = patient.end_of_treatment_reason + date = patient.end_of_treatment_date + if reason is None: + return [] + if date is None: + log.warning("Skipping end_of_treatment_reason for %s: missing end_of_treatment_date", patient.patient_id) + return [] + + concept = self.concepts.lookup_static("eot_reason", reason) + + return [ + ObservationRow( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Scalars.END_OF_TREATMENT_REASON, + ), + person_id=person_id, + observation_concept_id=0, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=concept.concept_id if concept else 0, + value_as_string=reason[:60], + observation_source_value=Patient.Scalars.END_OF_TREATMENT_REASON, + observation_source_concept_id=0, + value_source_value=reason[:50], + ) + ] + + def _build_lost_to_followup( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + followup = patient.lost_to_followup + if followup is None: + return [] + + value = followup.lost_to_followup + date = followup.date_lost_to_followup + if value is None: + return [] + if date is None: + log.warning("Skipping lost_to_followup for %s: missing date_lost_to_followup", patient.patient_id) + return [] + + concept = self.concepts.lookup_static(FollowUp.Fields.LOST_TO_FOLLOWUP, str(value)) + observation_concept_id = concept.concept_id if concept else 0 + + return [ + self._bool_observation( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Singletons.LOST_TO_FOLLOWUP, + *followup.natural_key(), + ), + person_id=person_id, + field_name=Patient.Singletons.LOST_TO_FOLLOWUP, + value=value, + date=date, + observation_type_concept_id=observation_type_concept_id, + observation_concept_id=observation_concept_id, + ) + ] + + def _build_ae_outcome( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ae: AdverseEvent, + index: int, + ctx: BuildContext, + 
) -> list[ObservationRow]: + """ + Shape 2 or 3: topic concept (structural `adverse_event_outcome`) + + answer concept (static `adverse_event_outcome,`). Both + lookups fall back to 0 when missing — the row is still emitted as + long as outcome and start_date are present, with the raw value + preserved in value_source_value. + """ + raw_outcome = ae.outcome + date = ae.start_date + if raw_outcome is None: + return [] + if date is None: + log.warning("Skipping AE %d outcome for %s: missing start_date", index, patient.patient_id) + return [] + + topic_concept = self.concepts.lookup_structural("adverse_event_outcome") + outcome_concept = self.concepts.lookup_static("adverse_event_outcome", raw_outcome) + + event_id, field_concept_id = self._ae_fk(ae, patient, index, ctx) + + return [ + ObservationRow( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.ADVERSE_EVENTS, + *ae.natural_key(), + AdverseEvent.Fields.OUTCOME, + ), + person_id=person_id, + observation_concept_id=topic_concept.concept_id if topic_concept else 0, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=outcome_concept.concept_id if outcome_concept else 0, + observation_source_value=AdverseEvent.Fields.OUTCOME, + observation_source_concept_id=0, + value_source_value=str(raw_outcome)[:50], + observation_event_id=event_id, + obs_event_field_concept_id=field_concept_id, + ) + ] + + def _build_ae_was_serious( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ae: AdverseEvent, + index: int, + ctx: BuildContext, + ) -> list[ObservationRow]: + was_serious = ae.was_serious + if was_serious is None: + return [] + date = ae.start_date + if date is None: + log.warning("Skipping AE %d was_serious for %s: missing start_date", index, patient.patient_id) + return [] + + event_id, field_concept_id = self._ae_fk(ae, patient, index, ctx) + + return [ + self._bool_observation( + 
observation_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.ADVERSE_EVENTS, + *ae.natural_key(), + AdverseEvent.Fields.WAS_SERIOUS, + ), + person_id=person_id, + field_name=AdverseEvent.Fields.WAS_SERIOUS, + value=was_serious, + date=date, + observation_type_concept_id=observation_type_concept_id, + observation_event_id=event_id, + obs_event_field_concept_id=field_concept_id, + ) + ] + + def _build_ae_turned_serious( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ae: AdverseEvent, + index: int, + ctx: BuildContext, + ) -> list[ObservationRow]: + """ + AE turned-serious flag. Encoded as a Yes observation on + `turned_serious_date`; value_source_value carries the ISO date so + consumers can reconstruct the event without re-querying. + Not using _bool_observation because value_source_value differs + (date string, not "true"). + """ + date = ae.turned_serious_date + if date is None: + return [] + + event_id, field_concept_id = self._ae_fk(ae, patient, index, ctx) + + return [ + ObservationRow( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.ADVERSE_EVENTS, + *ae.natural_key(), + AdverseEvent.Fields.TURNED_SERIOUS_DATE, + ), + person_id=person_id, + observation_concept_id=0, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=self._yes_no_concept_id(True), + observation_source_value=AdverseEvent.Fields.TURNED_SERIOUS_DATE, + observation_source_concept_id=0, + value_source_value=date.isoformat(), + observation_event_id=event_id, + obs_event_field_concept_id=field_concept_id, + ) + ] + + def _ae_fk( + self, + ae: AdverseEvent, + patient: Patient, + index: int, + ctx: BuildContext, + ) -> tuple[int | None, int | None]: + """ + Resolve (observation_event_id, obs_event_field_concept_id) for an + AE-derived observation row. Returns (None, None) when the AE has no + sequence_id or no published condition_occurrence row. 
Raises if the + `cdm_field` static entry for condition_occurrence.condition_occurrence_id + is missing — this is required infrastructure for AE-attributed + observations. + """ + sequence_id = ae.sequence_id + if sequence_id is None: + log.warning( + "AE %d for %s missing sequence_id: cannot link observation to condition_occurrence", + index, + patient.patient_id, + ) + return None, None + + event_id = ctx.condition_id_by_ae_sequence_id.get(sequence_id) + if event_id is None: + log.warning( + "AE %d for %s missing event_id: cannot link observation to condition_occurrence", + index, + patient.patient_id, + ) + return None, None + + field_concept = self.concepts.lookup_static( + "cdm_field", + "condition_occurrence.condition_occurrence_id", + domains={"Metadata"}, + ) + if field_concept is None: + raise RuntimeError("Missing cdm_field mapping for condition_occurrence.condition_occurrence_id") + + return event_id, field_concept.concept_id diff --git a/src/omop_etl/omop/models/rows.py b/src/omop_etl/omop/models/rows.py index a558256..cd3bf8c 100644 --- a/src/omop_etl/omop/models/rows.py +++ b/src/omop_etl/omop/models/rows.py @@ -271,29 +271,3 @@ class ObservationRow: def validate(self): validate_required_fields(self) - - -@pd_dataclass(frozen=True, slots=True) -class DeviceExposureRow: - device_exposure_id: int - person_id: int - device_concept_id: int - device_exposure_start_date: dt.date - device_type_concept_id: int - device_exposure_start_datetime: dt.datetime | None = None - device_exposure_end_date: dt.date | None = None - device_exposure_end_datetime: dt.datetime | None = None - unique_device_id: Annotated[str | None, pd_field(max_length=255)] = None - production_id: Annotated[str | None, pd_field(max_length=255)] = None - quantity: int | None = None - provider_id: int | None = None - visit_occurrence_id: int | None = None - visit_detail_id: int | None = None - device_source_value: Annotated[str | None, pd_field(max_length=255)] = None - 
device_source_concept_id: int | None = None - unit_concept_id: int | None = None - unit_source_value: int | None = None - unit_source_concept_id: int | None = None - - def validate(self): - validate_required_fields(self) diff --git a/src/omop_etl/omop/service.py b/src/omop_etl/omop/service.py index e6b570b..bcb6c09 100644 --- a/src/omop_etl/omop/service.py +++ b/src/omop_etl/omop/service.py @@ -5,6 +5,7 @@ from omop_etl.omop.builders.base import OmopBuilder, BuildContext from omop_etl.omop.builders.condition_occurrence import ConditionOccurrenceBuilder from omop_etl.omop.builders.measurement import MeasurementBuilder +from omop_etl.omop.builders.observation import ObservationBuilder from omop_etl.omop.builders.person import PersonBuilder from omop_etl.omop.builders.observation_period import ObservationPeriodBuilder from omop_etl.omop.builders.cdm_source import CdmSourceBuilder @@ -37,6 +38,7 @@ def __init__(self, concepts: ConceptLookupService): ConditionOccurrenceBuilder(concepts), ProcedureOccurrenceBuilder(concepts), MeasurementBuilder(concepts), + ObservationBuilder(concepts), ] def build(self, patients: Sequence[Patient]) -> OmopTables: diff --git a/src/omop_etl/resources/static_mapped/static_mapping.csv b/src/omop_etl/resources/static_mapped/static_mapping.csv index 41f758b..8930eb5 100644 --- a/src/omop_etl/resources/static_mapped/static_mapping.csv +++ b/src/omop_etl/resources/static_mapped/static_mapping.csv @@ -1,5 +1,6 @@ value_set,local_value,omop_concept_id,omop_concept_code,omop_concept_name,omop_concept_class,omop_standard_concept,omop_validity,omop_domain,omop_vocab lost_to_followup,True,4163894,399307001,Lost to follow-up,Clinical Finding,Standard,Valid,Observation,SNOMED +cdm_field,condition_occurrence.condition_occurrence_id,1147127,CDM183,condition_occurrence.condition_occurrence_id,Field,Standard,Valid,Metadata,CDM sex,f,8532,F,FEMALE,Gender,Standard,Valid,Gender,Gender sex,m,8507,M,MALE,Gender,Standard,Valid,Gender,Gender 
sex,female,8532,F,FEMALE,Gender,Standard,Valid,Gender,Gender @@ -132,15 +133,15 @@ eq5d_q5_answer_code,4,742369,OMOP5181579,EuroQol five dimension five level anxie eq5d_q5_answer_code,5,742370,OMOP5181580,EuroQol five dimension five level anxiety depression score: 5 (I am extremely anxious or depressed),Precoordinated pair,Standard,Valid,Measurement,OMOP Extension eot_reason,Disease progression,1617595,97509-4,Cancer disease progression,Clinical Observation,Standard,Valid,Observation,LOINC eot_reason,Normal completion according to cohort-specific manual,44788181,15501000000100,Completed successfully,Qualifier Value,Standard,Valid,Observation,SNOMED -eot_reason,Other,35821954,100418-5,Other reason,Answer,Standard,Valid,Observation,UK Biobank -eot_reason,Adverse event/Toxicity,441207,62014003,Adverse reaction to drug,Disorder,Standard,Valid,Observation,SNOMED +eot_reason,Other,9177,74964007,Other,Qualifier Value,Standard,Valid,Meas Value,SNOMED +eot_reason,Adverse event/Toxicity,45884383,LA7266-5,Adverse event,Answer,Standard,Valid,Meas Value,LOINC eot_reason,Symptomatic deterioration,4111347,285384003,General health deterioration,Clinical Finding,Standard,Valid,Observation,SNOMED -eot_reason,Patient refusal,45773084,703427001,Refusal of treatment by patient against dental advice,Context-dependent,Standard,Valid,Observation,SNOMED +eot_reason,Patient refusal,45878680,LA4389-8,Refused,Answer,Standard,Valid,Meas Value,LOINC eot_reason,Withdrawn by PI,44810920,871401000000109,Withdrawn from research study,Clinical Finding,Standard,Valid,Observation,SNOMED eot_reason_code,2,1617595,97509-4,Cancer disease progression,Clinical Observation,Standard,Valid,Observation,LOINC eot_reason_code,7,44788181,15501000000100,Completed successfully,Qualifier Value,Standard,Valid,Observation,SNOMED -eot_reason_code,88,35821954,100418-5,Other reason,Answer,Standard,Valid,Observation,UK Biobank -eot_reason_code,1,441207,62014003,Adverse reaction to 
drug,Disorder,Standard,Valid,Observation,SNOMED +eot_reason_code,88,9177,74964007,Other,Qualifier Value,Standard,Valid,Meas Value,SNOMED +eot_reason_code,1,45884383,LA7266-5,Adverse event,Answer,Standard,Valid,Meas Value,LOINC eot_reason_code,6,4111347,285384003,General health deterioration,Clinical Finding,Standard,Valid,Observation,SNOMED -eot_reason_code,3,45773084,703427001,Refusal of treatment by patient against dental advice,Context-dependent,Standard,Valid,Observation,SNOMED +eot_reason_code,3,45878680,LA4389-8,Refused,Answer,Standard,Valid,Meas Value,LOINC eot_reason_code,5,44810920,871401000000109,Withdrawn from research study,Clinical Finding,Standard,Valid,Observation,SNOMED \ No newline at end of file diff --git a/src/omop_etl/resources/static_mapped/structural_mapping.csv b/src/omop_etl/resources/static_mapped/structural_mapping.csv index 1207c80..e44d70b 100644 --- a/src/omop_etl/resources/static_mapped/structural_mapping.csv +++ b/src/omop_etl/resources/static_mapped/structural_mapping.csv @@ -1,4 +1,7 @@ value_set,omop_concept_id,omop_concept_code,omop_concept_name,omop_concept_class,omop_standard_concept,omop_validity,omop_domain,omop_vocab +no,4188540,373067005,No,Qualifier Value,Standard,Valid,Meas Value,SNOMED +yes,4188539,373066001,Yes,Qualifier Value,Standard,Valid,Meas Value,SNOMED +adverse_event_outcome,4231813,405533003,Adverse incident outcome,Clinical Finding,Standard,Valid,Observation,SNOMED ecog,36305384,89247-1,ECOG Performance Status score,Clinical Observation,Standard,Valid,Measurement,LOINC lesion_size,4084390,246116008,Lesion size,Observable Entity,Standard,Valid,Measurement,SNOMED number_of_lesions,4085855,246206008,Number of lesions,Observable Entity,Standard,Valid,Observation,SNOMED diff --git a/tests/omop/builders/test_observation_builder.py b/tests/omop/builders/test_observation_builder.py index e69de29..0bf1ac8 100644 --- a/tests/omop/builders/test_observation_builder.py +++ b/tests/omop/builders/test_observation_builder.py 
@@ -0,0 +1,653 @@ +import datetime as dt +import logging + +import pytest + +from omop_etl.concept_mapping.service import ConceptLookupService +from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent +from omop_etl.harmonization.models.domain.followup import FollowUp +from omop_etl.harmonization.models.patient import Patient +from omop_etl.omop.builders.observation import ObservationBuilder +from omop_etl.omop.core.id_generator import sha1_bigint +from tests.omop.conftest import ( + _static, + _structural, + create_build_context, + create_patient, +) + +PID = "p1" +TRIAL = "test" +PERSON_ID = sha1_bigint("person", PID) + +YES_CID = 4188539 +NO_CID = 4188540 +CDM_FIELD_CID = 1147127 +AE_OUTCOME_TOPIC_CID = 4231813 + + +def _with_yes_no(structural_index: dict) -> dict: + """Yes/No structural Meas Value concepts (OHDSI ETL convention for booleans).""" + structural_index["yes"] = _structural("yes", YES_CID, "meas value") + structural_index["no"] = _structural("no", NO_CID, "meas value") + return structural_index + + +def _with_cdm_field(static_index: dict) -> dict: + """`cdm_field` static entry for AE → condition_occurrence FK linkage.""" + static_index[("cdm_field", "condition_occurrence.condition_occurrence_id")] = _static( + "cdm_field", + "condition_occurrence.condition_occurrence_id", + CDM_FIELD_CID, + "metadata", + ) + return static_index + + +def _with_ae_outcome_topic(structural_index: dict) -> dict: + structural_index["adverse_event_outcome"] = _structural("adverse_event_outcome", AE_OUTCOME_TOPIC_CID, "observation") + return structural_index + + +class TestObservationBuilder: + def test_table_name(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + assert ObservationBuilder(concepts).table_name == "observation" + + def test_empty_patient_returns_empty(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = 
create_patient(PID, TRIAL) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + +class TestEvaluableForEfficacy: + """Shape 1 (unmapped source attribute): concept_id=0, source_value=field + name, value_source_value=lowercase literal.""" + + def test_true_emits_row_with_yes_value(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + evaluable_for_efficacy_analysis=True, + treatment_start_date=dt.date(2023, 1, 10), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 1, 10) + assert row.observation_type_concept_id == 32817 + assert row.observation_source_value == "evaluable_for_efficacy_analysis" + assert row.observation_source_concept_id == 0 + assert row.value_as_concept_id == YES_CID + assert row.value_source_value == "true" + + def test_false_emits_row_with_no_value(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + evaluable_for_efficacy_analysis=False, + treatment_start_date=dt.date(2023, 1, 10), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].value_as_concept_id == NO_CID + assert rows[0].value_source_value == "false" + + def test_yes_no_missing_falls_back_to_zero(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + evaluable_for_efficacy_analysis=True, + treatment_start_date=dt.date(2023, 1, 10), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + 
assert len(rows) == 1 + assert rows[0].value_as_concept_id == 0 + + def test_skipped_when_value_is_none(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, treatment_start_date=dt.date(2023, 1, 10)) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_skipped_when_treatment_start_date_missing(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, evaluable_for_efficacy_analysis=True) + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("treatment_start_date" in rec.message for rec in caplog.records) + + +class TestClinicalBenefit: + """Shape 1, like evaluable. Prefers `clinical_benefit_at_week_16_date` + scalar; falls back to treatment_start + 16w.""" + + def test_uses_clinical_benefit_date_scalar_when_set(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + has_clinical_benefit_at_week_16=True, + clinical_benefit_at_week_16_date=dt.date(2023, 4, 20), + treatment_start_date=dt.date(2023, 1, 1), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 4, 20) + assert row.observation_source_value == "has_clinical_benefit_at_week_16" + assert row.value_as_concept_id == YES_CID + assert row.value_source_value == "true" + + def test_falls_back_to_treatment_start_plus_16_weeks(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + 
patient = create_patient( + PID, + TRIAL, + has_clinical_benefit_at_week_16=True, + treatment_start_date=dt.date(2023, 1, 1), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].observation_date == dt.date(2023, 1, 1) + dt.timedelta(weeks=16) + + def test_skipped_when_value_is_none(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, treatment_start_date=dt.date(2023, 1, 1)) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_skipped_when_no_date_available(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, has_clinical_benefit_at_week_16=True) + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("clinical_benefit_at_week_16_date" in rec.message for rec in caplog.records) + + +class TestEndOfTreatmentReason: + """Shape 1: concept_id=0, field name in source_value, mapped reason concept + in value_as_concept_id (or 0 if unmapped), raw reason preserved in both + value_as_string and value_source_value.""" + + def test_mapped_reason_emits_row(self, static_index, structural_index): + static_index[("eot_reason", "disease progression")] = _static("eot_reason", "disease progression", 1617595, "observation") + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + end_of_treatment_reason="Disease progression", + end_of_treatment_date=dt.date(2023, 8, 1), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 
8, 1) + assert row.observation_source_value == "end_of_treatment_reason" + assert row.observation_source_concept_id == 0 + assert row.value_as_concept_id == 1617595 + assert row.value_as_string == "Disease progression" + assert row.value_source_value == "Disease progression" + + def test_unmapped_reason_emits_row_with_value_concept_zero(self, static_index, structural_index): + """No static mapping → row still emits, value_as_concept_id=0, raw + reason preserved in value_as_string + value_source_value.""" + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + end_of_treatment_reason="Some new reason not in mapping", + end_of_treatment_date=dt.date(2023, 8, 1), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.value_as_concept_id == 0 + assert row.value_as_string == "Some new reason not in mapping" + assert row.value_source_value == "Some new reason not in mapping"[:50] + assert row.observation_source_value == "end_of_treatment_reason" + + def test_skipped_without_eot_date(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, end_of_treatment_reason="Disease progression") + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("end_of_treatment_date" in rec.message for rec in caplog.records) + + +class TestLostToFollowup: + """Shape 2 (mapped topic): concept_id=Lost-to-follow-up, + source_value=field name, value_source_value=lowercase literal.""" + + def test_lost_to_followup_true_emits_row(self, static_index, structural_index): + _with_yes_no(structural_index) + static_index[("lost_to_followup", "true")] = _static("lost_to_followup", "true", 4163894, "observation") + 
concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = True + followup.date_lost_to_followup = dt.date(2023, 12, 1) + patient.lost_to_followup = followup + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 4163894 + assert row.observation_date == dt.date(2023, 12, 1) + assert row.value_as_concept_id == YES_CID + assert row.observation_source_value == "lost_to_followup" + assert row.observation_source_concept_id == 0 + assert row.value_source_value == "true" + + def test_lost_to_followup_false_emits_row_with_no_value(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = False + followup.date_lost_to_followup = dt.date(2023, 12, 1) + patient.lost_to_followup = followup + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].observation_concept_id == 0 + assert rows[0].value_as_concept_id == NO_CID + assert rows[0].observation_source_value == "lost_to_followup" + assert rows[0].value_source_value == "false" + + def test_singleton_absent_returns_empty(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_missing_date_skips(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = True + patient.lost_to_followup 
= followup + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("date_lost_to_followup" in rec.message for rec in caplog.records) + + +class TestAdverseEventOutcome: + """Shape 2/3: topic concept (structural `adverse_event_outcome`) + + answer concept (static `adverse_event_outcome,`). Either lookup + can miss and the row still emits with concept_id=0 fallback, as long + as outcome and start_date are present. FK-linked.""" + + def _make_patient(self, outcome: str | None) -> Patient: + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.outcome = outcome + ae.sequence_id = 1 + patient.adverse_events = [ae] + return patient + + def test_mapped_outcome_emits_row(self, static_index, structural_index): + _with_ae_outcome_topic(structural_index) + _with_cdm_field(static_index) + static_index[("adverse_event_outcome", "recovering/resolving")] = _static("adverse_event_outcome", "recovering/resolving", 1074213, "observation") + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Recovering/resolving") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == AE_OUTCOME_TOPIC_CID + assert row.observation_date == dt.date(2023, 5, 1) + assert row.value_as_concept_id == 1074213 + assert row.observation_source_value == "outcome" + assert row.observation_source_concept_id == 0 + assert row.value_source_value == "Recovering/resolving" + assert row.observation_event_id == 999 + assert row.obs_event_field_concept_id == CDM_FIELD_CID + + def test_topic_structural_missing_falls_back_to_zero(self, static_index, structural_index): + """No topic structural → concept_id=0 but row still 
emits with mapped + value and raw outcome preserved in value_source_value.""" + _with_cdm_field(static_index) + static_index[("adverse_event_outcome", "recovering/resolving")] = _static("adverse_event_outcome", "recovering/resolving", 1074213, "observation") + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Recovering/resolving") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.value_as_concept_id == 1074213 + assert row.value_source_value == "Recovering/resolving" + + def test_value_static_missing_falls_back_to_zero(self, static_index, structural_index): + """No static mapping for the outcome text → value_as_concept_id=0, + row still emits with topic concept and raw outcome preserved.""" + _with_ae_outcome_topic(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Some unmapped outcome") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == AE_OUTCOME_TOPIC_CID + assert row.value_as_concept_id == 0 + assert row.value_source_value == "Some unmapped outcome" + + def test_both_lookups_missing_emits_zero_row_with_raw_value(self, static_index, structural_index): + """Worst case: no mappings at all, row still emits with both + concept ids 0 and value_source_value preserving the raw text.""" + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Some outcome") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = 
ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.value_as_concept_id == 0 + assert row.value_source_value == "Some outcome" + assert row.observation_source_value == "outcome" + + def test_outcome_none_emits_nothing(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(None) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + +class TestAdverseEventWasSerious: + """Shape 3: concept_id=0 + FK linkage. Emits for both True and False + (records the assessment either way).""" + + def _make_patient(self, was_serious: bool | None) -> Patient: + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.sequence_id = 42 + ae.was_serious = was_serious + patient.adverse_events = [ae] + return patient + + def test_was_serious_true_emits_row_with_fk(self, static_index, structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(True) + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[42] = 123456789 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 5, 1) + assert row.value_as_concept_id == YES_CID + assert row.observation_source_value == "was_serious" + assert row.observation_source_concept_id == 0 + assert row.value_source_value == "true" + assert row.observation_event_id == 123456789 + assert row.obs_event_field_concept_id == CDM_FIELD_CID + + def test_was_serious_false_emits_row_with_no_value(self, static_index, structural_index): + _with_yes_no(structural_index) + 
_with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(False) + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[42] = 123456789 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + assert rows[0].value_as_concept_id == NO_CID + assert rows[0].value_source_value == "false" + + def test_was_serious_none_emits_nothing(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(None) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_no_fk_when_no_condition_row_published(self, static_index, structural_index, caplog): + """AE with sequence_id but no published condition_occurrence row: + observation still emits, FK fields left blank, warning logged.""" + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(True) + ctx = create_build_context(patient, PERSON_ID) + # condition_id_by_ae_sequence_id stays empty + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + assert rows[0].observation_event_id is None + assert rows[0].obs_event_field_concept_id is None + assert any("missing event_id" in rec.message for rec in caplog.records) + + def test_no_fk_when_ae_missing_sequence_id(self, static_index, structural_index, caplog): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.was_serious = True + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + with caplog.at_level(logging.WARNING): + 
rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + assert rows[0].observation_event_id is None + assert rows[0].obs_event_field_concept_id is None + assert any("missing sequence_id" in rec.message for rec in caplog.records) + + def test_raises_when_cdm_field_missing_but_fk_resolvable(self, static_index, structural_index): + """cdm_field is required infrastructure: builder raises rather than + emit a partially-linked row when the static entry is missing.""" + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(True) + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[42] = 987654321 + + with pytest.raises(RuntimeError, match="cdm_field"): + ObservationBuilder(concepts).build(ctx) + + +class TestAdverseEventTurnedSerious: + def test_emits_row_on_turned_serious_date(self, static_index, structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.sequence_id = 7 + ae.turned_serious_date = dt.date(2023, 5, 5) + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[7] = 555 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 5, 5) + assert row.value_as_concept_id == YES_CID + assert row.observation_source_value == "turned_serious_date" + assert row.value_source_value == "2023-05-05" + assert row.observation_event_id == 555 + assert row.obs_event_field_concept_id == CDM_FIELD_CID + + def test_skipped_when_turned_serious_date_unset(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, 
structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.sequence_id = 7 + patient.adverse_events = [ae] + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + +class TestCombinedSources: + def test_multi_source_uniqueness_and_determinism(self, static_index, structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + _with_ae_outcome_topic(structural_index) + static_index[("eot_reason", "other")] = _static("eot_reason", "other", 35821954, "observation") + static_index[("lost_to_followup", "true")] = _static("lost_to_followup", "true", 4163894, "observation") + static_index[("adverse_event_outcome", "fatal")] = _static("adverse_event_outcome", "fatal", 4236718, "observation") + + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + treatment_start_date=dt.date(2023, 1, 10), + evaluable_for_efficacy_analysis=True, + has_clinical_benefit_at_week_16=False, + end_of_treatment_reason="Other", + end_of_treatment_date=dt.date(2023, 8, 1), + ) + + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = True + followup.date_lost_to_followup = dt.date(2023, 9, 1) + patient.lost_to_followup = followup + + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.outcome = "Fatal" + ae.was_serious = True + ae.turned_serious_date = dt.date(2023, 5, 5) + ae.sequence_id = 11 + patient.adverse_events = [ae] + + ctx_a = create_build_context(patient, PERSON_ID) + ctx_a.condition_id_by_ae_sequence_id[11] = 42 + ctx_b = create_build_context(patient, PERSON_ID) + ctx_b.condition_id_by_ae_sequence_id[11] = 42 + + rows_a = ObservationBuilder(concepts).build(ctx_a) + rows_b = ObservationBuilder(concepts).build(ctx_b) + + # 4 scalars/singleton (evaluable + clinical_benefit + eot + lost_to_followup) + # + 3 
AE-derived (outcome, was_serious, turned_serious) = 7 rows + assert len(rows_a) == 7 + ids = [r.observation_id for r in rows_a] + assert len(ids) == len(set(ids)), "All observation_ids must be unique" + + ids_b = sorted(r.observation_id for r in rows_b) + assert sorted(ids) == ids_b + + def test_multiple_adverse_events_each_independent(self, static_index, structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + + ae1 = AdverseEvent(patient_id=PID) + ae1.term = "Fever" + ae1.start_date = dt.date(2023, 5, 1) + ae1.sequence_id = 1 + ae1.was_serious = True + + ae2 = AdverseEvent(patient_id=PID) + ae2.term = "Nausea" + ae2.start_date = dt.date(2023, 6, 1) + ae2.sequence_id = 2 + ae2.was_serious = True + + patient.adverse_events = [ae1, ae2] + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 100 + ctx.condition_id_by_ae_sequence_id[2] = 200 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 2 + by_event_id = {r.observation_event_id: r for r in rows} + assert set(by_event_id.keys()) == {100, 200} + assert by_event_id[100].observation_date == dt.date(2023, 5, 1) + assert by_event_id[200].observation_date == dt.date(2023, 6, 1) From f7488dec17d4fbe27f76b902afdf4ec9f2a13c80 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Wed, 20 May 2026 14:37:45 +0200 Subject: [PATCH 18/23] docstrings --- src/omop_etl/omop/builders/observation.py | 111 +++++++++++++--------- 1 file changed, 67 insertions(+), 44 deletions(-) diff --git a/src/omop_etl/omop/builders/observation.py b/src/omop_etl/omop/builders/observation.py index bea3953..6a39d88 100644 --- a/src/omop_etl/omop/builders/observation.py +++ b/src/omop_etl/omop/builders/observation.py @@ -11,52 +11,57 @@ log = getLogger(__name__) -_WEEK_16 = dt.timedelta(weeks=16) - class 
ObservationBuilder(OmopBuilder[ObservationRow]): """ Builds observation rows from patient scalars, the lost-to-followup singleton, and adverse-event-derived facts (outcome, was_serious, turned_serious_date). - All observation_concept_id domains must NOT be Condition, Procedure, Drug, + All observation_concept_id domains must not be in Condition, Procedure, Drug, Specimen, Measurement, or Device. - Three row shapes: - - 1. Unmapped source attribute (no topic concept available for the field): - observation_concept_id = 0, observation_source_value = source field name, - value_as_concept_id / value_as_string / value_source_value carry the - normalized + raw source value. Used for evaluable_for_efficacy_analysis, - has_clinical_benefit_at_week_16, end_of_treatment_reason. - - 2. Mapped observation topic (a Standard concept names the topic): - observation_concept_id = topic concept, observation_source_value = - source field name, value_as_concept_id carries the answer/result. - Used for lost_to_followup ("Lost to follow-up" topic). - - 3. AE-derived (shapes 1 or 2, plus FK linkage): - observation_event_id + obs_event_field_concept_id link back to the - condition_occurrence row produced by ConditionOccurrenceBuilder via - BuildContext.condition_id_by_ae_sequence_id. Used for AE outcome, - AE was_serious, AE turned_serious_date. - - Emit policy: a row is only skipped when the source value or a required - date is missing. When a concept lookup misses (topic OR value), the row - is still emitted with concept_id=0 — CDM convention for "result present - in source but unmapped" — and the raw literal is preserved in - value_source_value / observation_source_value so the fact stays - queryable. Yes/No is resolved via the `yes` / `no` structural Meas - Value concepts. + + There are three patterns used: + + 1. For evaluable_for_efficacy_analysis, has_clinical_benfit_at_week_* + and end_of_treatment_reason there is no observation_concept_id, + it's set to 0. 
The source field name is tracked in observation_source_value, + and value_as_concept_id, value_as_string and value_source_value has + the raw and normalized source values. + + 2. For lost_to_followup the observation_concept_id is mapped, + observation_source_value has the field name, and value_as_concept_id has + the result (answer). + + 3. For AE-derived fields, AE outcome, AE was_serious and AE turned_serious_date, + the same occurs as the first two patterns, but they are linked back to the + source AE record from ConditionOccurrenceBuilder, + using FKs stored in observation_event_id and obs_event_field_concept_id, + produced by BuildContext.condition_id_by_ae_sequence_id. + + A row is only skipped when the source value or a required date is missing. + When a concept lookup misses, the row is still emitted with concept_id=0, + and the raw literal is stored in value_source_value or observation_source_value. """ table_name: ClassVar[str] = "observation" + week_16: ClassVar[dt.timedelta] = dt.timedelta(weeks=16) def build(self, ctx: BuildContext) -> list[ObservationRow]: + """ + Emit observation rows for the patient. Order: scalar attributes + (evaluable, clinical_benefit, eot_reason), the lost_to_followup + singleton, then per-AE rows (outcome, was_serious, turned_serious_date). + observation_type_concept_id is the ecrf Type Concept, raises if the + structural entry is missing. 
+ """ patient = ctx.patient person_id = ctx.person_id ecrf = self.concepts.lookup_structural("ecrf", domains={"Type Concept"}) - observation_type_concept_id = ecrf.concept_id if ecrf else 0 + if ecrf is None: + raise RuntimeError("Missing ecrf concept in structural mapping") + + observation_type_concept_id = ecrf.concept_id rows: list[ObservationRow] = [] rows.extend(self._build_evaluable(patient, person_id, observation_type_concept_id)) @@ -73,7 +78,7 @@ def build(self, ctx: BuildContext) -> list[ObservationRow]: def _yes_no_concept_id(self, value: bool) -> int: """ - Resolve True to Yes / False to No via the structural Meas Value + Resolve True to Yes and False to No via the structural Meas Value concepts. Returns 0 when the mapping is missing. """ concept = self.concepts.lookup_structural("yes" if value else "no", domains={OmopDomain.MEAS_VALUE}) @@ -93,10 +98,10 @@ def _bool_observation( obs_event_field_concept_id: int | None = None, ) -> ObservationRow: """ - Compose a boolean observation row. Standardizes the source/value + Compose a boolean observation row. Standardizes the source-value encoding for all boolean fields (evaluable, clinical_benefit, lost_to_followup, AE was_serious) so the columns can't drift - between sites. + between callsites. """ return ObservationRow( observation_id=observation_id, @@ -118,6 +123,12 @@ def _build_evaluable( person_id: int, observation_type_concept_id: int, ) -> list[ObservationRow]: + """ + Unmapped source attribute: observation_concept_id = 0, + observation_source_value = field name, value_as_concept_id = Yes/No. + Dated to treatment_start_date (no clearer event date exists; the + evaluability decision is informed by treatment activity since start). + """ value = patient.evaluable_for_efficacy_analysis date = patient.treatment_start_date if value is None: @@ -153,7 +164,7 @@ def _build_clinical_benefit( Clinical benefit at W16. 
Uses the dedicated `clinical_benefit_at_week_16_date` scalar if set, otherwise falls back to `treatment_start_date + 16 weeks`. - todo: Switch to a ClinicalBenefit singleton when extending to other timepoints. + todo: Switch to a ClinicalBenefit singleton, fallback to w16 date if missing date from singleton """ value = patient.has_clinical_benefit_at_week_16 if value is None: @@ -168,7 +179,7 @@ def _build_clinical_benefit( patient.patient_id, ) return [] - date = start + _WEEK_16 + date = start + self.week_16 return [ self._bool_observation( @@ -191,9 +202,9 @@ def _build_eot_reason( observation_type_concept_id: int, ) -> list[ObservationRow]: """ - Shape 1 (unmapped source attribute): observation_concept_id = 0, + Unmapped source attribute: observation_concept_id = 0, observation_source_value = field name, value_as_concept_id = mapped - reason (or 0 if unmapped), value_as_string + value_source_value + reason (or 0 if unmapped), value_as_string and value_source_value preserve the raw reason text. """ reason = patient.end_of_treatment_reason @@ -230,6 +241,12 @@ def _build_lost_to_followup( person_id: int, observation_type_concept_id: int, ) -> list[ObservationRow]: + """ + observation_concept_id is the "Lost to follow-up" concept from the + lost_to_followup static value set, falls back to 0 when the mapping is missing. + value_as_concept_id is the Yes/No concept, observation_source_value is the field name and + value_source_value carries the boolean literal. Date is date_lost_to_followup. + """ followup = patient.lost_to_followup if followup is None: return [] @@ -271,11 +288,11 @@ def _build_ae_outcome( ctx: BuildContext, ) -> list[ObservationRow]: """ - Shape 2 or 3: topic concept (structural `adverse_event_outcome`) + - answer concept (static `adverse_event_outcome,`). 
Both - lookups fall back to 0 when missing — the row is still emitted as + Topic concept is the structural lookup for adverse_event_outcome, + answer concept is the static lookup for adverse_event_outcome values. + Both lookups fall back to 0 when missing and the row is still emitted as long as outcome and start_date are present, with the raw value - preserved in value_source_value. + preserved in value_source_value, linked to Condition AE record. """ raw_outcome = ae.outcome date = ae.start_date @@ -320,6 +337,13 @@ def _build_ae_was_serious( index: int, ctx: BuildContext, ) -> list[ObservationRow]: + """ + Unmapped source attribute and AE FK: observation_concept_id = 0, + observation_source_value = "was_serious", value_as_concept_id = Yes/No concept, + observation_event_id and obs_event_field_concept_id point at the + AE's condition_occurrence row. Emits for both True and False so the + explicit assessment is preserved. Date is AE.start_date. + """ was_serious = ae.was_serious if was_serious is None: return [] @@ -359,7 +383,7 @@ def _build_ae_turned_serious( ) -> list[ObservationRow]: """ AE turned-serious flag. Encoded as a Yes observation on - `turned_serious_date`; value_source_value carries the ISO date so + turned_serious_date, value_source_value carries the ISO date so consumers can reconstruct the event without re-querying. Not using _bool_observation because value_source_value differs (date string, not "true"). @@ -403,8 +427,7 @@ def _ae_fk( AE-derived observation row. Returns (None, None) when the AE has no sequence_id or no published condition_occurrence row. Raises if the `cdm_field` static entry for condition_occurrence.condition_occurrence_id - is missing — this is required infrastructure for AE-attributed - observations. + is missing, this is required for AE-attributed observations. 
""" sequence_id = ae.sequence_id if sequence_id is None: From 5acf958fc73ec489273707e591fbd54b3aa82a4b Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Wed, 20 May 2026 15:11:52 +0200 Subject: [PATCH 19/23] refactor: clinical benefit from scalar to singleton domain model --- .../harmonization/harmonizers/impress.py | 49 +++++++++--- .../models/domain/clinical_benefit.py | 71 ++++++++++++++++ src/omop_etl/harmonization/models/patient.py | 47 +++++------ tests/harmonization/conftest.py | 10 ++- .../models/test_schema_validation.py | 2 + .../omop/builders/test_observation_builder.py | 80 ++++++++++++------- 6 files changed, 187 insertions(+), 72 deletions(-) create mode 100644 src/omop_etl/harmonization/models/domain/clinical_benefit.py diff --git a/src/omop_etl/harmonization/harmonizers/impress.py b/src/omop_etl/harmonization/harmonizers/impress.py index 15c2cc5..cc78fa6 100644 --- a/src/omop_etl/harmonization/harmonizers/impress.py +++ b/src/omop_etl/harmonization/harmonizers/impress.py @@ -8,6 +8,7 @@ from omop_etl.harmonization.models.domain.best_overall_response import BestOverallResponse from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from omop_etl.harmonization.models.domain.c30 import C30 +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.ecog_baseline import EcogBaseline from omop_etl.harmonization.models.domain.eq5d import EQ5D @@ -169,18 +170,20 @@ def _process_number_of_serious_adverse_events(self) -> pl.DataFrame | None: ) return sae_counts - @scalar() - def _process_has_clinical_benefit_at_week_16(self) -> pl.DataFrame | None: + @singleton(ClinicalBenefit) + def _process_clinical_benefit(self) -> pl.DataFrame: """ - Clinical benefit at W16 (visit 3). 
- Note: If patient has iRecist *and* Recist at same assessment, - iRecist evaluation takes precedence as it's a more specific assessment. + Clinical benefit at W16 at visit 3. + Priority for the answer and its date: iRecist (RA_RAiMODCD) > Recist + (RA_RATIMRESCD) > RNRSP_RNRSPCLCD. iRecist and Recist both date from + RA_EventDate, RNRSP uses RNRSP_EventDate. When no source registers a + benefit, the row is False and the date falls back to whichever V03 + date is available (coalesce RA_EventDate, RNRSP_EventDate). Collapsed + to one row per SubjectId. """ - colname = Patient.Scalars.HAS_CLINICAL_BENEFIT_AT_WEEK_16 + cols = ClinicalBenefit.Fields timepoint = "V03" - # todo: consider emitting date of the benefit record - benefit = ( self.data.select( "SubjectId", @@ -195,15 +198,37 @@ def _process_has_clinical_benefit_at_week_16(self) -> pl.DataFrame | None: .filter(pl.any_horizontal(pl.all().exclude("SubjectId").is_not_null())) .filter((pl.col("RA_EventId") == timepoint) | (pl.col("RNRSP_EventId") == timepoint)) .with_columns( - pl.when(PolarsParsers.to_optional_int64(pl.col("RA_RATIMRESCD")).le(3)) + row_has_benefit=pl.when(PolarsParsers.to_optional_int64(pl.col("RA_RAiMODCD")).le(3)) .then(True) - .when(PolarsParsers.to_optional_int64(pl.col("RA_RAiMODCD")).le(3)) + .when(PolarsParsers.to_optional_int64(pl.col("RA_RATIMRESCD")).le(3)) .then(True) .when(PolarsParsers.to_optional_int64(pl.col("RNRSP_RNRSPCLCD")).le(3)) .then(True) - .otherwise(False) - .alias(colname) + .otherwise(False), + row_date=pl.when(PolarsParsers.to_optional_int64(pl.col("RA_RAiMODCD")).le(3)) + .then(PolarsParsers.to_optional_date("RA_EventDate")) + .when(PolarsParsers.to_optional_int64(pl.col("RA_RATIMRESCD")).le(3)) + .then(PolarsParsers.to_optional_date("RA_EventDate")) + .when(PolarsParsers.to_optional_int64(pl.col("RNRSP_RNRSPCLCD")).le(3)) + .then(PolarsParsers.to_optional_date("RNRSP_EventDate")) + .otherwise( + pl.coalesce( + PolarsParsers.to_optional_date("RA_EventDate"), + 
PolarsParsers.to_optional_date("RNRSP_EventDate"), + ) + ), + ) + .group_by("SubjectId") + .agg( + pl.col("row_has_benefit").any().alias(cols.HAS_BENEFIT), + pl.col("row_date").filter(pl.col("row_has_benefit")).first().alias("date_from_benefit"), + pl.col("row_date").first().alias("date_fallback"), + ) + .with_columns( + pl.coalesce("date_from_benefit", "date_fallback").alias(cols.DATE), + pl.lit(16, dtype=pl.Int64).alias(cols.WEEK), ) + .select("SubjectId", cols.WEEK, cols.HAS_BENEFIT, cols.DATE) ) return benefit diff --git a/src/omop_etl/harmonization/models/domain/clinical_benefit.py b/src/omop_etl/harmonization/models/domain/clinical_benefit.py new file mode 100644 index 0000000..be87f7d --- /dev/null +++ b/src/omop_etl/harmonization/models/domain/clinical_benefit.py @@ -0,0 +1,71 @@ +from typing import Set +import datetime as dt + +from omop_etl.harmonization.core.validators import StrictValidators +from omop_etl.harmonization.models.domain.base import DomainBase + + +class ClinicalBenefit(DomainBase): + """ + Clinical benefit assessment at a source-specific timepoint. Each patient + has at most one instance, the timepoint varies by trial (e.g. IMPRESS uses W16, + other sources may use W24). week is recorded explicitly so downstream + consumers can filter by timepoint without joining trial metadata. 
+ """ + + class Fields: + WEEK = "week" + HAS_BENEFIT = "has_benefit" + DATE = "date" + + def __init__(self, patient_id: str): + self._patient_id = patient_id + self._week: int | None = None + self._has_benefit: bool | None = None + self._date: dt.date | None = None + self.updated_fields: Set[str] = set() + + NATURAL_KEY_FIELDS = (Fields.DATE,) + + @property + def patient_id(self) -> str: + return self._patient_id + + @property + def week(self) -> int | None: + return self._week + + @week.setter + def week(self, value: int | None) -> None: + self._set_validated_prop( + prop=self.__class__.week, + value=value, + validator=StrictValidators.validate_optional_int, + ) + + @property + def has_benefit(self) -> bool | None: + return self._has_benefit + + @has_benefit.setter + def has_benefit(self, value: bool | None) -> None: + self._set_validated_prop( + prop=self.__class__.has_benefit, + value=value, + validator=StrictValidators.validate_optional_bool, + ) + + @property + def date(self) -> dt.date | None: + return self._date + + @date.setter + def date(self, value: dt.date | None) -> None: + self._set_validated_prop( + prop=self.__class__.date, + value=value, + validator=StrictValidators.validate_optional_date, + ) + + def __repr__(self, delim=","): + return f"{self.__class__.__name__}(week={self.week!r}{delim}has_benefit={self.has_benefit!r}{delim} date={self.date!r})" diff --git a/src/omop_etl/harmonization/models/patient.py b/src/omop_etl/harmonization/models/patient.py index 9df124b..967a4e5 100644 --- a/src/omop_etl/harmonization/models/patient.py +++ b/src/omop_etl/harmonization/models/patient.py @@ -10,6 +10,7 @@ from omop_etl.harmonization.models.domain.best_overall_response import BestOverallResponse from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from omop_etl.harmonization.models.domain.c30 import C30 +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit from 
omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.ecog_baseline import EcogBaseline from omop_etl.harmonization.models.domain.eq5d import EQ5D @@ -45,8 +46,6 @@ class Scalars: HAS_ANY_ADVERSE_EVENTS = "has_any_adverse_events" NUMBER_OF_ADVERSE_EVENTS = "number_of_adverse_events" NUMBER_OF_SERIOUS_ADVERSE_EVENTS = "number_of_serious_adverse_events" - HAS_CLINICAL_BENEFIT_AT_WEEK_16 = "has_clinical_benefit_at_week_16" - CLINICAL_BENEFIT_AT_WEEK_16_DATE = "clinical_benefit_at_week_16_date" END_OF_TREATMENT_REASON = "end_of_treatment_reason" END_OF_TREATMENT_DATE = "end_of_treatment_date" @@ -54,6 +53,7 @@ class Singletons: TUMOR_TYPE = "tumor_type" STUDY_DRUGS = "study_drugs" BIOMARKERS = "biomarkers" + CLINICAL_BENEFIT = "clinical_benefit" LOST_TO_FOLLOWUP = "lost_to_followup" ECOG_BASELINE = "ecog_baseline" TUMOR_ASSESSMENT_BASELINE = "tumor_assessment_baseline" @@ -86,8 +86,6 @@ def __init__(self, patient_id: str, trial_id: str): self._has_any_adverse_events: bool | None = None self._number_of_adverse_events: int | None = None self._number_of_serious_adverse_events: int | None = None - self._has_clinical_benefit_at_week_16: bool | None = None - self._clinical_benefit_at_week_16_date: dt.date | None = None self._end_of_treatment_reason: str | None = None self._end_of_treatment_date: dt.date | None = None @@ -95,6 +93,7 @@ def __init__(self, patient_id: str, trial_id: str): self._tumor_type: TumorType | None = None self._study_drugs: StudyDrugs | None = None self._biomarkers: Biomarkers | None = None + self._clinical_benefit: ClinicalBenefit | None = None self._lost_to_followup: FollowUp | None = None self._ecog_baseline: EcogBaseline | None = None self._tumor_assessment_baseline: TumorAssessmentBaseline | None = None @@ -267,30 +266,6 @@ def number_of_serious_adverse_events(self, value: int | None) -> None: validator=StrictValidators.validate_optional_int, ) - @property - def 
has_clinical_benefit_at_week_16(self) -> bool | None: - return self._has_clinical_benefit_at_week_16 - - @has_clinical_benefit_at_week_16.setter - def has_clinical_benefit_at_week_16(self, value: bool | None) -> None: - self._set_validated_prop( - prop=self.__class__.has_clinical_benefit_at_week_16, - value=value, - validator=StrictValidators.validate_optional_bool, - ) - - @property - def clinical_benefit_at_week_16_date(self) -> dt.date | None: - return self._clinical_benefit_at_week_16_date - - @clinical_benefit_at_week_16_date.setter - def clinical_benefit_at_week_16_date(self, value: dt.date | None) -> None: - self._set_validated_prop( - prop=self.__class__.clinical_benefit_at_week_16_date, - value=value, - validator=StrictValidators.validate_optional_date, - ) - @property def end_of_treatment_reason(self) -> str | None: return self._end_of_treatment_reason @@ -358,6 +333,20 @@ def biomarkers(self, value: Biomarkers | None) -> None: ) self.updated_fields.add(Biomarkers.__name__) + @property + def clinical_benefit(self) -> ClinicalBenefit | None: + return self._clinical_benefit + + @clinical_benefit.setter + def clinical_benefit(self, value: ClinicalBenefit | None) -> None: + self._clinical_benefit = self.validate_singleton( + value, + item_type=ClinicalBenefit, + patient_id=self._patient_id, + field_name=setter_name(self.__class__.clinical_benefit), + ) + self.updated_fields.add(ClinicalBenefit.__name__) + @property def lost_to_followup(self) -> FollowUp | None: return self._lost_to_followup @@ -670,7 +659,6 @@ def __repr__(self): f"number_of_serious_adverse_events={self.number_of_serious_adverse_events}{delim} " f"evaluable_for_efficacy_analysis={self.evaluable_for_efficacy_analysis}{delim} " f"treatment_start_date={self.treatment_start_date}{delim} " - f"has_clinical_benefit_at_week16={self.has_clinical_benefit_at_week_16}{delim} " f"end_of_treatment_reason={self.end_of_treatment_reason}{delim} " f"end_of_treatment_date={self.end_of_treatment_date}{delim} " 
# singletons @@ -678,6 +666,7 @@ def __repr__(self): f"tumor_assessment_baseline={self.tumor_assessment_baseline}{delim} " f"biomarkers={self.biomarkers}{delim} " f"ecog={self.ecog_baseline}{delim} " + f"clinical_benefit={self.clinical_benefit}{delim} " f"lost_to_followup={self.lost_to_followup}{delim} " f"best_overall_response={self.best_overall_response}{delim} " # collections diff --git a/tests/harmonization/conftest.py b/tests/harmonization/conftest.py index 946da2d..d8308a0 100644 --- a/tests/harmonization/conftest.py +++ b/tests/harmonization/conftest.py @@ -1706,42 +1706,50 @@ class ClinicalBenefitRow: RNRSP_RNRSPCLCD: int | None = None RNRSP_EventId: str | None = None RA_EventId: str | None = None + RA_EventDate: str | None = None + RNRSP_EventDate: str | None = None @pytest.fixture -def has_clinical_benefit_at_week_16_fixture() -> pl.DataFrame: +def clinical_benefit_fixture() -> pl.DataFrame: rows: List[ClinicalBenefitRow] = [ ClinicalBenefitRow( "recist_le3", RA_RATIMRESCD=3, RA_EventId="V03", + RA_EventDate="2023-04-01", ), ClinicalBenefitRow( "recist_gt3", RA_RATIMRESCD=4, RA_EventId="V03", + RA_EventDate="2023-04-02", ), ClinicalBenefitRow( "irecist_le3", RA_RAiMODCD=2, RA_EventId="V03", + RA_EventDate="2023-04-03", ), ClinicalBenefitRow( "rano_le3", RNRSP_RNRSPCLCD=3, RNRSP_EventId="V03", + RNRSP_EventDate="2023-04-04", ), ClinicalBenefitRow( "both_present", RA_RATIMRESCD=4, RA_RAiMODCD=3, RA_EventId="V03", + RA_EventDate="2023-04-05", ), ClinicalBenefitRow("v03_no_codes", RA_EventId="V03"), ClinicalBenefitRow( "not_v03", RA_RATIMRESCD=2, RA_EventId="V02", + RA_EventDate="2023-04-06", ), ] diff --git a/tests/harmonization/models/test_schema_validation.py b/tests/harmonization/models/test_schema_validation.py index 5567c04..92f90ed 100644 --- a/tests/harmonization/models/test_schema_validation.py +++ b/tests/harmonization/models/test_schema_validation.py @@ -9,6 +9,7 @@ from omop_etl.harmonization.models.domain.best_overall_response import 
BestOverallResponse from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from omop_etl.harmonization.models.domain.c30 import C30 +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.ecog_baseline import EcogBaseline from omop_etl.harmonization.models.domain.eq5d import EQ5D @@ -71,6 +72,7 @@ def test_no_extra_constants(self): BestOverallResponse, Biomarkers, C30, + ClinicalBenefit, ConcomitantMedication, EcogBaseline, EQ5D, diff --git a/tests/omop/builders/test_observation_builder.py b/tests/omop/builders/test_observation_builder.py index 0bf1ac8..21ccdfc 100644 --- a/tests/omop/builders/test_observation_builder.py +++ b/tests/omop/builders/test_observation_builder.py @@ -1,6 +1,5 @@ import datetime as dt import logging - import pytest from omop_etl.concept_mapping.service import ConceptLookupService @@ -34,7 +33,7 @@ def _with_yes_no(structural_index: dict) -> dict: def _with_cdm_field(static_index: dict) -> dict: - """`cdm_field` static entry for AE → condition_occurrence FK linkage.""" + """cdm_field static entry for AE: condition_occurrence FK linkage.""" static_index[("cdm_field", "condition_occurrence.condition_occurrence_id")] = _static( "cdm_field", "condition_occurrence.condition_occurrence_id", @@ -64,8 +63,10 @@ def test_empty_patient_returns_empty(self, static_index, structural_index): class TestEvaluableForEfficacy: - """Shape 1 (unmapped source attribute): concept_id=0, source_value=field - name, value_source_value=lowercase literal.""" + """ + Pattern 1 (unmapped source attribute): + concept_id=0, source_value=field name, value_source_value=lowercase literal. 
+ """ def test_true_emits_row_with_yes_value(self, static_index, structural_index): _with_yes_no(structural_index) @@ -139,8 +140,10 @@ def test_skipped_when_treatment_start_date_missing(self, static_index, structura class TestClinicalBenefit: - """Shape 1, like evaluable. Prefers `clinical_benefit_at_week_16_date` - scalar; falls back to treatment_start + 16w.""" + """ + Pattern 1 (e.g. evaluable): + Prefers clinical_benefit_at_week_16_date scalar, falls back to treatment_start + 16w. + """ def test_uses_clinical_benefit_date_scalar_when_set(self, static_index, structural_index): _with_yes_no(structural_index) @@ -198,9 +201,11 @@ def test_skipped_when_no_date_available(self, static_index, structural_index, ca class TestEndOfTreatmentReason: - """Shape 1: concept_id=0, field name in source_value, mapped reason concept + """ + Pattern 1: concept_id=0, field name in source_value, mapped reason concept in value_as_concept_id (or 0 if unmapped), raw reason preserved in both - value_as_string and value_source_value.""" + value_as_string and value_source_value. + """ def test_mapped_reason_emits_row(self, static_index, structural_index): static_index[("eot_reason", "disease progression")] = _static("eot_reason", "disease progression", 1617595, "observation") @@ -225,8 +230,10 @@ def test_mapped_reason_emits_row(self, static_index, structural_index): assert row.value_source_value == "Disease progression" def test_unmapped_reason_emits_row_with_value_concept_zero(self, static_index, structural_index): - """No static mapping → row still emits, value_as_concept_id=0, raw - reason preserved in value_as_string + value_source_value.""" + """ + No static mapping: row still emits, value_as_concept_id=0, raw + reason preserved in value_as_string and value_source_value. 
+ """ concepts = ConceptLookupService(static_index, structural_index) patient = create_patient( PID, @@ -257,8 +264,10 @@ def test_skipped_without_eot_date(self, static_index, structural_index, caplog): class TestLostToFollowup: - """Shape 2 (mapped topic): concept_id=Lost-to-follow-up, - source_value=field name, value_source_value=lowercase literal.""" + """ + Pattern 2 (mapped topic): concept_id=Lost-to-follow-up, + source_value=field name, value_source_value=lowercase literal. + """ def test_lost_to_followup_true_emits_row(self, static_index, structural_index): _with_yes_no(structural_index) @@ -321,12 +330,12 @@ def test_missing_date_skips(self, static_index, structural_index, caplog): class TestAdverseEventOutcome: - """Shape 2/3: topic concept (structural `adverse_event_outcome`) + - answer concept (static `adverse_event_outcome,`). Either lookup + """Pattern 2 and 3: topic concept (structural adverse_event_outcome) and + answer concept (static adverse_event_outcome,). Either lookup can miss and the row still emits with concept_id=0 fallback, as long as outcome and start_date are present. FK-linked.""" - def _make_patient(self, outcome: str | None) -> Patient: + def _make_patient(self, outcome: str | None) -> Patient: # noqa patient = create_patient(PID, TRIAL) ae = AdverseEvent(patient_id=PID) ae.term = "Fever" @@ -359,8 +368,10 @@ def test_mapped_outcome_emits_row(self, static_index, structural_index): assert row.obs_event_field_concept_id == CDM_FIELD_CID def test_topic_structural_missing_falls_back_to_zero(self, static_index, structural_index): - """No topic structural → concept_id=0 but row still emits with mapped - value and raw outcome preserved in value_source_value.""" + """ + No topic structural: concept_id=0 but row still emits with mapped + value and raw outcome preserved in value_source_value. 
+ """ _with_cdm_field(static_index) static_index[("adverse_event_outcome", "recovering/resolving")] = _static("adverse_event_outcome", "recovering/resolving", 1074213, "observation") concepts = ConceptLookupService(static_index, structural_index) @@ -377,8 +388,10 @@ def test_topic_structural_missing_falls_back_to_zero(self, static_index, structu assert row.value_source_value == "Recovering/resolving" def test_value_static_missing_falls_back_to_zero(self, static_index, structural_index): - """No static mapping for the outcome text → value_as_concept_id=0, - row still emits with topic concept and raw outcome preserved.""" + """ + No static mapping for the outcome text: value_as_concept_id=0, + row still emits with topic concept and raw outcome preserved. + """ _with_ae_outcome_topic(structural_index) _with_cdm_field(static_index) concepts = ConceptLookupService(static_index, structural_index) @@ -395,8 +408,10 @@ def test_value_static_missing_falls_back_to_zero(self, static_index, structural_ assert row.value_source_value == "Some unmapped outcome" def test_both_lookups_missing_emits_zero_row_with_raw_value(self, static_index, structural_index): - """Worst case: no mappings at all, row still emits with both - concept ids 0 and value_source_value preserving the raw text.""" + """ + Worst case: no mappings at all, row still emits with both + concept ids 0 and value_source_value preserving the raw text. + """ _with_cdm_field(static_index) concepts = ConceptLookupService(static_index, structural_index) patient = self._make_patient("Some outcome") @@ -422,10 +437,12 @@ def test_outcome_none_emits_nothing(self, static_index, structural_index): class TestAdverseEventWasSerious: - """Shape 3: concept_id=0 + FK linkage. Emits for both True and False - (records the assessment either way).""" + """ + Pattern 3: concept_id=0 and FK linkage. Emits for both True and False + (records the assessment either way). 
+ """ - def _make_patient(self, was_serious: bool | None) -> Patient: + def _make_patient(self, was_serious: bool | None) -> Patient: # noqa patient = create_patient(PID, TRIAL) ae = AdverseEvent(patient_id=PID) ae.term = "Fever" @@ -479,8 +496,10 @@ def test_was_serious_none_emits_nothing(self, static_index, structural_index): assert rows == [] def test_no_fk_when_no_condition_row_published(self, static_index, structural_index, caplog): - """AE with sequence_id but no published condition_occurrence row: - observation still emits, FK fields left blank, warning logged.""" + """ + AE with sequence_id but no published condition_occurrence row: + observation still emits, FK fields left blank, warning logged. + """ _with_yes_no(structural_index) _with_cdm_field(static_index) concepts = ConceptLookupService(static_index, structural_index) @@ -517,8 +536,9 @@ def test_no_fk_when_ae_missing_sequence_id(self, static_index, structural_index, assert any("missing sequence_id" in rec.message for rec in caplog.records) def test_raises_when_cdm_field_missing_but_fk_resolvable(self, static_index, structural_index): - """cdm_field is required infrastructure: builder raises rather than - emit a partially-linked row when the static entry is missing.""" + """ + cdm_field is required: builder raises. 
+ """ _with_yes_no(structural_index) concepts = ConceptLookupService(static_index, structural_index) patient = self._make_patient(True) @@ -612,8 +632,8 @@ def test_multi_source_uniqueness_and_determinism(self, static_index, structural_ rows_a = ObservationBuilder(concepts).build(ctx_a) rows_b = ObservationBuilder(concepts).build(ctx_b) - # 4 scalars/singleton (evaluable + clinical_benefit + eot + lost_to_followup) - # + 3 AE-derived (outcome, was_serious, turned_serious) = 7 rows + # 4 scalars/singleton (evaluable, clinical_benefit, eot, lost_to_followup) + # and 3 AE-derived (outcome, was_serious, turned_serious) = 7 rows assert len(rows_a) == 7 ids = [r.observation_id for r in rows_a] assert len(ids) == len(set(ids)), "All observation_ids must be unique" From 69e2396bf4de52c680de87c60f8395736d7dda24 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Wed, 20 May 2026 15:21:00 +0200 Subject: [PATCH 20/23] feat: using ClinicalBenefit model in Observation builder --- src/omop_etl/omop/builders/observation.py | 48 +++++---- .../harmonization/harmonizers/test_impress.py | 50 ++++++---- .../omop/builders/test_observation_builder.py | 97 ++++++++++++++----- 3 files changed, 132 insertions(+), 63 deletions(-) diff --git a/src/omop_etl/omop/builders/observation.py b/src/omop_etl/omop/builders/observation.py index 6a39d88..240e7ac 100644 --- a/src/omop_etl/omop/builders/observation.py +++ b/src/omop_etl/omop/builders/observation.py @@ -44,7 +44,6 @@ class ObservationBuilder(OmopBuilder[ObservationRow]): """ table_name: ClassVar[str] = "observation" - week_16: ClassVar[dt.timedelta] = dt.timedelta(weeks=16) def build(self, ctx: BuildContext) -> list[ObservationRow]: """ @@ -161,35 +160,44 @@ def _build_clinical_benefit( observation_type_concept_id: int, ) -> list[ObservationRow]: """ - Clinical benefit at W16. 
Uses the dedicated - `clinical_benefit_at_week_16_date` scalar if set, otherwise falls back - to `treatment_start_date + 16 weeks`. - todo: Switch to a ClinicalBenefit singleton, fallback to w16 date if missing date from singleton + Clinical benefit at a source-specific timepoint. Read from the + ClinicalBenefit singleton, date is authoritative (no fallback). + observation_source_value encodes the week + (e.g. "has_clinical_benefit_at_week_16") so downstream queries + can filter by timepoint. """ - value = patient.has_clinical_benefit_at_week_16 - if value is None: + cb = patient.clinical_benefit + if cb is None: + return [] + has_benefit = cb.has_benefit + date = cb.date + week = cb.week + if has_benefit is None: return [] - - date = patient.clinical_benefit_at_week_16_date if date is None: - start = patient.treatment_start_date - if start is None: - log.warning( - "Skipping has_clinical_benefit_at_week_16 for %s: no clinical_benefit_at_week_16_date and no treatment_start_date", - patient.patient_id, - ) - return [] - date = start + self.week_16 + log.warning( + "Skipping clinical_benefit for %s: ClinicalBenefit singleton has no date", + patient.patient_id, + ) + return [] + if week is None: + log.warning( + "Skipping clinical_benefit for %s: ClinicalBenefit singleton has no week", + patient.patient_id, + ) + return [] + field_name = f"has_clinical_benefit_at_week_{week}" return [ self._bool_observation( observation_id=self.generate_row_id( patient.patient_id, - Patient.Scalars.HAS_CLINICAL_BENEFIT_AT_WEEK_16, + Patient.Singletons.CLINICAL_BENEFIT, + *cb.natural_key(), ), person_id=person_id, - field_name=Patient.Scalars.HAS_CLINICAL_BENEFIT_AT_WEEK_16, - value=value, + field_name=field_name, + value=has_benefit, date=date, observation_type_concept_id=observation_type_concept_id, ) diff --git a/tests/harmonization/harmonizers/test_impress.py b/tests/harmonization/harmonizers/test_impress.py index c14fd85..39fb88f 100644 --- 
a/tests/harmonization/harmonizers/test_impress.py +++ b/tests/harmonization/harmonizers/test_impress.py @@ -1581,35 +1581,45 @@ def test_irecist_ne_maps_to_96(self, best_overall_response_fixture): assert row.item(0, "date") == dt.date(1900, 4, 1) -class TestProcessClinicalBenefitAtWeek16: - def test_returns_expected_columns(self, has_clinical_benefit_at_week_16_fixture): - h = ImpressHarmonizer(data=has_clinical_benefit_at_week_16_fixture, trial_id="T") - df = h._process_has_clinical_benefit_at_week_16() +class TestProcessClinicalBenefit: + def test_returns_expected_columns(self, clinical_benefit_fixture): + h = ImpressHarmonizer(data=clinical_benefit_fixture, trial_id="T") + df = h._process_clinical_benefit() assert df is not None - assert "SubjectId" in df.columns - assert "has_clinical_benefit_at_week_16" in df.columns + assert df.columns == ["SubjectId", "week", "has_benefit", "date"] + + def test_week_is_16_for_impress(self, clinical_benefit_fixture): + h = ImpressHarmonizer(data=clinical_benefit_fixture, trial_id="T") + df = h._process_clinical_benefit() + assert df is not None + assert df["week"].unique().to_list() == [16] @pytest.mark.parametrize( - "sid, expected", + "sid, expected_benefit, expected_date", [ - pytest.param("recist_le3", True, id="single criterion: RECIST <=3"), - pytest.param("recist_gt3", False, id="single criterion: RECIST >3"), - pytest.param("irecist_le3", True, id="single criterion: iRECIST <=3"), - pytest.param("rano_le3", True, id="single criterion: RANO <=3"), - pytest.param("both_present", True, id="multi criterion present"), - pytest.param("v03_no_codes", False, id="V03 visit but no benefit codes"), - pytest.param("not_v03", None, id="non-V03 visit -> filtered out"), + pytest.param("recist_le3", True, dt.date(2023, 4, 1), id="RECIST <=3: RA_EventDate"), + pytest.param("recist_gt3", False, dt.date(2023, 4, 2), id="RECIST >3: fallback to RA_EventDate"), + pytest.param("irecist_le3", True, dt.date(2023, 4, 3), id="iRECIST <=3: 
RA_EventDate"), + pytest.param("rano_le3", True, dt.date(2023, 4, 4), id="RANO <=3: RNRSP_EventDate"), + pytest.param("both_present", True, dt.date(2023, 4, 5), id="multi criterion present: RA_EventDate"), + pytest.param("v03_no_codes", False, None, id="V03 visit but no benefit codes, no dates"), + pytest.param("not_v03", None, None, id="non-V03 visit: filtered out"), ], ) - def test_clinical_benefit__at_week_16_values(self, has_clinical_benefit_at_week_16_fixture, sid, expected): - h = ImpressHarmonizer(data=has_clinical_benefit_at_week_16_fixture, trial_id="T") - df = h._process_has_clinical_benefit_at_week_16() + def test_clinical_benefit_values_and_dates(self, clinical_benefit_fixture, sid, expected_benefit, expected_date): + h = ImpressHarmonizer(data=clinical_benefit_fixture, trial_id="T") + df = h._process_clinical_benefit() assert df is not None row = df.filter(pl.col("SubjectId") == sid) - actual = None if row.height == 0 else row.item(0, "has_clinical_benefit_at_week_16") - assert actual is expected + if expected_benefit is None: + assert row.height == 0 + return + actual_benefit = row.item(0, "has_benefit") + actual_date = row.item(0, "date") + assert actual_benefit is expected_benefit + assert actual_date == expected_date class TestProcessEotReason: @@ -1676,13 +1686,13 @@ class TestImpressSpecContracts: "treatment_start_last_cycle": "last_treatment_start_fixture", "treatment_start_date": "treatment_start_fixture", "evaluable_for_efficacy_analysis": "evaluability_fixture", - "has_clinical_benefit_at_week_16": "has_clinical_benefit_at_week_16_fixture", "end_of_treatment_reason": "end_of_treatment_reason_fixture", "end_of_treatment_date": "treatment_stop_fixture", # singletons "tumor_type": "tumor_type_fixture", "study_drugs": "study_drugs_fixture", "biomarkers": "biomarkers_fixture", + "clinical_benefit": "clinical_benefit_fixture", "lost_to_followup": "lost_to_followup_fixture", "ecog_baseline": "ecog_fixture", "baseline_tumor_assessment": 
"baseline_tumor_assessment_fixture", diff --git a/tests/omop/builders/test_observation_builder.py b/tests/omop/builders/test_observation_builder.py index 21ccdfc..47b9e41 100644 --- a/tests/omop/builders/test_observation_builder.py +++ b/tests/omop/builders/test_observation_builder.py @@ -4,6 +4,7 @@ from omop_etl.concept_mapping.service import ConceptLookupService from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit from omop_etl.harmonization.models.domain.followup import FollowUp from omop_etl.harmonization.models.patient import Patient from omop_etl.omop.builders.observation import ObservationBuilder @@ -141,20 +142,29 @@ def test_skipped_when_treatment_start_date_missing(self, static_index, structura class TestClinicalBenefit: """ - Pattern 1 (e.g. evaluable): - Prefers clinical_benefit_at_week_16_date scalar, falls back to treatment_start + 16w. + Pattern 1 (no topic concept): concept_id=0, observation_source_value + encodes the week as `has_clinical_benefit_at_week_`, value_as_concept_id + is Yes/No concepts, observation_date is the singleton's date. 
""" - def test_uses_clinical_benefit_date_scalar_when_set(self, static_index, structural_index): + def _make_singleton( # noqa + self, + *, + has_benefit: bool | None, + week: int | None = 16, + date: dt.date | None = dt.date(2023, 4, 20), + ) -> ClinicalBenefit: + cb = ClinicalBenefit(patient_id=PID) + cb.week = week + cb.has_benefit = has_benefit + cb.date = date + return cb + + def test_emits_row_with_yes_for_true(self, static_index, structural_index): _with_yes_no(structural_index) concepts = ConceptLookupService(static_index, structural_index) - patient = create_patient( - PID, - TRIAL, - has_clinical_benefit_at_week_16=True, - clinical_benefit_at_week_16_date=dt.date(2023, 4, 20), - treatment_start_date=dt.date(2023, 1, 1), - ) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=True) rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) @@ -163,41 +173,77 @@ def test_uses_clinical_benefit_date_scalar_when_set(self, static_index, structur assert row.observation_concept_id == 0 assert row.observation_date == dt.date(2023, 4, 20) assert row.observation_source_value == "has_clinical_benefit_at_week_16" + assert row.observation_source_concept_id == 0 assert row.value_as_concept_id == YES_CID assert row.value_source_value == "true" - def test_falls_back_to_treatment_start_plus_16_weeks(self, static_index, structural_index): + def test_emits_row_with_no_for_false(self, static_index, structural_index): _with_yes_no(structural_index) concepts = ConceptLookupService(static_index, structural_index) - patient = create_patient( - PID, - TRIAL, - has_clinical_benefit_at_week_16=True, - treatment_start_date=dt.date(2023, 1, 1), + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=False) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].value_as_concept_id == NO_CID + 
assert rows[0].value_source_value == "false" + + def test_source_value_encodes_week_for_other_timepoints(self, static_index, structural_index): + """Week 24 from another source produces: `has_clinical_benefit_at_week_24`.""" + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton( + has_benefit=True, + week=24, + date=dt.date(2023, 6, 1), ) rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) assert len(rows) == 1 - assert rows[0].observation_date == dt.date(2023, 1, 1) + dt.timedelta(weeks=16) + assert rows[0].observation_source_value == "has_clinical_benefit_at_week_24" + assert rows[0].observation_date == dt.date(2023, 6, 1) - def test_skipped_when_value_is_none(self, static_index, structural_index): + def test_singleton_absent_returns_empty(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_skipped_when_has_benefit_is_none(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index) - patient = create_patient(PID, TRIAL, treatment_start_date=dt.date(2023, 1, 1)) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=None) rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) assert rows == [] - def test_skipped_when_no_date_available(self, static_index, structural_index, caplog): + def test_skipped_when_date_is_none(self, static_index, structural_index, caplog): concepts = ConceptLookupService(static_index, structural_index) - patient = create_patient(PID, TRIAL, has_clinical_benefit_at_week_16=True) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = 
self._make_singleton(has_benefit=True, date=None) + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("no date" in rec.message for rec in caplog.records) + + def test_skipped_when_week_is_none(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=True, week=None) with caplog.at_level(logging.WARNING): rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) assert rows == [] - assert any("clinical_benefit_at_week_16_date" in rec.message for rec in caplog.records) + assert any("no week" in rec.message for rec in caplog.records) class TestEndOfTreatmentReason: @@ -605,11 +651,16 @@ def test_multi_source_uniqueness_and_determinism(self, static_index, structural_ TRIAL, treatment_start_date=dt.date(2023, 1, 10), evaluable_for_efficacy_analysis=True, - has_clinical_benefit_at_week_16=False, end_of_treatment_reason="Other", end_of_treatment_date=dt.date(2023, 8, 1), ) + cb = ClinicalBenefit(patient_id=PID) + cb.week = 16 + cb.has_benefit = False + cb.date = dt.date(2023, 4, 25) + patient.clinical_benefit = cb + followup = FollowUp(patient_id=PID) followup.lost_to_followup = True followup.date_lost_to_followup = dt.date(2023, 9, 1) From d5dd1506928d1cc7fa27dfa925b033c203005f5b Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Thu, 21 May 2026 11:56:48 +0200 Subject: [PATCH 21/23] feat: ConditionOccurrence publishes FK so measurement (cancer modifier records) can link back to primary cancer entry for that patient, as per cdm oncology guidelines --- src/omop_etl/omop/builders/base.py | 1 + .../omop/builders/condition_occurrence.py | 18 +- src/omop_etl/omop/builders/measurement.py | 41 ++++ .../semantic_mapped/braf_non-v600_mapped.csv 
| 2 +- .../static_mapped/structural_mapping.csv | 3 +- .../builders/test_condition_occurrence.py | 80 ++++++++ tests/omop/builders/test_measurement.py | 186 ++++++++++++++++++ .../omop/builders/test_observation_builder.py | 4 +- 8 files changed, 328 insertions(+), 7 deletions(-) diff --git a/src/omop_etl/omop/builders/base.py b/src/omop_etl/omop/builders/base.py index f1c1594..9c158a8 100644 --- a/src/omop_etl/omop/builders/base.py +++ b/src/omop_etl/omop/builders/base.py @@ -21,6 +21,7 @@ class BuildContext: person_id: int visit_id_by_date: dict[dt.date, int] = field(default_factory=dict) condition_id_by_ae_sequence_id: dict[int, int] = field(default_factory=dict) + condition_id_primary_cancer: int | None = None class OmopBuilder(ABC, Generic[T]): diff --git a/src/omop_etl/omop/builders/condition_occurrence.py b/src/omop_etl/omop/builders/condition_occurrence.py index 8d259e2..988b352 100644 --- a/src/omop_etl/omop/builders/condition_occurrence.py +++ b/src/omop_etl/omop/builders/condition_occurrence.py @@ -26,9 +26,11 @@ class ConditionOccurrenceBuilder(OmopBuilder[ConditionOccurrenceRow]): def __init__(self, concepts: ConceptLookupService): super().__init__(concepts) self._ae_to_condition_id: dict[int, int] = {} + self._primary_cancer_condition_id: int | None = None def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: self._ae_to_condition_id = {} + self._primary_cancer_condition_id = None patient = ctx.patient person_id = ctx.person_id rows: list[ConditionOccurrenceRow] = [] @@ -36,7 +38,12 @@ def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: condition_type_concept_id = int(ecrf.concept_id) if ecrf else 0 if patient.tumor_type is not None: - rows.extend(self._build_tumor_type_rows(patient, person_id, patient.tumor_type, condition_type_concept_id)) + tumor_rows = self._build_tumor_type_rows(patient, person_id, patient.tumor_type, condition_type_concept_id) + if tumor_rows: + # multi-concept tumor mappings produce multiple rows: 
pick the + # first deterministically (collection already sorted by NK). + self._primary_cancer_condition_id = tumor_rows[0].condition_occurrence_id + rows.extend(tumor_rows) for idx, mh in enumerate(patient.medical_histories): rows.extend(self._build_medical_history_rows(patient, person_id, mh, idx, condition_type_concept_id)) @@ -48,10 +55,15 @@ def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: def populate_context(self, rows: list[ConditionOccurrenceRow], ctx: BuildContext) -> None: """ - Publish AE.sequence_id to condition_occurrence_id from accumulated mapping from build, - so other builders (e.g. Observation) can set field concept/event ids from AEs (e.g. was_serious, turned_serious_data). + Publish two pieces of cross-builder state: + - condition_id_by_ae_sequence_id: AE.sequence_id: condition_occurrence_id, + for ObservationBuilder's was_serious / turned_serious_date FK linkage. + - condition_id_primary_cancer: condition_occurrence_id of the tumor-type + row, for MeasurementBuilder's measurement_event_id linkage on lesion-size + and biomarker rows (per oncology CDM guideline). """ ctx.condition_id_by_ae_sequence_id.update(self._ae_to_condition_id) + ctx.condition_id_primary_cancer = self._primary_cancer_condition_id def _build_tumor_type_rows( self, diff --git a/src/omop_etl/omop/builders/measurement.py b/src/omop_etl/omop/builders/measurement.py index 39b7f88..5ac842d 100644 --- a/src/omop_etl/omop/builders/measurement.py +++ b/src/omop_etl/omop/builders/measurement.py @@ -35,10 +35,35 @@ class MeasurementBuilder(OmopBuilder[MeasurementRow]): - value_as_number: is the numeric result where the source provides one. - visit_occurrence_id: is linked by date via ctx.visit_id_by_date (populated by the visit_occurrence builder, which must run before this). + - measurement_event_id: linked to cancer condition from ctx.condition_id_primary_cancer + which is populated by the ConditionOccurrence builder (must run before this builder). 
+ - meas_event_field_concept_id: links to condition occurrence field concept for the + measurement_event_id FK. """ table_name: ClassVar[str] = "measurement" + def _primary_cancer_fk(self, ctx: BuildContext) -> tuple[int | None, int | None]: + """ + Resolve (measurement_event_id, meas_event_field_concept_id) for + linking a measurement back to the patient's primary cancer + condition_occurrence row, per oncology CDM guideline. + Returns (None, None) when no primary cancer condition has been + published, raises if the cdm_field static entry is missing + (required once a primary cancer is published). + """ + event_id = ctx.condition_id_primary_cancer + if event_id is None: + return None, None + field_concept = self.concepts.lookup_static( + "cdm_field", + "condition_occurrence.condition_occurrence_id", + domains={"Metadata"}, + ) + if field_concept is None: + raise RuntimeError("Missing cdm_field mapping for condition_occurrence.condition_occurrence_id") + return event_id, field_concept.concept_id + def build(self, ctx: BuildContext) -> list[MeasurementRow]: patient = ctx.patient person_id = ctx.person_id @@ -249,6 +274,7 @@ def _build_biomarker_rows( datetime_value = dt.datetime(date.year, date.month, date.day) visit_occurrence_id = ctx.visit_id_by_date.get(date) + event_id, field_concept_id = self._primary_cancer_fk(ctx) return [ MeasurementRow( measurement_id=self.generate_row_id( @@ -265,6 +291,8 @@ def _build_biomarker_rows( measurement_type_concept_id=ecrf_concept, visit_occurrence_id=visit_occurrence_id, measurement_source_value=source_value[:50], + measurement_event_id=event_id, + meas_event_field_concept_id=field_concept_id, ) for concept in matches ] @@ -301,6 +329,10 @@ def _build_tumor_assessment_baseline_rows( log.warning("No lesion_size structural concept for %s", patient.patient_id) return [] + unit = self.concepts.lookup_structural("millimeter", domains={"Unit"}) + unit_concept_id = unit.concept_id if unit else None + event_id, field_concept_id = 
self._primary_cancer_fk(ctx) + row_id = self.generate_row_id( patient.patient_id, Patient.Singletons.TUMOR_ASSESSMENT_BASELINE, @@ -316,8 +348,11 @@ def _build_tumor_assessment_baseline_rows( measurement_datetime=dt.datetime(date.year, date.month, date.day), measurement_type_concept_id=ecrf_concept, value_as_number=float(size), + unit_concept_id=unit_concept_id, visit_occurrence_id=ctx.visit_id_by_date.get(date), measurement_source_value=str(size)[0:50], + measurement_event_id=event_id, + meas_event_field_concept_id=field_concept_id, ) ] @@ -361,6 +396,9 @@ def _build_tumor_assessment_rows( if size is not None: lesion = self.concepts.lookup_structural("lesion_size", domains={OmopDomain.MEASUREMENTS}) if lesion is not None: + unit = self.concepts.lookup_structural("millimeter", domains={"Unit"}) + unit_concept_id = unit.concept_id if unit else None + event_id, field_concept_id = self._primary_cancer_fk(ctx) rows.append( MeasurementRow( measurement_id=self.generate_row_id( @@ -375,8 +413,11 @@ def _build_tumor_assessment_rows( measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, value_as_number=size, + unit_concept_id=unit_concept_id, visit_occurrence_id=visit_occurrence_id, measurement_source_value=str(size)[:50], + measurement_event_id=event_id, + meas_event_field_concept_id=field_concept_id, ) ) diff --git a/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv b/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv index cc24ec3..cc45b7e 100644 --- a/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv +++ b/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv @@ -170,7 +170,7 @@ c0d469b8-2d83-5ef8-a060-3cb5b0f2eef2,CM_CMTRT,Ibuprofen,1,1177480,5640,ibuprofen 98cea1ff-2517-56c3-af61-bb37270630e4,CM_CMTRT,Kaliumklorid,1,19049105,8591,potassium chloride,Ingredient,Standard,Valid,Drug,RxNorm Extension 4000804f-b2bb-52b8-859e-8747b20d8124,CM_CMTRT,Fiasp (Insulin),1,1567198,51428,"insulin aspart, 
human",Ingredient,Standard,Valid,Drug,RxNorm fc4de2f6-c822-542e-8392-53cf03842961,CM_CMTRT,Cimetidin,1,997276,2541,cimetidine,Ingredient,Standard,Valid,Drug,RxNorm -9261aa9a-b652-5d50-b109-b5c92b058ae2,COH_COHCTN,BRAF Non-V600 activating mutations,12,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC +9261aa9a-b652-5d50-b109-b5c92b058ae2,COH_COHCTN,c,12,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC a683d456-51bb-52f1-8dcb-b88cfe79add6,COH_COHCTN,BRAF Non-V600activating mutations,1,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC 8d9cba99-2d8f-5254-8226-cc57d315adc7,COH_COHTMN,BRAF Non-V600 activating mutations,13,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC 4bf9b6d6-5f7e-5c8e-b424-16b4885158df,COH_GENMUT1,BRAF activating mutations,13,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC diff --git a/src/omop_etl/resources/static_mapped/structural_mapping.csv b/src/omop_etl/resources/static_mapped/structural_mapping.csv index e44d70b..1378cdd 100644 --- a/src/omop_etl/resources/static_mapped/structural_mapping.csv +++ b/src/omop_etl/resources/static_mapped/structural_mapping.csv @@ -3,7 +3,8 @@ no,4188540,373067005,No,Qualifier Value,Standard,Valid,Meas Value,SNOMED yes,4188539,373066001,Yes,Qualifier Value,Standard,Valid,Meas Value,SNOMED adverse_event_outcome,4231813,405533003,Adverse incident outcome,Clinical Finding,Standard,Valid,Observation,SNOMED ecog,36305384,89247-1,ECOG Performance Status score,Clinical Observation,Standard,Valid,Measurement,LOINC -lesion_size,4084390,246116008,Lesion 
size,Observable Entity,Standard,Valid,Measurement,SNOMED +lesion_size,36768664,OMOP4998340,Dimension of Tumor,Dimension,Standard,Valid,Measurement,Cancer Modifier +millimeter,8588,mm,millimeter,Unit,Standard,Valid,Unit,UCUM number_of_lesions,4085855,246206008,Number of lesions,Observable Entity,Standard,Valid,Observation,SNOMED response_recist,734317,RECIST,RECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier response_irecist,734318,iRECIST,iRECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier diff --git a/tests/omop/builders/test_condition_occurrence.py b/tests/omop/builders/test_condition_occurrence.py index ce2c714..27a4937 100644 --- a/tests/omop/builders/test_condition_occurrence.py +++ b/tests/omop/builders/test_condition_occurrence.py @@ -593,3 +593,83 @@ def test_fk_publication_deterministic(self, static_index, structural_index): assert ctx_a.condition_id_by_ae_sequence_id == ctx_b.condition_id_by_ae_sequence_id assert ctx_a.condition_id_by_ae_sequence_id != {} + + +class TestPrimaryCancerFKPublication: + """ + Oncology CDM guideline: cancer-modifier Measurement rows (dimensions, + biomarkers, optional future metastasis/node/stage) should link back to the + primary cancer's condition_occurrence_id. ConditionOccurrenceBuilder + publishes that id from the tumor_type emission. 
+ """ + + @staticmethod + def _tumor_semantic(concept_id: int) -> SemanticEntry: + return SemanticEntry( + patient_id=PID, + field_path=(Patient.Singletons.TUMOR_TYPE, TumorType.Fields.ICD10_CODE), + leaf_index=None, + concept_id=concept_id, + name="neoplasm", + domain="condition", + ) + + def test_publishes_primary_cancer_id_from_tumor_type(self, static_index, structural_index): + semantic = create_semantic_index(self._tumor_semantic(4000)) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + tumor = TumorType(patient_id=PID) + tumor.icd10_code = "C50.9" + tumor.date = dt.date(2022, 6, 1) + patient.tumor_type = tumor + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + tumor_row = next(r for r in rows if r.condition_concept_id == 4000) + assert ctx.condition_id_primary_cancer == tumor_row.condition_occurrence_id + + def test_no_primary_cancer_id_when_tumor_type_absent(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ctx = create_build_context(patient, PERSON_ID) + + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert ctx.condition_id_primary_cancer is None + + def test_no_primary_cancer_id_when_tumor_unmapped(self, static_index, structural_index): + """Tumor type present but no semantic match: no row and no FK published.""" + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + tumor = TumorType(patient_id=PID) + tumor.icd10_code = "C99.99" + tumor.date = dt.date(2022, 6, 1) + patient.tumor_type = tumor + ctx = create_build_context(patient, PERSON_ID) + + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert ctx.condition_id_primary_cancer is None + + def test_multi_concept_tumor_picks_first_row_deterministically(self, static_index, 
structural_index): + """Two semantic matches for the tumor: two rows, FK is first row's id.""" + semantic = create_semantic_index( + self._tumor_semantic(4000), + self._tumor_semantic(4001), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + tumor = TumorType(patient_id=PID) + tumor.icd10_code = "C50.9" + tumor.date = dt.date(2022, 6, 1) + patient.tumor_type = tumor + ctx_a = create_build_context(patient, PERSON_ID) + ctx_b = create_build_context(patient, PERSON_ID) + + rows_a = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_a) + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_b) + + assert len(rows_a) == 2 + assert ctx_a.condition_id_primary_cancer == rows_a[0].condition_occurrence_id + assert ctx_a.condition_id_primary_cancer == ctx_b.condition_id_primary_cancer diff --git a/tests/omop/builders/test_measurement.py b/tests/omop/builders/test_measurement.py index 65da53f..05a9460 100644 --- a/tests/omop/builders/test_measurement.py +++ b/tests/omop/builders/test_measurement.py @@ -13,8 +13,12 @@ from omop_etl.omop.builders.measurement import MeasurementBuilder from omop_etl.omop.builders.visit_occurrence import VisitOccurrenceBuilder from omop_etl.omop.core.id_generator import sha1_bigint +import pytest + from tests.omop.conftest import ( SemanticEntry, + _static, + _structural, create_build_context, create_patient, create_semantic_index, @@ -1489,3 +1493,185 @@ def test_row_id_deterministic(self, static_index, structural_index): rows_2 = MeasurementBuilder(ConceptLookupService(static_index, structural_index, semantic)).build(context) assert rows_1[0].measurement_id == rows_2[0].measurement_id + + +CDM_FIELD_CID = 1147127 +UNIT_MM_CID = 8588 + + +def _with_cdm_field(static_index: dict) -> dict: + """Add the cdm_field static entry used to identify the FK target field.""" + static_index[("cdm_field", "condition_occurrence.condition_occurrence_id")] = _static( + "cdm_field", 
+ "condition_occurrence.condition_occurrence_id", + CDM_FIELD_CID, + "metadata", + ) + return static_index + + +def _with_millimeter(structural_index: dict) -> dict: + """Add the millimeter unit concept (UCUM) so lesion-size rows can populate unit_concept_id.""" + structural_index["millimeter"] = _structural("millimeter", UNIT_MM_CID, "unit") + return structural_index + + +class TestPrimaryCancerFKConsumption: + """ + MeasurementBuilder consumes BuildContext.condition_id_primary_cancer + (published by ConditionOccurrenceBuilder) to set + measurement_event_id + meas_event_field_concept_id on lesion-size + (TumorAssessmentBaseline + TumorAssessment) and biomarker rows. + + Cancer modifier rows (lesion size as Dimension of Tumor, biomarkers) link + back to the primary cancer condition via measurement_event_id + + meas_event_field_concept_id, per oncology CDM guidelines. + """ + + @staticmethod + def _baseline_patient() -> Patient: + patient = create_patient(PID, TRIAL) + baseline = TumorAssessmentBaseline(PID) + baseline.target_lesion_size = 41 + baseline.target_lesion_measurement_date = dt.date(2040, 4, 19) + patient.tumor_assessment_baseline = baseline + return patient + + def test_baseline_lesion_size_links_to_primary_cancer(self, static_index, structural_index): + _with_cdm_field(static_index) + _with_millimeter(structural_index) + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 12345 + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.measurement_event_id == 12345 + assert row.meas_event_field_concept_id == CDM_FIELD_CID + assert row.unit_concept_id == UNIT_MM_CID + + def test_baseline_lesion_size_no_fk_when_primary_cancer_not_published(self, static_index, structural_index): + _with_millimeter(structural_index) + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) 
+ # ctx.condition_id_primary_cancer left as None + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + assert rows[0].measurement_event_id is None + assert rows[0].meas_event_field_concept_id is None + # unit_concept_id is independent of FK linkage: still populated + assert rows[0].unit_concept_id == UNIT_MM_CID + + def test_baseline_lesion_size_unit_missing_falls_back_to_none(self, static_index, structural_index): + """structural index without millimeter: unit_concept_id is None.""" + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + assert rows[0].unit_concept_id is None + + def test_baseline_lesion_size_raises_when_primary_cancer_published_but_cdm_field_missing(self, static_index, structural_index): + """If a primary cancer condition is published, the cdm_field entry is required""" + _with_millimeter(structural_index) + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 12345 + + with pytest.raises(RuntimeError, match="cdm_field"): + MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + def test_tumor_assessment_lesion_size_links_to_primary_cancer(self, static_index, structural_index): + _with_cdm_field(static_index) + _with_millimeter(structural_index) + patient = create_patient(PID, TRIAL) + patient.tumor_assessments = [ + _make_tumor_assessments(dt.date(2040, 6, 14), "V03", size=20.5), + ] + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 67890 + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + size_rows = [r for r in rows if r.measurement_concept_id == 4084390] + assert len(size_rows) == 1 + assert size_rows[0].measurement_event_id == 67890 + assert 
size_rows[0].meas_event_field_concept_id == CDM_FIELD_CID + assert size_rows[0].unit_concept_id == UNIT_MM_CID + + def test_biomarker_links_to_primary_cancer(self, static_index, structural_index): + _with_cdm_field(static_index) + semantic = create_semantic_index( + SemanticEntry( + patient_id=PID, + field_path=(Patient.Singletons.BIOMARKERS, Biomarkers.Fields.COHORT_TARGET_MUTATION), + leaf_index=None, + concept_id=4000, + name="braf non-v600", + domain="measurement", + ) + ) + patient = create_patient(PID, TRIAL) + biomarkers = Biomarkers(PID) + biomarkers.cohort_target_mutation = "BRAF non-V600" + biomarkers.date = dt.date(2040, 1, 1) + patient.biomarkers = biomarkers + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 77777 + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index, semantic)).build(ctx) + + assert len(rows) == 1 + assert rows[0].measurement_event_id == 77777 + assert rows[0].meas_event_field_concept_id == CDM_FIELD_CID + + def test_biomarker_no_fk_when_primary_cancer_not_published(self, static_index, structural_index): + semantic = create_semantic_index( + SemanticEntry( + patient_id=PID, + field_path=(Patient.Singletons.BIOMARKERS, Biomarkers.Fields.COHORT_TARGET_MUTATION), + leaf_index=None, + concept_id=4000, + name="braf non-v600", + domain="measurement", + ) + ) + patient = create_patient(PID, TRIAL) + biomarkers = Biomarkers(PID) + biomarkers.cohort_target_mutation = "BRAF non-V600" + biomarkers.date = dt.date(2040, 1, 1) + patient.biomarkers = biomarkers + ctx = create_build_context(patient, PERSON_ID) + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index, semantic)).build(ctx) + + assert len(rows) == 1 + assert rows[0].measurement_event_id is None + assert rows[0].meas_event_field_concept_id is None + + def test_non_cancer_modifier_rows_have_no_fk(self, static_index, structural_index): + """ + ECOG, C30, EQ5D, AE-measurement, MH-measurement rows are not 
+ cancer modifiers and should not link to primary cancer. + Verified here on ECOG (AE/MH rows are tested elsewhere). + """ + _with_cdm_field(static_index) + patient = create_patient(PID, TRIAL) + ecog = EcogBaseline(PID) + ecog.grade = 1 + ecog.date = dt.date(2040, 1, 1) + patient.ecog_baseline = ecog + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 99999 + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + # ECOG rows are not cancer modifiers: no FK + assert rows[0].measurement_event_id is None + assert rows[0].meas_event_field_concept_id is None diff --git a/tests/omop/builders/test_observation_builder.py b/tests/omop/builders/test_observation_builder.py index 47b9e41..d452526 100644 --- a/tests/omop/builders/test_observation_builder.py +++ b/tests/omop/builders/test_observation_builder.py @@ -147,8 +147,8 @@ class TestClinicalBenefit: is Yes/No concepts, observation_date is the singleton's date. 
""" - def _make_singleton( # noqa - self, + @staticmethod + def _make_singleton( *, has_benefit: bool | None, week: int | None = 16, From d6fb13be95b323e712649fb5fc33fcab3daec7c8 Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Thu, 21 May 2026 13:24:50 +0200 Subject: [PATCH 22/23] feat: added normalization file for target biomarkers --- .../biomarker_normalization.csv | 172 ++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 src/omop_etl/resources/cohort_normalization/biomarker_normalization.csv diff --git a/src/omop_etl/resources/cohort_normalization/biomarker_normalization.csv b/src/omop_etl/resources/cohort_normalization/biomarker_normalization.csv new file mode 100644 index 0000000..2ee31d1 --- /dev/null +++ b/src/omop_etl/resources/cohort_normalization/biomarker_normalization.csv @@ -0,0 +1,172 @@ +source_biomarker_name,frequency,normalized_biomarker_name,comment +4q12 amplicon,1,4q12, +9p24.1 amplicon,1,9p24, +A775_G776INSYVMA,1,ERBB2, +ALK fusion,3,ALK fusion, +ALK FUSION IGFBP5,1,ALK fusion, +ALK fusions,10,ALK fusion, +ALK mut,1,ALK mut, +ALK translocation,1,ALK translocation,"Might be a fusion, the trial must confirm" +ATM-ATR,14,ATM-ATR, +BRAF,10,BRAFV600, +BRAF fusion,5,BRAF fusion, +BRAF fusions,1,BRAF fusion, +BRAF non-V600,2,BRAFnonV600, +BRAF Non-V600mut,7,BRAFnonV600, +BRAF nonV600 (BRAF D594G),1,BRAFnonV600, +BRAF nonV600E,1,BRAFnonV600, +BRAF nonV600E (15exon),1,BRAFnonV600, +BRAF nonV600E (BRAF G469A),2,BRAFnonV600, +BRAF V600,14,BRAFV600, +BRAF V600 (3) + BRAF non-V600 (4),1,BRAFV600, +BRAF V600 activating mutations,3,BRAFV600, +BRAF v600E,1,BRAFV600, +BRAF V600E,10,BRAFV600, +BRAF V600E activating mutations,20,BRAFV600, +BRAF V600Emut,14,BRAFV600, +BRAF V600G,1,BRAFV600, +BRAF VAL600 GLU,1,BRAFV600, +BRCA,11,BRCA12, +BRCA1 2 biallelic inactivation,1,BRCA12, +BRCA2 biallelic loss,2,BRCA12, +C.2313_2324DUP P.(TYR772_ALA775DUP),1,ERBB2, +Constitutional mismatch repair deficiency 
disposition (CMMRD),1,CMMRD, +CMMrd,,CMMRD, +Double hit in PIK3CA/PTEN,1,PIK3CA, +Drug screen,1,Drug screen,"the trial must elaborate, might be organoids-based drug screening and not target-drug-match. Think we should keep them separated from other cohorts." +EBB2 amplification,12,ERBB2 ampl, +EGFR amp,2,EGFR ampl, +EGFR mut,1,EGFR mut, +ERBB2 - L755P,1,ERBB2 mut, +ERBB2 (HER2) Amplification,22,ERBB2 ampl, +ERBB2 (HER2) G776V mutation,1,ERBB2 mut, +ERBB2 (HER2) R678Q mutation,1,ERBB2 mut, +ERBB2 (HER2) S310F mutation,2,ERBB2 mut, +ERBB2 (HER2) S310F MUTATION AND HER2 AMPLIFICATION,1,ERBB2 mut/ampl, +ERBB2 (HER2) S310Y mutation,1,ERBB2 mut, +ERBB2 A775_G776INSYVMA,1,ERBB2 INSYVMA, +ERBB2 C. 2313_2324DUP P.(TYR 772_ALA775DUP) VARIANT DETECTED,1,ERBB2, +ERBB2 C.2313_2324DUP ; P.(TYR772_ALA775DUP) 16%,1,ERBB2, +ERBB2 EXON 20 ACTIVATING VARIANT C.2262_2269DELINSCCCGA P.(LEU755_GLU757DELINSPROLYS),1,ERBB2, +ERBB2 EXON 20 MUTATION C.2326_2327INSTTGTGATGGCTG P.ALA775_GLY776INSVALVALMETALA).,1,ERBB2, +ERBB2 G776>VC,1,ERBB2, +ERBB2 GLY766DELINSVALCYS,1,ERBB2, +ERBB2 mutation,4,ERBB2 mut, +ERBB2 MUTATION A775_G776INSYVMA,1,ERBB2 mut, +ERBB2 MUTATION G778S AND AMPLIFICATION,1,ERBB2 mut/ampl, +ERBB2 P.(GLY776DELINSVALCYS),1,ERBB2, +ERBB2 P.(ILE767MET),1,ERBB2, +ERBB2 P.Y772 A775DUP (EXON 20),1,ERBB2, +ERBB2 P.Y772_A775DUP,1,ERBB2, +ERBB2 TYR772_ALA775DUP,1,ERBB2, +"ERBB2, P.Y772_A775DUP",1,ERBB2, +ERBB2. 
C2313_2324DUP P.(TYR772_ALA775DUP),1,ERBB2, +ERBB2ampl,10,ERBB2 ampl, +ERBB2mut,3,ERBB2 mut, +ERBB3 mutation,1,ERBB3 mut, +EXON 20 ERB B2 INSERTION (C.2313_2324DUP P.(TYR772_ALA775DUP).PDL1 10%,1,ERBB2, +FGFampl,6,FGF ampl, +FGFR1 ampl,3,FGFR ampl, +FGFR1 amplification,1,FGFR ampl, +FGFR1 double hit,1,FGFR mut/fusion, +FGFR1 fusion,1,FGFR mut/fusion, +FGFR1 fusions,1,FGFR mut/fusion, +FGFR1 mut,1,FGFR mut/fusion, +FGFR2 fusion,2,FGFR mut/fusion, +FGFR2 mut,6,FGFR mut/fusion, +FGFR3 fusion,4,FGFR mut/fusion, +FGFR3 mut,2,FGFR mut/fusion, +FGFR3mut,1,FGFR mut/fusion, +FGFRampl,6,FGFR ampl, +FGFRfusion,8,FGFR mut/fusion, +FGFRmut,9,FGFR mut/fusion, +FRFR2 fusion (ATE1fusion),1,FGFR mut/fusion, +GNA11,1,GNA11, +GNA11 mut,1,GNA11 mut, +GNAQ mut,1,GNAQ mut, +GNAS mut,2,GNAS mut, +HER2 amp,7,ERBB2 ampl, +HER2 ampl,11,ERBB2 ampl, +HER2 ampl/mut,2,ERBB2 mut/ampl, +HER2 AMPLIFICATION (COPY NUMBER 84) AND POINT MUTATION S310Y,1,ERBB2 mut/ampl, +HER2 EXON 20 INSERTION MUTATION,1,ERBB2 mut, +HER2 mut,11,ERBB2 mut, +HER2 postive,1,ERBB2 overexprression, +HER2amp,6,ERBB2 ampl, +HER2exp,2,ERBB2 overexprression, +HER2mut,5,ERBB2 mut, +High tumour mutational burden (TMB),20,TMB high, +HRAS G12D,1,HRAS mut, +HRAS mut,1,HRAS mut, +HRD,11,HRD, +HRR alterations,4,HRR, +HRR defiency,1,HRR, +Hypocellular AML,1,AML, +KRAS (G12V),1,KRAS mut, +KRAS G12D,1,KRAS mut, +KRAS G12S,1,KRAS mut, +KRAS G12V,1,KRAS mut, +KRAS mut,1,KRAS mut, +KRAS mut (G12V and G12D),1,KRAS mut, +"KRAS, NRAS, BRAF",1,RAS-RAF-pathway,"RAS/RAF-pathway, may keep them separated from the KRAS-cohorts" +LTK high,1,LTK high, +MAP2,1,MAP2, +MAP2K1 (MEK1),1,MAP2K1, +"MAP2K1 (MEK1), MAP2K2 (MEK2) or NRAS",1,"MAP2K1, MAP2K2, NRAS", +MAP2K4 loss_mut,1,MAP2K4 mut, +MAP2K4 mut,1,MAP2K4 mut, +MAP3K1 loss_mut,1,MAP3K1 mut, +MAP3K1 mut,1,MAP3K1 mut, +MET amp,3,MET ampl, +MET amplification,7,MET ampl, +MET deletions EXON 14,1,MET exon 14 skip, +MET Exon 14,1,MET exon 14 skip, +MET fusion,5,MET fusion, +Microsatellite instability high 
(MSI),2,MSI high, +MSI-high,8,MSI high, +MSI-high incl. res,4,MSI high, +MSI-high incl.res,1,MSI high, +MSIhigh,8,MSI high, +NF1,1,NF1, +NF1 amplification,1,NF1 ampl, +NF1 loss,1,NF1, +NF1 mutation,7,NF1 mut, +NF1loss_mut,1,NF1 mut, +NF1mut,1,NF1 mut, +NPM1 mut AML,1,NPM1 mut, +NRAS amp,2,NRAS ampl, +NRAS mut,8,NRAS mut, +NRAS mutation,3,NRAS mut, +NRAS mutation (Q61R),1,NRAS mut, +Other (AN IN-FRAME INSERTION WITHIN ERBB2 EXON 20),1,ERBB2, +Other (ERBB2 (HER2)),1,ERBB2, +Other (ERBB2 EXON 20),1,ERBB2 mut, +Other (ERBB2C.2313_2324DUP),1,ERBB2 mut, +PBRM1,1,PBRM1, +PD-L1,2,PD-L1, +PDGFRA,3,PDGFRA, +PIK3CA DoubleHit incl res,1,PIK3CA, +PIK3CA mut,10,PIK3CA mut, +PIK3CA mut/ampl,1,PIK3CA mut/ampl, +PIK3CAmut,2,PIK3CA mut, +PIK3R1 mut,1,PIK3R1 mut, +PIK3R2 mut,1,PIK3R2 mut, +POLE mut,1,POLE mut, +PTEN loss_mut,3,PTEN loss/mut, +PTENloss,5,PTEN loss, +PTENloss/mut,1,PTEN loss/mut, +ROS1 fusion,1,ROS1 fusion, +SHH-pathway,6,SHH-pathway, +STRN-ALK FUSION,1,ALK fusion, +TMB,5,TMB high, +TMB >= 16,1,TMB high, +TMB >=16,17,TMB high, +TMB >=16 eval MRI,1,TMB high, +TMB >=16 incl. 
res,1,TMB high, +TMB >=16 incl.res,1,TMB high, +TMB >=16 TMZ ind,3,TMB high, +TMB >=16 TMZ ind incl res,1,TMB high, +TMB>=16,2,TMB high, +TMBhigh,16,TMB high, +TML >=140,2,TML, +V600 MUTATION,1,BRAFV600, \ No newline at end of file From c0a5c38408b672185069bd71e9569f8a1883f0df Mon Sep 17 00:00:00 2001 From: Gabrielstav <49963039+Gabrielstav@users.noreply.github.com> Date: Thu, 21 May 2026 13:34:03 +0200 Subject: [PATCH 23/23] feat: tumor type harmonization file --- ...n.csv => harmonized_target_biomarkers.csv} | 4 +- .../harmonized_tumor_types.csv | 215 ++++++++++++++++++ 2 files changed, 217 insertions(+), 2 deletions(-) rename src/omop_etl/resources/cohort_normalization/{biomarker_normalization.csv => harmonized_target_biomarkers.csv} (95%) create mode 100644 src/omop_etl/resources/cohort_normalization/harmonized_tumor_types.csv diff --git a/src/omop_etl/resources/cohort_normalization/biomarker_normalization.csv b/src/omop_etl/resources/cohort_normalization/harmonized_target_biomarkers.csv similarity index 95% rename from src/omop_etl/resources/cohort_normalization/biomarker_normalization.csv rename to src/omop_etl/resources/cohort_normalization/harmonized_target_biomarkers.csv index 2ee31d1..6f80d2b 100644 --- a/src/omop_etl/resources/cohort_normalization/biomarker_normalization.csv +++ b/src/omop_etl/resources/cohort_normalization/harmonized_target_biomarkers.csv @@ -1,4 +1,4 @@ -source_biomarker_name,frequency,normalized_biomarker_name,comment +source_biomarker_name,frequency,harmonized_biomarker_name,comment 4q12 amplicon,1,4q12, 9p24.1 amplicon,1,9p24, A775_G776INSYVMA,1,ERBB2, @@ -33,7 +33,7 @@ C.2313_2324DUP P.(TYR772_ALA775DUP),1,ERBB2, Constitutional mismatch repair deficiency disposition (CMMRD),1,CMMRD, CMMrd,,CMMRD, Double hit in PIK3CA/PTEN,1,PIK3CA, -Drug screen,1,Drug screen,"the trial must elaborate, might be organoids-based drug screening and not target-drug-match. Think we should keep them separated from other cohorts." 
+Drug screen,1,Drug screen,"The trial must elaborate: might be organoid-based drug screening and not target-drug-match. Think we should keep them separated from other cohorts." EBB2 amplification,12,ERBB2 ampl, EGFR amp,2,EGFR ampl, EGFR mut,1,EGFR mut, diff --git a/src/omop_etl/resources/cohort_normalization/harmonized_tumor_types.csv b/src/omop_etl/resources/cohort_normalization/harmonized_tumor_types.csv new file mode 100644 index 0000000..4e13596 --- /dev/null +++ b/src/omop_etl/resources/cohort_normalization/harmonized_tumor_types.csv @@ -0,0 +1,215 @@ +source_tumor_type_name,frequency,harmonized_tumor_type_name,tumor_subtype,general_tumor_type,comment +"79F, METASTATIC CARCINOMA OF ANUS, HER2 AMPLIFIED",1,Anal cancer,,Anal cancer +acinic cell carsinoma,1,Acinic cell carcinoma,Acinic cell carcinoma,Salivary gland cancer +Adenoic cystic carcinoma,1,Adenoid cystic carcinoma,,Adenoid cystic carcinoma +Adrenocortical carcinoma,1,Adrenocortical carcinoma,,Adrenocortical carcinoma +ALK-POSITIVE LARGE B-CELL LYMPHOMA,1,Large B-cell lymphoma,Large B-cell lymphoma,Haematological cancer +ameloblastoma,1,Ameloblastoma,,Ameloblastoma +Ameloblastoma,1,Ameloblastoma,,Ameloblastoma +AML,1,AML,AML,Haematological cancer +anal,1,Anal cancer,,Anal cancer +anal canal squamous cell ca,1,Anal cancer,,Anal cancer +Anal cancer,1,Anal cancer,,Anal cancer +Anal carcinoma,2,Anal cancer,,Anal cancer +ANAPLASTIC ALK-POSITIVE LARGE CELL LYMPHOMA,1,Anaplastic large cell lymphoma,Anaplastic large cell lymphoma,Haematological cancer +Anaplastic large cell lymphoma,2,Anaplastic large cell lymphoma,Anaplastic large cell lymphoma,Haematological cancer +anaplastic thyroid cancer,1,Anaplastic thyroid cancer,Anaplastic thyroid cancer,Thyroid cancer +Apocrine carcinoma,1,Apocrine carcinoma,,Apocrine carcinoma +Astrocytoma,1,Astrocytoma,Astrocytoma,CNS +bile duct & gall bladder,1,Cholangiocarcinoma,,Cholangiocarcinoma +bile duct & gallbladder,5,Cholangiocarcinoma,,Cholangiocarcinoma +bile duct and 
gallbladder,1,Cholangiocarcinoma,,Cholangiocarcinoma +Biliary tract carcinoma,2,Cholangiocarcinoma,,Cholangiocarcinoma +Biliary tract carcinoma/galbladder carcinoma,1,Cholangiocarcinoma,,Cholangiocarcinoma +bladder & urinary tract,6,Urothelial cancer,,Urothelial cancer +Bladder / urinary tract cancer,2,Urothelial cancer,,Urothelial cancer +Bladder cancer,3,Urothelial cancer,,Urothelial cancer +Bladder/Urinary Tract Cancer,2,Urothelial cancer,,Urothelial cancer +brain ependymoma,1,Ependymoma,Ependymoma,CNS +breast,6,Breast cancer,,Breast cancer +Breast cancer,9,Breast cancer,,Breast cancer +Breast Cancer,1,Breast cancer,,Breast cancer +"breast, non-TNBC",1,Breast cancer,,Breast cancer +"breast, TNBC",1,Breast cancer,,Breast cancer +Central Nervous System/Brain,4,CNS,,CNS +cervical cancer,2,Cervical cancer,Cervical cancer,Gynaecological cancer +Cervical cancer,5,Cervical cancer,Cervical cancer,Gynaecological cancer +Cervical Cancer,2,Cervical cancer,Cervical cancer,Gynaecological cancer +cervix,1,Cervical cancer,Cervical cancer,Gynaecological cancer +Cholangio carcinoma,5,Cholangiocarcinoma,,Cholangiocarcinoma +Cholangiocarcicoma,1,Cholangiocarcinoma,,Cholangiocarcinoma +Cholangiocarcinoma,16,Cholangiocarcinoma,,Cholangiocarcinoma +chordoma,1,Chordoma,,Chordoma +CLEAR CELL ADENOCARCINOMA OF THE LEFT URETER,1,Urothelial cancer,,Urothelial cancer +Clear cell odontogenic carcinoma,1,Clear cell odontogenic carcinoma,,Clear cell odontogenic carcinoma +CLL,1,CLL,CLL,Haematological cancer +CNS tumor,1,CNS,,CNS +Colorectal Cancer,5,Colorectal cancer,Colorectal cancer,Intestinal cancer +conjuctival melanoma (eye),1,Conjunctival melanoma,Conjunctival melanoma,Ocular melanoma +CRC,21,Colorectal cancer,Colorectal cancer,Intestinal cancer +CRPC,2,Prostate cancer,,Prostate cancer +CUP,11,CUP,,CUP +Duodenal carcinoma,1,Duodenal carcinoma,Duodenal carcinoma,Intestinal cancer +Eccrine carcinoma,1,Eccrine carcinoma,,Eccrine carcinoma +endometrial,2,Endometrial cancer,Endometrial 
cancer,Gynaecological cancer +Endometrial caccer,1,Endometrial cancer,Endometrial cancer,Gynaecological cancer +Endometrial cancer,6,Endometrial cancer,Endometrial cancer,Gynaecological cancer +Endometrial/Uterine Cancer,2,Endometrial cancer,Endometrial cancer,Gynaecological cancer +endometroid ovary cancer,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Erdheim Chester,1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +Erdheim Chester / Histiocytic,1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +"ERDHEIM CHESTER DISEASE - HISTIOCYTOSIS",1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +"ERDHEIM CHESTER DISEASE (ORBITAL, CARDIAC AND BONE INVOLVEMENT)",1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +"ERDHEIM CHESTER DISEASE, BRAF V600E MUTATION",1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +esophageal,1,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Esophageal,1,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Esophageal cancer,2,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Esophageal_Gastric cancer,1,Esophageal or gastric cancer,Esophageal or gastric cancer,Upper GI-cancer +esophagus,3,Esophageal cancer,Esophageal cancer,Upper GI-cancer +esophagus ca,1,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Fallopian tube cancer,1,Fallopian tube cancer,Fallopian tube cancer,Gynaecological cancer +Fallopian Tube Cancer,1,Fallopian tube cancer,Fallopian tube cancer,Gynaecological cancer +FALLOPIAN TUBE CANCER,1,Fallopian tube cancer,Fallopian tube cancer,Gynaecological cancer +gallbladder ca,1,Cholangiocarcinoma,,Cholangiocarcinoma +gastric,1,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +Gastric cancer,1,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +Gastroesophageal Cancer,3,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +GBM,6,Glioblastoma,Glioblastoma,CNS 
+GEJ,5,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +GEJ carcinoma,1,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +Germ cell cancer,1,Germ cell tumor,,Germ cell tumor +germ cell tumor,2,Germ cell tumor,,Germ cell tumor +gist,1,GIST,,GIST +GIST,2,GIST,,GIST +glioblastoma,1,Glioblastoma,Glioblastoma,CNS +Glioblastoma Multiforme,1,Glioblastoma,Glioblastoma,CNS +glioma (high grade),2,High-grade glioma,High-grade glioma,CNS +Glioneuronal_Neuronal Tumor,1,Glioneuronal tumor,Glioneuronal tumor,CNS +Goblet cell carcinoma,1,Goblet cell carcinoma,Goblet cell carcinoma,Intestinal cancer +Grade 3 glioma,1,Grade 3 glioma,Grade 3 glioma,CNS +grade II glioma,1,Grade 2 glioma,Grade 2 glioma,CNS +grade III glioma,3,Grade 3 glioma,Grade 3 glioma,CNS +Gynaecological tumors,2,Gynaecological cancer,,Gynaecological cancer +Hairy cell leukaemia,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +HAIRY CELL LEUKAEMIA,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +Hairy cell leukemia,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +HAIRY CELL LEUKEMIA,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +HCC,1,Hepatocellular carcinoma,,Hepatocellular carcinoma +head & neck,4,Head and neck cancer,,Head and neck cancer +Head and Neck Cancer,5,Head and neck cancer,,Head and neck cancer +Hepatobiliary Cancer,6,Cholangiocarcinoma,,Cholangiocarcinoma +Hidradenocarcinoma,1,Hidradenocarcinoma,,Hidradenocarcinoma +High grade glioma,11,High-grade glioma,High-grade glioma,CNS +HIGH GRADE GLIOMA,1,High-grade glioma,High-grade glioma,CNS +high grade serose ovary ca,4,Ovarian cancer,Ovarian cancer,Gynaecological cancer +HNnonSCC,1,Head and neck cancer,,Head and neck cancer +HNSCC,3,Head and neck cancer,,Head and neck cancer +intestinal,5,Intestinal cancer,,Intestinal cancer +intestinal adecoca,1,Intestinal cancer,,Intestinal cancer +Kidney Cancer,1,Kidney cancer,,Kidney 
cancer +Langerhans Cell Histiocytosis,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +LANGERHANS CELL HISTIOCYTOSIS,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +LANGERHANS CELL HISTO HISTIOCYTOSIS,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +Langerhans cells histocytosis,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +LGSOC,2,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Low grade glioma,1,Low-grade glioma,Low-grade glioma,CNS +low grade ovarian serose ca,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +lung ca,4,Lung cancer,,Lung cancer +Lymphoma,1,Lymphoma,Lymphoma,Haematological cancer +Malignant Pleural Mesothelioma,1,Mesothelioma,,Mesothelioma +MALIGNANT TUMOUR OF UNKNOWN ORIGIN,1,CUP,,CUP +MDS,2,MDS,MDS,Haematological cancer +Melanocytic tumor,1,Melanocytic tumor,,Melanocytic tumor +melanoma,3,Melanoma,Melanoma,Skin cancer +Melanoma,3,Melanoma,Melanoma,Skin cancer +mesothelioma,1,Mesothelioma,,Mesothelioma +Metastatic adenocarcinoma of left parotid,1,Parotid gland carcinoma,Parotid gland carcinoma,Salivary gland cancer +METASTATIC ADENOCARCINOMA OF LIKELY PANCREATIC ORIGIN,1,Pancreatic cancer,,Pancreatic cancer +METASTATIC AMPULLARY ADENOCARCINOMA,1,Ampullary cancer,Ampullary cancer,Intestinal cancer +METASTATIC ANAL CANCER,1,Anal cancer,,Anal cancer +METASTATIC BASALOID PSEUDO-CRIBIFORM EPITHELOID CARCINOMA,1,Epithelioid carcinoma,,Epithelioid carcinoma +METASTATIC LUNG ADENOCARCINOMA,1,Lung cancer,,Lung cancer +METASTATIC PERITONEAL MESOTHELIOMA,1,Mesothelioma,,Mesothelioma +metastatic ventricular ca,1,Ventricular cancer,Ventricular cancer,CNS +MM,1,Multiple Myeloma,Multiple Myeloma,Haematological cancer +MPM,1,Mesothelioma,,Mesothelioma +Multifocal glioma,1,Glioma,Glioma,CNS +NEC,11,Neuro-endocrine carcinoma,,Neuro-endocrine carcinoma +NEC (gastrointestinal),1,Neuro-endocrine carcinoma,,Neuro-endocrine carcinoma 
+NET,7,Neuro-endocrine tumor,,Neuro-endocrine tumor +Neuroendocrine and Adrenal Tumour,2,Neuro-endocrine tumor,,Neuro-endocrine tumor +NON-HODGKIN'S LYMPHOMA,1,Non-Hodgkin lymphoma,Non-Hodgkin lymphoma,Haematological cancer +NON-LANGERHANS HISTIOCYTIC DISORDER (ERDHEIM-CHESTER DISEASE),1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +Non-Small Cell Lung Cancer,27,NSCLC,NSCLC,Lung cancer +NSCLC,24,NSCLC,NSCLC,Lung cancer +NUD,1,CUP,,CUP +Occult Primary or Cancer of Unknown Primary,2,CUP,,CUP +ORAL CAVITY SQUAMOUS CARCINOMA,1,Head and neck cancer,,Head and neck cancer +ovarian,7,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovarian ca,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovarian cancer,12,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovarian Cancer,2,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovarian mucinous carsinoma,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovario ca,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovario high grade clear cell carcinoma (HGCC),1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovary high grade serose cancer (HGSC),1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovary low grade serous cancer,3,Ovarian cancer,Ovarian cancer,Gynaecological cancer +pancreas,5,Pancreatic cancer,,Pancreatic cancer +pancreatic cancer,2,Pancreatic cancer,,Pancreatic cancer +Pancreatic cancer,8,Pancreatic cancer,,Pancreatic cancer +Pancreatic Cancer,3,Pancreatic cancer,,Pancreatic cancer +Papillary craniofaryngeoma,1,Papillary craniopharyngioma,Papillary craniopharyngioma,CNS +paraganglioma,1,Paraganglioma,Paraganglioma,CNS +penile,2,Penile cancer,,Penile cancer +Penile cancer,1,Penile cancer,,Penile cancer +peritoneal mesothelioma,1,Mesothelioma,,Mesothelioma +PERITONEAL MESOTHELIOMA,1,Mesothelioma,,Mesothelioma +pleural mesothelioma,1,Mesothelioma,,Mesothelioma +Primary brain tumors,2,CNS,,CNS +prostate,4,Prostate cancer,,Prostate cancer +Prostate cancer,10,Prostate 
cancer,,Prostate cancer +Prostate Cancer,4,Prostate cancer,,Prostate cancer +Prostate carcinoma,1,Prostate cancer,,Prostate cancer +PTO,1,CUP,,CUP +RCC,2,Renal cell carcinoma,,Renal cell carcinoma +RELAPSED METASTATIC SALIVARY DUCT CARCINOMA (SDC),1,Salivary duct carcinoma,Salivary duct carcinoma,Salivary gland cancer +Renal Cell Carcinoma,1,Renal cell carcinoma,,Renal cell carcinoma +salivary duct ca,1,Salivary duct carcinoma,Salivary duct carcinoma,Salivary gland cancer +Salivary duct carcinoma,2,Salivary duct carcinoma,Salivary duct carcinoma,Salivary gland cancer +salivary gland,2,Salivary gland cancer,,Salivary gland cancer +Salivary gland adenocarcinoma,1,Salivary gland cancer,,Salivary gland cancer +Salivary gland cancer,7,Salivary gland cancer,,Salivary gland cancer +Salivary gland carcinoma,2,Salivary gland cancer,,Salivary gland cancer +sarcoma,1,Sarcoma,,Sarcoma +Sarcoma,4,Sarcoma,,Sarcoma +SCLC,3,SCLC,SCLC,Lung cancer +sigma adenocarcinoma (2xquick PD),1,Left-sided adenocarcinoma,,Left-sided adenocarcinoma +Skin cancer,2,Skin cancer,,Skin cancer +Small Bowel Carcinoma,3,Small intestine cancer,Small intestine cancer,Intestinal cancer +Small intestine cancer,4,Small intestine cancer,Small intestine cancer,Intestinal cancer +soft tissue sarcoma,1,Soft tissue sarcoma,Soft tissue sarcoma,Sarcoma +stomach adenocarsinoma,1,Gastric cancer,Gastric cancer,Upper GI-cancer +submandibular cland adenocystic carsinoma,1,Adenoid cystic carcinoma,,Adenoid cystic carcinoma +synovial sarcoca,1,Synovial sarcoma,Synovial sarcoma,Sarcoma +thymic,1,Thymoma,,Thymoma +thyroid,1,Thyroid cancer,,Thyroid cancer +thyroid cancer,2,Thyroid cancer,,Thyroid cancer +Thyroid cancer,3,Thyroid cancer,,Thyroid cancer +Thyroid Cancer,4,Thyroid cancer,,Thyroid cancer +Thyroid carcinoma,1,Thyroid cancer,,Thyroid cancer +tongue ca,1,Tongue cancer,,Tongue cancer +Tumor-agnostic,14,Tumor-agnostic,,Tumor-agnostic +Tumor agnostic,12,Tumor-agnostic,,Tumor-agnostic +UCC,1,Urothelial 
cancer,,Urothelial cancer +unknown primary adenoid cystic carsinoma,1,Adenoid cystic carcinoma,,Adenoid cystic carcinoma +Upper GI-tumors,1,Upper GI-cancer,,Upper GI-cancer +Upper rectal adenocarcinoma,1,Colorectal cancer,Colorectal cancer,Intestinal cancer +Urothelial cancer,3,Urothelial cancer,,Urothelial cancer +Uterine cancer,1,Uterine cancer,Uterine cancer,Gynaecological cancer +uterus endometrial adenocarcinoma,1,Endometrial cancer,Endometrial cancer,Gynaecological cancer +Uveal melanoma,2,Uveal melanoma,Uveal melanoma,Ocular melanoma +vaginal adeno carsinoma,2,Vaginal cancer,Vaginal cancer,Gynaecological cancer +Vaginal cancer,1,Vaginal cancer,Vaginal cancer,Gynaecological cancer +Vaginal Cancer,1,Vaginal cancer,Vaginal cancer,Gynaecological cancer +Vulvar Cancer,1,Vulvar cancer,Vulvar cancer,Gynaecological cancer +Vulvarian cancer,1,Vulvar cancer,Vulvar cancer,Gynaecological cancer