diff --git a/main.py b/main.py index 4323338..4faa26c 100644 --- a/main.py +++ b/main.py @@ -62,7 +62,7 @@ def run_pipeline(preprocessing_input: Path, base_root: Path, trial: str = "IMPRE meta=_meta, ) - print(f"Harmonized: {harmonized_result.patients[0:10]}") + # print(f"Harmonized: {harmonized_result.patients[0:10]}") # run semantic mapping semantic_mapper = SemanticService(outdir=base_root, layout=Layout.TRIAL_TIMESTAMP_RUN) @@ -89,7 +89,7 @@ def run_pipeline(preprocessing_input: Path, base_root: Path, trial: str = "IMPRE tables: OmopTables = omop_service.build(harmonized_result.patients) # todo: remove - print(f"built tables: {tables}") + # print(f"built tables: {tables}") # export concept lookup tracking (missed lookups, coverage stats) concept_service.export(formats="csv") diff --git a/src/omop_etl/concept_mapping/core/models.py b/src/omop_etl/concept_mapping/core/models.py index 32f4fc5..4caa323 100644 --- a/src/omop_etl/concept_mapping/core/models.py +++ b/src/omop_etl/concept_mapping/core/models.py @@ -6,7 +6,7 @@ def _norm(v: str | None) -> str: - """Lowercase + strip a CSV value, defaulting None to empty string.""" + """Lowercase and strip a CSV value, defaulting None to empty string.""" return (v or "").lower().strip() diff --git a/src/omop_etl/concept_mapping/service.py b/src/omop_etl/concept_mapping/service.py index ccba401..9db0307 100644 --- a/src/omop_etl/concept_mapping/service.py +++ b/src/omop_etl/concept_mapping/service.py @@ -157,7 +157,7 @@ def lookup_static( validity=c.validity, ) if not _concept_matches_filter(concept, domains, vocabs, validity): - self._result.record_miss("static", value_set, local_value) + # concept mapped but rejected by filter return None self._result.record_match("static", value_set, local_value, concept) @@ -198,7 +198,7 @@ def lookup_structural( validity=c.validity, ) if not _concept_matches_filter(concept, domains, vocabs, validity): - self._result.record_miss("structural", value_set, "") + # concept mapped but 
def _check_natural_key_conflicts(
    objs: list[DomainBase],
    *,
    patient_id: str,
    item_type: type[DomainBase],
    policy: Literal["error", "warn"],
) -> None:
    """
    Detect natural-key collisions where the colliding rows carry differing data.

    The first object seen for each natural key is used as the reference.
    Every later object with the same key is compared against that reference
    over ``item_type.data_fields()``. Objects that equal the reference on
    every field are ignored: identical duplicates (same NK, same data) are
    assumed to be deduplicated upstream by the collection processor.

    This function only reports. It never removes, reorders or replaces
    anything in ``objs``.

    Args:
        objs: Hydrated domain instances for a single patient.
        patient_id: Identifier included in the conflict message.
        item_type: Domain class; supplies the compared field names and the
            class name used in the message.
        policy: ``"error"`` raises on the first conflict found; any other
            value logs a warning for each conflicting object and continues.

    Raises:
        ValueError: If a conflict is found and ``policy == "error"``.
    """
    fields = item_type.data_fields()
    first_seen: dict[tuple, DomainBase] = {}
    for obj in objs:
        nk = obj.natural_key()
        prior = first_seen.get(nk)
        if prior is None:
            first_seen[nk] = obj
            continue

        # Single pass: read each attribute once and keep only the differing pairs.
        diffs = {}
        for f in fields:
            before, after = getattr(prior, f), getattr(obj, f)
            if before != after:
                diffs[f] = (before, after)
        if not diffs:
            # Same key and same data: an identical duplicate, not a conflict.
            continue

        msg = f"{item_type.__name__} natural-key conflict for patient {patient_id}: NK={nk} has conflicting values: {diffs}"
        if policy == "error":
            raise ValueError(msg)
        log.warning(msg)
@@ -187,6 +220,7 @@ def decorator(fn: _F) -> _F: skip_missing_patients=skip_missing_patients, subject_col=subject_col, strict_schema=strict_schema, + on_natural_key_conflict=on_natural_key_conflict, ) setattr(fn, _SPEC_ATTR, spec) return fn @@ -418,6 +452,7 @@ def _run_spec(self, spec: ProcessorSpec) -> None: items_col=spec.items_col, skip_missing_patients=spec.skip_missing_patients, mode=spec.mode, + on_natural_key_conflict=spec.on_natural_key_conflict, ) elif isinstance(spec, SingletonSpec): @@ -595,6 +630,7 @@ def hydrate_collection_field( item_type: type[DomainBase], patients: dict[str, Patient], mode: Literal["replace", "extend"] = "replace", + on_natural_key_conflict: Literal["error", "warn"] = "warn", ) -> None: """ Instantiate collection domain models onto Patient after schema validation. @@ -611,6 +647,9 @@ def hydrate_collection_field( item_type: Target domain class (used to resolve Patient attribute). patients: Map of patient_id to Patient instance. mode: "replace" overwrites, "extend" appends to existing collection. + on_natural_key_conflict: "warn" logs a warning when two instances share a natural key + but differ in other field values; "error" raises ValueError. Identical duplicates + (same NK, same data) are assumed to be deduplicated upstream. 
""" target_attr = Patient.get_attr_for_type(item_type) build = builder or item_type.from_row @@ -627,6 +666,14 @@ def hydrate_collection_field( except Exception as e: raise ValueError(f"{item_type.__name__} collection hydration failed for {sid=}") from e + if item_type.NATURAL_KEY_FIELDS: + _check_natural_key_conflicts( + objs, + patient_id=sid, + item_type=item_type, + policy=on_natural_key_conflict, + ) + if mode == "extend": existing = getattr(patient, target_attr, ()) or () objs = list(existing) + objs diff --git a/src/omop_etl/harmonization/harmonizers/impress.py b/src/omop_etl/harmonization/harmonizers/impress.py index bea4852..cc78fa6 100644 --- a/src/omop_etl/harmonization/harmonizers/impress.py +++ b/src/omop_etl/harmonization/harmonizers/impress.py @@ -1,5 +1,4 @@ import re -from deprecated import deprecated import polars as pl from logging import getLogger @@ -9,12 +8,13 @@ from omop_etl.harmonization.models.domain.best_overall_response import BestOverallResponse from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from omop_etl.harmonization.models.domain.c30 import C30 +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.ecog_baseline import EcogBaseline from omop_etl.harmonization.models.domain.eq5d import EQ5D from omop_etl.harmonization.models.domain.followup import FollowUp from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment import 
TumorAssessment @@ -170,14 +170,18 @@ def _process_number_of_serious_adverse_events(self) -> pl.DataFrame | None: ) return sae_counts - @scalar() - def _process_has_clinical_benefit_at_week_16(self) -> pl.DataFrame | None: + @singleton(ClinicalBenefit) + def _process_clinical_benefit(self) -> pl.DataFrame: """ - Clinical benefit at W16 (visit 3). - Note: If patient has iRecist *and* Recist at same assessment, - iRecist evaluation takes precedence as it's a more specific assessment. + Clinical benefit at W16 at visit 3. + Priority for the answer and its date: iRecist (RA_RAiMODCD) > Recist + (RA_RATIMRESCD) > RNRSP_RNRSPCLCD. iRecist and Recist both date from + RA_EventDate, RNRSP uses RNRSP_EventDate. When no source registers a + benefit, the row is False and the date falls back to whichever V03 + date is available (coalesce RA_EventDate, RNRSP_EventDate). Collapsed + to one row per SubjectId. """ - colname = Patient.Scalars.HAS_CLINICAL_BENEFIT_AT_WEEK_16 + cols = ClinicalBenefit.Fields timepoint = "V03" benefit = ( @@ -185,22 +189,46 @@ def _process_has_clinical_benefit_at_week_16(self) -> pl.DataFrame | None: "SubjectId", "RA_RATIMRESCD", "RA_RAiMODCD", + "RA_EventId", + "RA_EventDate", "RNRSP_RNRSPCLCD", "RNRSP_EventId", - "RA_EventId", + "RNRSP_EventDate", ) .filter(pl.any_horizontal(pl.all().exclude("SubjectId").is_not_null())) .filter((pl.col("RA_EventId") == timepoint) | (pl.col("RNRSP_EventId") == timepoint)) .with_columns( - pl.when(PolarsParsers.to_optional_int64(pl.col("RA_RATIMRESCD")).le(3)) + row_has_benefit=pl.when(PolarsParsers.to_optional_int64(pl.col("RA_RAiMODCD")).le(3)) .then(True) - .when(PolarsParsers.to_optional_int64(pl.col("RA_RAiMODCD")).le(3)) + .when(PolarsParsers.to_optional_int64(pl.col("RA_RATIMRESCD")).le(3)) .then(True) .when(PolarsParsers.to_optional_int64(pl.col("RNRSP_RNRSPCLCD")).le(3)) .then(True) - .otherwise(False) - .alias(colname) + .otherwise(False), + 
row_date=pl.when(PolarsParsers.to_optional_int64(pl.col("RA_RAiMODCD")).le(3)) + .then(PolarsParsers.to_optional_date("RA_EventDate")) + .when(PolarsParsers.to_optional_int64(pl.col("RA_RATIMRESCD")).le(3)) + .then(PolarsParsers.to_optional_date("RA_EventDate")) + .when(PolarsParsers.to_optional_int64(pl.col("RNRSP_RNRSPCLCD")).le(3)) + .then(PolarsParsers.to_optional_date("RNRSP_EventDate")) + .otherwise( + pl.coalesce( + PolarsParsers.to_optional_date("RA_EventDate"), + PolarsParsers.to_optional_date("RNRSP_EventDate"), + ) + ), + ) + .group_by("SubjectId") + .agg( + pl.col("row_has_benefit").any().alias(cols.HAS_BENEFIT), + pl.col("row_date").filter(pl.col("row_has_benefit")).first().alias("date_from_benefit"), + pl.col("row_date").first().alias("date_fallback"), ) + .with_columns( + pl.coalesce("date_from_benefit", "date_fallback").alias(cols.DATE), + pl.lit(16, dtype=pl.Int64).alias(cols.WEEK), + ) + .select("SubjectId", cols.WEEK, cols.HAS_BENEFIT, cols.DATE) ) return benefit @@ -221,24 +249,18 @@ def _process_end_of_treatment_reason(self) -> pl.DataFrame | None: def _process_evaluable_for_efficacy_analysis(self) -> pl.DataFrame | None: """ Filtering criteria: - Any patient having valid treatment for sufficient length (21 days IV, 28 days oral). - For IV cycles, the cycle end is modeled as the day before the next cycles start. - Inclusive length = next_start − start days. Length ≥ 21 qualifies. - For oral cycles, length = stop − start days; ≥ 28 qualifies. + Any patient having valid treatment for sufficient length (21 days IV, 28 days oral). + For IV cycles, the cycle end is modeled as the day before the next cycles start. + Inclusive length = next_start − start days. Length ≥ 21 qualifies. + For oral cycles, length = stop − start days; ≥ 28 qualifies. For subjects with oral drugs, the start and end date per cycle is checked directly. 
- If a subject has any cycle lasting 28 days or more they are marked as having sufficient treatment length + If a subject has any cycle lasting 28 days or more they are marked as having sufficient treatment length For subjects without oral drugs, cycle stop date is set to start date of next cycle and needs to last 21 days or more. - Note: this means subjects with just one cycle are marked as non-evaluable since cycle end cannot be determined. - each cycle is grouped by treatment number, any treatment having a cycle with sufficient length marks subject as evaluable. - assumes no malformed dates, because imputing would change the length. - - Old filteing criteria: - Patients marked as evaluable for efficacy analysis needs to have: - - sufficient treatment length for any cycle (21 days for IV, 28 days for oral) and *either one of*: - - tumor assessment after week 4 (patient has any tumor assessment with EventId==V04 in RA, RCNT, RTNTMNT, RNRSP) - - clinical assessment (patient has stopped treatment: EventDate from EOT sheet) + Note: this means subjects with just one cycle are marked as non-evaluable since cycle end cannot be determined. + each cycle is grouped by treatment number, any treatment having a cycle with sufficient length marks subject as evaluable. + assumes no malformed dates, because imputing would change the length. 
""" colname = Patient.Scalars.EVALUABLE_FOR_EFFICACY_ANALYSIS evaluability_data = self.data.select( @@ -248,16 +270,6 @@ def _process_evaluable_for_efficacy_analysis(self) -> pl.DataFrame | None: "TR_TRTNO", "TR_TRC1_DT", "TR_TRCYNCD", - # not currently used: - # "RA_EventDate", - # "RA_EventId", - # "RNRSP_EventDate", - # "RNRSP_EventId", - # "RCNT_EventDate", - # "RCNT_EventId", - # "RNTMNT_EventDate", - # "RNTMNT_EventId", - # "EOT_EventDate", ) def oral_treatment_lengths() -> pl.DataFrame: @@ -306,35 +318,6 @@ def iv_treatment_lengths() -> pl.DataFrame: return iv_sufficient_treatment_length - @deprecated - def eot_filter() -> pl.DataFrame: - has_ended_treatment = evaluability_data.group_by("SubjectId").agg( - pl.any_horizontal(PolarsParsers.to_optional_utf8(pl.col(["EOT_EventDate"])).str.len_bytes() > 0).any().alias("has_clinical_assessment"), - ) - return has_ended_treatment - - @deprecated - def tumor_assessment() -> pl.DataFrame: - # need to add V04 filter (if this is to be used again) - has_tumor_assessment_week_4 = evaluability_data.group_by("SubjectId").agg( - pl.any_horizontal( - PolarsParsers.to_optional_utf8( - pl.col( - [ - "RA_EventDate", - "RNRSP_EventDate", - "RCNT_EventDate", - "RNTMNT_EventDate", - ], - ), - ).str.len_bytes() - > 0, - ) - .any() - .alias("has_tumor_assessment"), - ) - return has_tumor_assessment_week_4 - def _merge_evaluability() -> pl.DataFrame: base = evaluability_data.select("SubjectId").unique() _merged_df: pl.DataFrame = ( @@ -534,6 +517,7 @@ def _process_study_drugs(self) -> pl.DataFrame: s2cd=PolarsParsers.to_optional_int64(pl.col("COH_COHALLO2__2CD")), s3=PolarsParsers.to_optional_utf8(pl.col("COH_COHALLO2__3")).str.strip_chars(), s3cd=PolarsParsers.to_optional_int64(pl.col("COH_COHALLO2__3CD")), + date=PolarsParsers.to_optional_date(pl.col("COH_EventDate")), ) # require at least one present .filter( @@ -584,7 +568,12 @@ def _process_study_drugs(self) -> pl.DataFrame: .sort("_row") .unique(subset=["SubjectId"], 
keep="last") .select( - "SubjectId", cols.PRIMARY_TREATMENT_DRUG, cols.PRIMARY_TREATMENT_DRUG_CODE, cols.SECONDARY_TREATMENT_DRUG, cols.SECONDARY_TREATMENT_DRUG_CODE + "SubjectId", + cols.PRIMARY_TREATMENT_DRUG, + cols.PRIMARY_TREATMENT_DRUG_CODE, + cols.SECONDARY_TREATMENT_DRUG, + cols.SECONDARY_TREATMENT_DRUG_CODE, + cols.DATE, ) ) @@ -719,9 +708,9 @@ def merge_medical_history(base: pl.DataFrame, processed: pl.DataFrame) -> pl.Dat return merged - @collection(PreviousTreatments, order_by=("start_date",), require_order_by=True) + @collection(PreviousTreatment, order_by=("start_date",), require_order_by=True) def _process_previous_treatments(self) -> pl.DataFrame | None: - cols = PreviousTreatments.Fields + cols = PreviousTreatment.Fields ct_base = self.data.select( "SubjectId", "CT_CTTYPE", @@ -1024,7 +1013,11 @@ def filter_concomitant_data(frame: pl.DataFrame) -> pl.DataFrame: return filtered - @collection(AdverseEvent, order_by=("start_date",), require_order_by=True) + @collection( + AdverseEvent, + order_by=("start_date", "sequence_id"), + require_order_by=True, + ) def _process_adverse_events(self) -> pl.DataFrame | None: cols = AdverseEvent.Fields ae_base = self.data.select( @@ -1044,6 +1037,7 @@ def _process_adverse_events(self) -> pl.DataFrame | None: "AE_AESERCD", "AE_SAEEXP1CD", "AE_SAEEXP2CD", + "AE_AESPID", "FU_FUPDEDAT", "TR_TRNAME", "TR_TRTNO", @@ -1054,6 +1048,7 @@ def parse_events(frame: pl.DataFrame) -> pl.DataFrame: PolarsParsers.to_optional_date(pl.col("AE_AESTDAT")).alias(cols.START_DATE), PolarsParsers.to_optional_date(pl.col("AE_AEENDAT")).alias(cols.END_DATE), PolarsParsers.to_optional_date(pl.col("AE_SAESTDAT")).alias(cols.TURNED_SERIOUS_DATE), + PolarsParsers.to_optional_int64(pl.col("AE_AESPID")).alias(cols.SEQUENCE_ID), PolarsParsers.int_to_bool( true_int=1, false_int=0, @@ -1137,6 +1132,7 @@ def coerce(frame: pl.DataFrame) -> pl.DataFrame: cols.TREATMENT_2_NAME, cols.WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_1, 
cols.WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_2, + cols.SEQUENCE_ID, ) ) diff --git a/src/omop_etl/harmonization/models/domain/adverse_event.py b/src/omop_etl/harmonization/models/domain/adverse_event.py index b22100f..8ee9e05 100644 --- a/src/omop_etl/harmonization/models/domain/adverse_event.py +++ b/src/omop_etl/harmonization/models/domain/adverse_event.py @@ -19,6 +19,7 @@ class Fields: OUTCOME = "outcome" START_DATE = "start_date" END_DATE = "end_date" + SEQUENCE_ID = "sequence_id" WAS_SERIOUS = "was_serious" TURNED_SERIOUS_DATE = "turned_serious_date" RELATED_TO_TREATMENT_1_STATUS = "related_to_treatment_1_status" @@ -29,6 +30,7 @@ class Fields: WAS_SERIOUS_GRADE_EXPECTED_TREATMENT_2 = "was_serious_grade_expected_treatment_2" INVARIANT_FIELDS = (Fields.TERM,) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID) def __init__(self, patient_id: str): self._patient_id = patient_id @@ -37,6 +39,7 @@ def __init__(self, patient_id: str): self._outcome: str | None = None self._start_date: dt.date | None = None self._end_date: dt.date | None = None + self._sequence_id: int | None = None self._was_serious: bool | None = None self._turned_serious_date: dt.date | None = None self._related_to_treatment_1_status: RelatedStatus | None = None @@ -111,6 +114,18 @@ def end_date(self, value: dt.date | None) -> None: validator=StrictValidators.validate_optional_date, ) + @property + def sequence_id(self) -> int | None: + return self._sequence_id + + @sequence_id.setter + def sequence_id(self, value: int | None) -> None: + self._set_validated_prop( + prop=self.__class__.sequence_id, + value=value, + validator=StrictValidators.validate_optional_int, + ) + @property def was_serious(self) -> bool | None: return self._was_serious @@ -218,6 +233,7 @@ def __repr__(self, delim=",") -> str: f"outcome={self.outcome!r}{delim} " f"start_date={self.start_date!r}{delim} " f"end_date={self.end_date!r}{delim} " + f"sequence_id={self.sequence_id!r}{delim}" 
f"was_serious={self.was_serious!r}{delim} " f"turned_serious_date={self.turned_serious_date!r}{delim} " f"related_to_treatment_1_status={self.related_to_treatment_1_status!r}{delim} " diff --git a/src/omop_etl/harmonization/models/domain/base.py b/src/omop_etl/harmonization/models/domain/base.py index 5a07e25..ce35317 100644 --- a/src/omop_etl/harmonization/models/domain/base.py +++ b/src/omop_etl/harmonization/models/domain/base.py @@ -9,18 +9,20 @@ class DomainBase(TrackedValidated, ABC): Base class for all domain models with schema contract support. Subclasses must define: - - `class Fields:` with string constants for canonical field names (wire schema) + - `class Fields:` with string constants for canonical field names (schema from processor to domain) Subclasses may optionally define: - - INVARIANT_FIELDS` tuple referencing Fields constants for materiality (the domains' invariants) filtering + - `INVARIANT_FIELDS` tuple referencing Fields constants for materiality (the domains' invariants) filtering + - `NATURAL_KEY_FIELDS` tuple referencing Fields that make up the natural key for the domain subclass """ # internal cache, use data_fields() method to access _data_fields: ClassVar[tuple[str, ...] 
def natural_key(self) -> tuple:
    """Return the current values of the ``NATURAL_KEY_FIELDS`` attributes, in declared order."""
    values = [getattr(self, name) for name in self.NATURAL_KEY_FIELDS]
    return tuple(values)


def invariant_fields(self) -> tuple:
    """Return the current values of the ``INVARIANT_FIELDS`` attributes, in declared order."""
    values = [getattr(self, name) for name in self.INVARIANT_FIELDS]
    return tuple(values)


def sort_key(self) -> tuple:
    """None-safe sort key derived from natural_key, None values sort last."""
    pairs = []
    for value in self.natural_key():
        # The leading flag decides the ordering whenever one side is None,
        # so a None is never compared against a real value.
        pairs.append((value is None, value))
    return tuple(pairs)
a/src/omop_etl/harmonization/models/domain/best_overall_response.py +++ b/src/omop_etl/harmonization/models/domain/best_overall_response.py @@ -11,8 +11,6 @@ class Fields: CODE = "code" DATE = "date" - INVARIANT_FIELDS = (Fields.RESPONSE,) - def __init__(self, patient_id: str): self._patient_id = patient_id self._response: str | None = None @@ -20,6 +18,9 @@ def __init__(self, patient_id: str): self._date: dt.date | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.RESPONSE,) + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/biomarkers.py b/src/omop_etl/harmonization/models/domain/biomarkers.py index ca8ec50..0e212cd 100644 --- a/src/omop_etl/harmonization/models/domain/biomarkers.py +++ b/src/omop_etl/harmonization/models/domain/biomarkers.py @@ -22,6 +22,8 @@ def __init__(self, patient_id: str): self._date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def gene_and_mutation(self) -> str | None: return self._gene_and_mutation diff --git a/src/omop_etl/harmonization/models/domain/c30.py b/src/omop_etl/harmonization/models/domain/c30.py index e7d1747..7c28bb3 100644 --- a/src/omop_etl/harmonization/models/domain/c30.py +++ b/src/omop_etl/harmonization/models/domain/c30.py @@ -80,6 +80,8 @@ def __init__(self, patient_id: str): self._event_name: str | None = None # question fields default to None + NATURAL_KEY_FIELDS = (Fields.EVENT_NAME, Fields.DATE) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/clinical_benefit.py b/src/omop_etl/harmonization/models/domain/clinical_benefit.py new file mode 100644 index 0000000..be87f7d --- /dev/null +++ b/src/omop_etl/harmonization/models/domain/clinical_benefit.py @@ -0,0 +1,71 @@ +from typing import Set +import datetime as dt + +from 
class ClinicalBenefit(DomainBase):
    """
    Clinical benefit assessment at a source-specific timepoint.

    Each patient has at most one instance. The timepoint varies by trial
    (e.g. IMPRESS uses W16, other sources may use W24), so ``week`` is
    recorded explicitly and downstream consumers can filter by timepoint
    without joining trial metadata.
    """

    class Fields:
        WEEK = "week"
        HAS_BENEFIT = "has_benefit"
        DATE = "date"

    def __init__(self, patient_id: str):
        self._patient_id = patient_id
        self._week: int | None = None
        self._has_benefit: bool | None = None
        self._date: dt.date | None = None
        self.updated_fields: Set[str] = set()

    # The assessment date is the natural key for this domain.
    NATURAL_KEY_FIELDS = (Fields.DATE,)

    @property
    def patient_id(self) -> str:
        return self._patient_id

    @property
    def week(self) -> int | None:
        """Week of the assessment timepoint (e.g. 16 for W16)."""
        return self._week

    @week.setter
    def week(self, value: int | None) -> None:
        self._set_validated_prop(
            prop=self.__class__.week,
            value=value,
            validator=StrictValidators.validate_optional_int,
        )

    @property
    def has_benefit(self) -> bool | None:
        """Whether clinical benefit was registered at the timepoint."""
        return self._has_benefit

    @has_benefit.setter
    def has_benefit(self, value: bool | None) -> None:
        self._set_validated_prop(
            prop=self.__class__.has_benefit,
            value=value,
            validator=StrictValidators.validate_optional_bool,
        )

    @property
    def date(self) -> dt.date | None:
        """Date associated with the assessment."""
        return self._date

    @date.setter
    def date(self, value: dt.date | None) -> None:
        self._set_validated_prop(
            prop=self.__class__.date,
            value=value,
            validator=StrictValidators.validate_optional_date,
        )

    def __repr__(self, delim=",") -> str:
        # Every field is separated by "<delim> ", matching the other domain reprs.
        return f"{self.__class__.__name__}(week={self.week!r}{delim} has_benefit={self.has_benefit!r}{delim} date={self.date!r})"
b/src/omop_etl/harmonization/models/domain/concomitant_medication.py index 9f2077d..f515e0f 100644 --- a/src/omop_etl/harmonization/models/domain/concomitant_medication.py +++ b/src/omop_etl/harmonization/models/domain/concomitant_medication.py @@ -28,6 +28,9 @@ def __init__(self, patient_id: str): self._sequence_id: int | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.MEDICATION_NAME,) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/ecog_baseline.py b/src/omop_etl/harmonization/models/domain/ecog_baseline.py index 759e35b..1ef089e 100644 --- a/src/omop_etl/harmonization/models/domain/ecog_baseline.py +++ b/src/omop_etl/harmonization/models/domain/ecog_baseline.py @@ -18,6 +18,8 @@ def __init__(self, patient_id: str): self._date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/eq5d.py b/src/omop_etl/harmonization/models/domain/eq5d.py index 65f69f1..3c674e3 100644 --- a/src/omop_etl/harmonization/models/domain/eq5d.py +++ b/src/omop_etl/harmonization/models/domain/eq5d.py @@ -31,6 +31,8 @@ def __init__(self, patient_id: str): self._event_name: str | None = None self._qol_metric: int | None = None + NATURAL_KEY_FIELDS = (Fields.EVENT_NAME, Fields.DATE) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/followup.py b/src/omop_etl/harmonization/models/domain/followup.py index 7ba1aea..2e25d8c 100644 --- a/src/omop_etl/harmonization/models/domain/followup.py +++ b/src/omop_etl/harmonization/models/domain/followup.py @@ -16,6 +16,8 @@ def __init__(self, patient_id: str): self._date_lost_to_followup: dt.datetime | None = None self.updated_fields: Set[str] = set() + 
NATURAL_KEY_FIELDS = (Fields.DATE_LOST_TO_FOLLOWUP,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/medical_history.py b/src/omop_etl/harmonization/models/domain/medical_history.py index 3350c09..ce98550 100644 --- a/src/omop_etl/harmonization/models/domain/medical_history.py +++ b/src/omop_etl/harmonization/models/domain/medical_history.py @@ -17,13 +17,16 @@ class Fields: def __init__(self, patient_id: str): self._patient_id = patient_id self._term: str | None = None - self._sequence_id: int | None = None self._start_date: dt.date | None = None self._end_date: dt.date | None = None + self._sequence_id: int | None = None self._status: str | None = None self._status_code: int | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.TERM,) + NATURAL_KEY_FIELDS = (Fields.START_DATE, Fields.SEQUENCE_ID) + @property def patient_id(self) -> str: return self._patient_id @@ -107,6 +110,7 @@ def __repr__(self, delim=",") -> str: f"seq={self.sequence_id!r}{delim} " f"start={self.start_date!r}{delim} " f"end={self.end_date!r}{delim} " + f"sequence_id={self.sequence_id!r}{delim} " f"status={self.status!r}{delim} " f"code={self.status_code!r})" ) diff --git a/src/omop_etl/harmonization/models/domain/previous_treatments.py b/src/omop_etl/harmonization/models/domain/previous_treatments.py index c9131f4..fb6958c 100644 --- a/src/omop_etl/harmonization/models/domain/previous_treatments.py +++ b/src/omop_etl/harmonization/models/domain/previous_treatments.py @@ -5,7 +5,7 @@ from omop_etl.harmonization.models.domain.base import DomainBase -class PreviousTreatments(DomainBase): +class PreviousTreatment(DomainBase): class Fields: TREATMENT = "treatment" TREATMENT_CODE = "treatment_code" @@ -24,6 +24,9 @@ def __init__(self, patient_id: str): self._additional_treatment: str | None = None self.updated_fields: Set[str] = set() + INVARIANT_FIELDS = (Fields.TREATMENT,) + NATURAL_KEY_FIELDS = 
def __repr__(self, delim=","):
    """
    Render the study-drug fields as ``ClassName(field=value<delim> field=value ...)``.

    Args:
        delim: Separator placed between fields; each separator is followed by
            a single space.

    Returns:
        One-line string containing the ``repr`` of every data field.
    """
    return (
        f"{self.__class__.__name__}("
        f"primary_treatment_drug={self.primary_treatment_drug!r}{delim} "
        f"primary_treatment_drug_code={self.primary_treatment_drug_code!r}{delim} "
        f"secondary_treatment_drug={self.secondary_treatment_drug!r}{delim} "
        # The trailing space was missing here, so "date" was glued to the previous field.
        f"secondary_treatment_drug_code={self.secondary_treatment_drug_code!r}{delim} "
        f"date={self.date!r}"
        f")"
    )
patient_id: str): self._off_target_lesion_measurement_date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.ASSESSMENT_DATE,) + @property def patient_id(self) -> str: return self._patient_id diff --git a/src/omop_etl/harmonization/models/domain/tumor_type.py b/src/omop_etl/harmonization/models/domain/tumor_type.py index 6606237..d74378e 100644 --- a/src/omop_etl/harmonization/models/domain/tumor_type.py +++ b/src/omop_etl/harmonization/models/domain/tumor_type.py @@ -27,6 +27,8 @@ def __init__(self, patient_id: str): self._date: dt.date | None = None self.updated_fields: Set[str] = set() + NATURAL_KEY_FIELDS = (Fields.DATE,) + @property def icd10_code(self) -> str | None: return self._icd10_code diff --git a/src/omop_etl/harmonization/models/patient.py b/src/omop_etl/harmonization/models/patient.py index 1034bbf..967a4e5 100644 --- a/src/omop_etl/harmonization/models/patient.py +++ b/src/omop_etl/harmonization/models/patient.py @@ -5,16 +5,18 @@ from omop_etl.harmonization.core.validators import StrictValidators from omop_etl.harmonization.core.track_validated import TrackedValidated, setter_name +from omop_etl.harmonization.models.domain.base import DomainBase from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent from omop_etl.harmonization.models.domain.best_overall_response import BestOverallResponse from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from omop_etl.harmonization.models.domain.c30 import C30 +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.ecog_baseline import EcogBaseline from omop_etl.harmonization.models.domain.eq5d import EQ5D from omop_etl.harmonization.models.domain.followup import FollowUp from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from 
omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment import TumorAssessment @@ -22,7 +24,7 @@ from omop_etl.harmonization.models.domain.tumor_type import TumorType log = getLogger(__name__) -T = TypeVar("T") +T = TypeVar("T", bound=DomainBase) class Patient(TrackedValidated): @@ -44,7 +46,6 @@ class Scalars: HAS_ANY_ADVERSE_EVENTS = "has_any_adverse_events" NUMBER_OF_ADVERSE_EVENTS = "number_of_adverse_events" NUMBER_OF_SERIOUS_ADVERSE_EVENTS = "number_of_serious_adverse_events" - HAS_CLINICAL_BENEFIT_AT_WEEK_16 = "has_clinical_benefit_at_week_16" END_OF_TREATMENT_REASON = "end_of_treatment_reason" END_OF_TREATMENT_DATE = "end_of_treatment_date" @@ -52,6 +53,7 @@ class Singletons: TUMOR_TYPE = "tumor_type" STUDY_DRUGS = "study_drugs" BIOMARKERS = "biomarkers" + CLINICAL_BENEFIT = "clinical_benefit" LOST_TO_FOLLOWUP = "lost_to_followup" ECOG_BASELINE = "ecog_baseline" TUMOR_ASSESSMENT_BASELINE = "tumor_assessment_baseline" @@ -84,7 +86,6 @@ def __init__(self, patient_id: str, trial_id: str): self._has_any_adverse_events: bool | None = None self._number_of_adverse_events: int | None = None self._number_of_serious_adverse_events: int | None = None - self._has_clinical_benefit_at_week_16: bool | None = None self._end_of_treatment_reason: str | None = None self._end_of_treatment_date: dt.date | None = None @@ -92,6 +93,7 @@ def __init__(self, patient_id: str, trial_id: str): self._tumor_type: TumorType | None = None self._study_drugs: StudyDrugs | None = None self._biomarkers: Biomarkers | None = None + self._clinical_benefit: ClinicalBenefit | None = None self._lost_to_followup: FollowUp | None = None self._ecog_baseline: 
EcogBaseline | None = None self._tumor_assessment_baseline: TumorAssessmentBaseline | None = None @@ -99,7 +101,7 @@ def __init__(self, patient_id: str, trial_id: str): # collections self._medical_histories: tuple[MedicalHistory, ...] = () - self._previous_treatments: tuple[PreviousTreatments, ...] = () + self._previous_treatments: tuple[PreviousTreatment, ...] = () self._treatment_cycles: tuple[TreatmentCycleComponent, ...] = () self._concomitant_medications: tuple[ConcomitantMedication, ...] = () self._adverse_events: tuple[AdverseEvent, ...] = () @@ -264,18 +266,6 @@ def number_of_serious_adverse_events(self, value: int | None) -> None: validator=StrictValidators.validate_optional_int, ) - @property - def has_clinical_benefit_at_week_16(self) -> bool | None: - return self._has_clinical_benefit_at_week_16 - - @has_clinical_benefit_at_week_16.setter - def has_clinical_benefit_at_week_16(self, value: bool | None) -> None: - self._set_validated_prop( - prop=self.__class__.has_clinical_benefit_at_week_16, - value=value, - validator=StrictValidators.validate_optional_bool, - ) - @property def end_of_treatment_reason(self) -> str | None: return self._end_of_treatment_reason @@ -343,6 +333,20 @@ def biomarkers(self, value: Biomarkers | None) -> None: ) self.updated_fields.add(Biomarkers.__name__) + @property + def clinical_benefit(self) -> ClinicalBenefit | None: + return self._clinical_benefit + + @clinical_benefit.setter + def clinical_benefit(self, value: ClinicalBenefit | None) -> None: + self._clinical_benefit = self.validate_singleton( + value, + item_type=ClinicalBenefit, + patient_id=self._patient_id, + field_name=setter_name(self.__class__.clinical_benefit), + ) + self.updated_fields.add(ClinicalBenefit.__name__) + @property def lost_to_followup(self) -> FollowUp | None: return self._lost_to_followup @@ -412,13 +416,13 @@ def medical_histories(self, value: Sequence[MedicalHistory] | None) -> None: 
self.updated_fields.add(setter_name(self.__class__.medical_histories)) @property - def previous_treatments(self) -> tuple[PreviousTreatments, ...]: + def previous_treatments(self) -> tuple[PreviousTreatment, ...]: return self._previous_treatments @previous_treatments.setter - def previous_treatments(self, value: Sequence[PreviousTreatments] | None) -> None: + def previous_treatments(self, value: Sequence[PreviousTreatment] | None) -> None: self._previous_treatments = self.validate_collection( - value, item_type=PreviousTreatments, patient_id=self._patient_id, field_name=setter_name(self.__class__.previous_treatments) + value, item_type=PreviousTreatment, patient_id=self._patient_id, field_name=setter_name(self.__class__.previous_treatments) ) self.updated_fields.add(setter_name(self.__class__.previous_treatments)) @@ -565,6 +569,8 @@ def validate_collection( if existing != patient_id: raise ValueError(f"{field_name}: mismatched patient_id {existing!r} != {patient_id!r}") + # sort by domain natural_key so collections are deterministically ordered on assignment + items.sort(key=lambda x: x.sort_key()) return tuple(items) @classmethod @@ -653,7 +659,6 @@ def __repr__(self): f"number_of_serious_adverse_events={self.number_of_serious_adverse_events}{delim} " f"evaluable_for_efficacy_analysis={self.evaluable_for_efficacy_analysis}{delim} " f"treatment_start_date={self.treatment_start_date}{delim} " - f"has_clinical_benefit_at_week16={self.has_clinical_benefit_at_week_16}{delim} " f"end_of_treatment_reason={self.end_of_treatment_reason}{delim} " f"end_of_treatment_date={self.end_of_treatment_date}{delim} " # singletons @@ -661,6 +666,7 @@ def __repr__(self): f"tumor_assessment_baseline={self.tumor_assessment_baseline}{delim} " f"biomarkers={self.biomarkers}{delim} " f"ecog={self.ecog_baseline}{delim} " + f"clinical_benefit={self.clinical_benefit}{delim} " f"lost_to_followup={self.lost_to_followup}{delim} " f"best_overall_response={self.best_overall_response}{delim} " # 
collections diff --git a/src/omop_etl/omop/builders/base.py b/src/omop_etl/omop/builders/base.py index c38a1b0..9c158a8 100644 --- a/src/omop_etl/omop/builders/base.py +++ b/src/omop_etl/omop/builders/base.py @@ -20,6 +20,8 @@ class BuildContext: patient: Patient person_id: int visit_id_by_date: dict[dt.date, int] = field(default_factory=dict) + condition_id_by_ae_sequence_id: dict[int, int] = field(default_factory=dict) + condition_id_primary_cancer: int | None = None class OmopBuilder(ABC, Generic[T]): @@ -67,7 +69,7 @@ def build_and_populate(self, ctx: BuildContext) -> list[T]: self.populate_context(rows, ctx) return rows - def generate_row_id(self, *key_parts: str | None) -> int: + def generate_row_id(self, *key_parts: int | str | float | dt.date | None) -> int: """ Deterministic row ID from key parts, using SHA1 hashing with builder's namespace to create a reproducible 63-bit integer ID. diff --git a/src/omop_etl/omop/builders/condition_occurrence.py b/src/omop_etl/omop/builders/condition_occurrence.py index 0119399..988b352 100644 --- a/src/omop_etl/omop/builders/condition_occurrence.py +++ b/src/omop_etl/omop/builders/condition_occurrence.py @@ -1,6 +1,7 @@ from typing import ClassVar from logging import getLogger +from omop_etl.concept_mapping.service import ConceptLookupService from omop_etl.harmonization.models.patient import Patient from omop_etl.harmonization.models.domain.tumor_type import TumorType from omop_etl.harmonization.models.domain.medical_history import MedicalHistory @@ -22,7 +23,14 @@ class ConditionOccurrenceBuilder(OmopBuilder[ConditionOccurrenceRow]): table_name: ClassVar[str] = "condition_occurrence" + def __init__(self, concepts: ConceptLookupService): + super().__init__(concepts) + self._ae_to_condition_id: dict[int, int] = {} + self._primary_cancer_condition_id: int | None = None + def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: + self._ae_to_condition_id = {} + self._primary_cancer_condition_id = None patient = 
ctx.patient person_id = ctx.person_id rows: list[ConditionOccurrenceRow] = [] @@ -30,7 +38,12 @@ def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: condition_type_concept_id = int(ecrf.concept_id) if ecrf else 0 if patient.tumor_type is not None: - rows.extend(self._build_tumor_type_rows(patient, person_id, patient.tumor_type, condition_type_concept_id)) + tumor_rows = self._build_tumor_type_rows(patient, person_id, patient.tumor_type, condition_type_concept_id) + if tumor_rows: + # multi-concept tumor mappings produce multiple rows: pick the + # first deterministically (collection already sorted by NK). + self._primary_cancer_condition_id = tumor_rows[0].condition_occurrence_id + rows.extend(tumor_rows) for idx, mh in enumerate(patient.medical_histories): rows.extend(self._build_medical_history_rows(patient, person_id, mh, idx, condition_type_concept_id)) @@ -40,6 +53,18 @@ def build(self, ctx: BuildContext) -> list[ConditionOccurrenceRow]: return rows + def populate_context(self, rows: list[ConditionOccurrenceRow], ctx: BuildContext) -> None: + """ + Publish two pieces of cross-builder state: + - condition_id_by_ae_sequence_id: AE.sequence_id: condition_occurrence_id, + for ObservationBuilder's was_serious / turned_serious_date FK linkage. + - condition_id_primary_cancer: condition_occurrence_id of the tumor-type + row, for MeasurementBuilder's measurement_event_id linkage on lesion-size + and biomarker rows (per oncology CDM guideline). 
+ """ + ctx.condition_id_by_ae_sequence_id.update(self._ae_to_condition_id) + ctx.condition_id_primary_cancer = self._primary_cancer_condition_id + def _build_tumor_type_rows( self, patient: Patient, @@ -58,6 +83,7 @@ def _build_tumor_type_rows( domains={OmopDomain.CONDITION}, ) source_value = tumor.icd10_code + elif tumor.main_tumor_type: matches = self.concepts.lookup_semantic( patient.patient_id, @@ -66,6 +92,7 @@ def _build_tumor_type_rows( domains={OmopDomain.CONDITION}, ) source_value = tumor.main_tumor_type + else: log.warning("Skipping tumor type for %s: no icd10_code or main_tumor_type", patient.patient_id) return [] @@ -83,10 +110,11 @@ def _build_tumor_type_rows( condition_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Singletons.TUMOR_TYPE, - str(concept.concept_id), + *tumor.natural_key(), + concept.concept_id, ), person_id=person_id, - condition_concept_id=int(concept.concept_id), + condition_concept_id=concept.concept_id, condition_start_date=date, condition_type_concept_id=condition_type_concept_id, condition_source_value=source_value, @@ -107,6 +135,11 @@ def _build_medical_history_rows( log.warning("Skipping medical history %d for %s: missing start_date", index, patient.patient_id) return [] + sequence_id = mh.sequence_id if mh.sequence_id else None + if not sequence_id: + log.warning("Skipping medical history for %s: missing sequence_id", patient.patient_id) + return [] + matches = self.concepts.lookup_semantic( patient.patient_id, (Patient.Collections.MEDICAL_HISTORIES, MedicalHistory.Fields.TERM), @@ -121,11 +154,11 @@ def _build_medical_history_rows( condition_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.MEDICAL_HISTORIES, - str(mh.sequence_id), - str(concept.concept_id), + *mh.natural_key(), + concept.concept_id, ), person_id=person_id, - condition_concept_id=int(concept.concept_id), + condition_concept_id=concept.concept_id, condition_start_date=start_date, condition_end_date=mh.end_date, 
condition_type_concept_id=condition_type_concept_id, @@ -144,10 +177,19 @@ def _build_adverse_event_rows( ) -> list[ConditionOccurrenceRow]: start_date = ae.start_date term = ae.term + if start_date is None: log.warning("Skipping adverse event %d for %s: missing start_date", index, patient.patient_id) return [] + sequence_id = ae.sequence_id + if sequence_id is None: + log.warning( + "Adverse event %d for %s is missing sequence_id, emitting row but no FK link will be published for observation_event_id", + index, + patient.patient_id, + ) + matches = self.concepts.lookup_semantic( patient.patient_id, (Patient.Collections.ADVERSE_EVENTS, AdverseEvent.Fields.TERM), @@ -157,17 +199,16 @@ def _build_adverse_event_rows( if not matches: return [] - return [ + ae_rows = [ ConditionOccurrenceRow( condition_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.ADVERSE_EVENTS, - term, - start_date.strftime(format="%Y%m%d"), - str(concept.concept_id), + *ae.natural_key(), + concept.concept_id, ), person_id=person_id, - condition_concept_id=int(concept.concept_id), + condition_concept_id=concept.concept_id, condition_start_date=start_date, condition_end_date=ae.end_date, condition_type_concept_id=condition_type_concept_id, @@ -175,3 +216,11 @@ def _build_adverse_event_rows( ) for concept in matches ] + + # accumulate AE.sequence_id to first emitted condition_occurrence_id + # multi-concept AE links to the first row deterministically + # AEs without sequence_id are warned above and are emitted without linkage + if sequence_id is not None: + self._ae_to_condition_id[sequence_id] = ae_rows[0].condition_occurrence_id + + return ae_rows diff --git a/src/omop_etl/omop/builders/drug_exposure.py b/src/omop_etl/omop/builders/drug_exposure.py index 3786deb..0339daf 100644 --- a/src/omop_etl/omop/builders/drug_exposure.py +++ b/src/omop_etl/omop/builders/drug_exposure.py @@ -3,7 +3,7 @@ from omop_etl.harmonization.models.patient import Patient from 
omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.omop.builders.base import OmopBuilder, BuildContext from omop_etl.omop.models.rows import DrugExposureRow @@ -93,20 +93,14 @@ def _build_treatment_cycle_rows( quantity = cycle.iv_dose_prescribed dose_unit = cycle.iv_dose_prescribed_unit iv_route = self.concepts.lookup_structural("iv", domains={"Route"}) - route_concept_id = int(iv_route.concept_id) if iv_route else None + route_concept_id = iv_route.concept_id if iv_route else None elif cycle.cycle_type and cycle.cycle_type == "oral": quantity = cycle.oral_dose_prescribed_per_day dose_unit = cycle.oral_dose_unit oral_route = self.concepts.lookup_structural("oral", domains={"Route"}) - route_concept_id = int(oral_route.concept_id) if oral_route else None + route_concept_id = oral_route.concept_id if oral_route else None - base_row_id_parts = ( - patient.patient_id, - Patient.Collections.TREATMENT_CYCLES, - str(cycle.cycle_number), - str(cycle.treatment_number), - str(cycle.component_index), - ) + base_row_id_parts = (patient.patient_id, Patient.Collections.TREATMENT_CYCLES, *cycle.natural_key()) end_date_or_start = end_date or start_date drug_source_value = cycle.source_treatment_name or cycle.ingredient_name @@ -130,9 +124,9 @@ def _build_treatment_cycle_rows( return [ DrugExposureRow( - drug_exposure_id=self.generate_row_id(*base_row_id_parts, str(concept.concept_id)), + drug_exposure_id=self.generate_row_id(*base_row_id_parts, concept.concept_id), person_id=person_id, - drug_concept_id=int(concept.concept_id), + drug_concept_id=concept.concept_id, drug_exposure_start_date=start_date, drug_exposure_end_date=end_date_or_start, 
drug_type_concept_id=drug_type_concept_id, @@ -149,7 +143,7 @@ def _build_previous_treatment_main_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, drug_type_concept_id: int, ) -> list[DrugExposureRow]: @@ -160,7 +154,7 @@ def _build_previous_treatment_main_rows( end_date = prev.end_date or start_date matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), index, domains={OmopDomain.DRUG}, ) @@ -172,12 +166,12 @@ def _build_previous_treatment_main_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - str(prev.treatment_sequence_number), - PreviousTreatments.Fields.TREATMENT, - str(concept.concept_id), + *prev.natural_key(), + PreviousTreatment.Fields.TREATMENT, + concept.concept_id, ), person_id=person_id, - drug_concept_id=int(concept.concept_id), + drug_concept_id=concept.concept_id, drug_exposure_start_date=start_date, drug_exposure_end_date=end_date, drug_type_concept_id=drug_type_concept_id, @@ -190,7 +184,7 @@ def _build_previous_treatment_additional_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, drug_type_concept_id: int, ) -> list[DrugExposureRow]: @@ -201,7 +195,7 @@ def _build_previous_treatment_additional_rows( end_date = prev.end_date or start_date matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), index, domains={OmopDomain.DRUG}, ) @@ -213,9 +207,9 @@ def _build_previous_treatment_additional_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - 
str(prev.treatment_sequence_number), - PreviousTreatments.Fields.ADDITIONAL_TREATMENT, - str(concept.concept_id), + *prev.natural_key(), + PreviousTreatment.Fields.ADDITIONAL_TREATMENT, + concept.concept_id, ), person_id=person_id, drug_concept_id=int(concept.concept_id), @@ -260,7 +254,7 @@ def _build_concomitant_medication_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.CONCOMITANT_MEDICATIONS, - str(concom.sequence_id), + *concom.natural_key(), ), person_id=person_id, drug_concept_id=0, @@ -276,11 +270,11 @@ def _build_concomitant_medication_rows( drug_exposure_id=self.generate_row_id( patient.patient_id, Patient.Collections.CONCOMITANT_MEDICATIONS, - str(concom.sequence_id), - str(concept.concept_id), + *concom.natural_key(), + concept.concept_id, ), person_id=person_id, - drug_concept_id=int(concept.concept_id), + drug_concept_id=concept.concept_id, drug_exposure_start_date=start_date, drug_exposure_end_date=end_date_or_start, drug_type_concept_id=drug_type_concept_id, diff --git a/src/omop_etl/omop/builders/measurement.py b/src/omop_etl/omop/builders/measurement.py index 7559796..5ac842d 100644 --- a/src/omop_etl/omop/builders/measurement.py +++ b/src/omop_etl/omop/builders/measurement.py @@ -35,10 +35,35 @@ class MeasurementBuilder(OmopBuilder[MeasurementRow]): - value_as_number: is the numeric result where the source provides one. - visit_occurrence_id: is linked by date via ctx.visit_id_by_date (populated by the visit_occurrence builder, which must run before this). + - measurement_event_id: linked to cancer condition from ctx.condition_id_primary_cancer + which is populated by the ConditionOccurrence builder (must run before this builder). + - meas_event_field_concept_id: links to condition occurrence field concept for the + measurement_event_id FK.
""" table_name: ClassVar[str] = "measurement" + def _primary_cancer_fk(self, ctx: BuildContext) -> tuple[int | None, int | None]: + """ + Resolve (measurement_event_id, meas_event_field_concept_id) for + linking a measurement back to the patient's primary cancer + condition_occurrence row, per oncology CDM guideline. + Returns (None, None) when no primary cancer condition has been + published, raises if the cdm_field static entry is missing + (required once a primary cancer is published). + """ + event_id = ctx.condition_id_primary_cancer + if event_id is None: + return None, None + field_concept = self.concepts.lookup_static( + "cdm_field", + "condition_occurrence.condition_occurrence_id", + domains={"Metadata"}, + ) + if field_concept is None: + raise RuntimeError("Missing cdm_field mapping for condition_occurrence.condition_occurrence_id") + return event_id, field_concept.concept_id + def build(self, ctx: BuildContext) -> list[MeasurementRow]: patient = ctx.patient person_id = ctx.person_id @@ -187,13 +212,13 @@ def _build_ecog_rows( row_id = self.generate_row_id( patient.patient_id, Patient.Singletons.ECOG_BASELINE, - date.strftime(format="%Y%m%d"), + *ecog_baseline.natural_key(), ) return [ MeasurementRow( measurement_id=row_id, person_id=person_id, - measurement_concept_id=int(ecog_test.concept_id), + measurement_concept_id=ecog_test.concept_id, measurement_date=date, measurement_type_concept_id=ecrf_concept, measurement_datetime=dt.datetime(date.year, date.month, date.day), @@ -249,21 +274,25 @@ def _build_biomarker_rows( datetime_value = dt.datetime(date.year, date.month, date.day) visit_occurrence_id = ctx.visit_id_by_date.get(date) + event_id, field_concept_id = self._primary_cancer_fk(ctx) return [ MeasurementRow( measurement_id=self.generate_row_id( patient.patient_id, Patient.Singletons.BIOMARKERS, field_name, - str(concept.concept_id), + *biomarkers.natural_key(), + concept.concept_id, ), person_id=person_id, - 
measurement_concept_id=int(concept.concept_id), + measurement_concept_id=concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, visit_occurrence_id=visit_occurrence_id, measurement_source_value=source_value[:50], + measurement_event_id=event_id, + meas_event_field_concept_id=field_concept_id, ) for concept in matches ] @@ -300,22 +329,30 @@ def _build_tumor_assessment_baseline_rows( log.warning("No lesion_size structural concept for %s", patient.patient_id) return [] + unit = self.concepts.lookup_structural("millimeter", domains={"Unit"}) + unit_concept_id = unit.concept_id if unit else None + event_id, field_concept_id = self._primary_cancer_fk(ctx) + row_id = self.generate_row_id( patient.patient_id, Patient.Singletons.TUMOR_ASSESSMENT_BASELINE, TumorAssessmentBaseline.Fields.TARGET_LESION_SIZE, + *baseline.natural_key(), ) return [ MeasurementRow( measurement_id=row_id, person_id=person_id, - measurement_concept_id=int(lesion.concept_id), + measurement_concept_id=lesion.concept_id, measurement_date=date, measurement_datetime=dt.datetime(date.year, date.month, date.day), measurement_type_concept_id=ecrf_concept, value_as_number=float(size), + unit_concept_id=unit_concept_id, visit_occurrence_id=ctx.visit_id_by_date.get(date), measurement_source_value=str(size)[0:50], + measurement_event_id=event_id, + meas_event_field_concept_id=field_concept_id, ) ] @@ -339,6 +376,10 @@ def _build_tumor_assessment_rows( measurement_concept_id stores both scale and answer (same pattern as EQ5D), value_as_concept_id stays NULL. + If response scale is Not Evaluable, use separate branch with structural lookup + for measurement concept id and value as concept id is then the NE Meas Value response concept, + so any Meas Value concept for this lookup key means assessment was Not Evaluable. + If date is missing the instance is skipped entirely and no rows are emitted. 
""" date = tumor_assessments.date @@ -355,44 +396,51 @@ def _build_tumor_assessment_rows( if size is not None: lesion = self.concepts.lookup_structural("lesion_size", domains={OmopDomain.MEASUREMENTS}) if lesion is not None: + unit = self.concepts.lookup_structural("millimeter", domains={"Unit"}) + unit_concept_id = unit.concept_id if unit else None + event_id, field_concept_id = self._primary_cancer_fk(ctx) rows.append( MeasurementRow( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - date.strftime(format="%Y%m%d"), TumorAssessment.Fields.TARGET_LESION_SIZE, + *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(lesion.concept_id), + measurement_concept_id=lesion.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, - value_as_number=float(size), + value_as_number=size, + unit_concept_id=unit_concept_id, visit_occurrence_id=visit_occurrence_id, measurement_source_value=str(size)[:50], + measurement_event_id=event_id, + meas_event_field_concept_id=field_concept_id, ) ) # tumor assessment response rows recist = tumor_assessments.recist_response if recist is not None: - concept = self.concepts.lookup_static("response_recist", recist, domains={OmopDomain.MEASUREMENTS}) - if concept is None: + recist_response_concept = self.concepts.lookup_static("response_recist", recist, domains={OmopDomain.MEASUREMENTS}) + recist_not_evaluable_concept = self.concepts.lookup_static("response_recist", recist, domains={OmopDomain.MEAS_VALUE}) + + if recist_response_concept is None and recist_not_evaluable_concept is None: log.warning("No response_recist mapping for %r (patient %s)", recist, patient.patient_id) - else: + + if recist_response_concept: rows.append( MeasurementRow( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - 
date.strftime(format="%Y%m%d"), TumorAssessment.Fields.RECIST_RESPONSE, + *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(concept.concept_id), + measurement_concept_id=recist_response_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -401,23 +449,49 @@ def _build_tumor_assessment_rows( ) ) + if recist_not_evaluable_concept: + recist_concept = self.concepts.lookup_structural("response_recist") + if recist_concept is None: + log.warning("No structural concept found for response_recist") + else: + rows.append( + MeasurementRow( + measurement_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.TUMOR_ASSESSMENTS, + TumorAssessment.Fields.RECIST_RESPONSE, + *tumor_assessments.natural_key(), + ), + person_id=person_id, + measurement_concept_id=recist_concept.concept_id, + value_as_concept_id=recist_not_evaluable_concept.concept_id, + measurement_date=date, + measurement_datetime=datetime_value, + measurement_type_concept_id=ecrf_concept, + visit_occurrence_id=visit_occurrence_id, + measurement_source_value=recist[:50], + ) + ) + irecist = tumor_assessments.irecist_response if irecist is not None: - concept = self.concepts.lookup_static("response_irecist", irecist, domains={OmopDomain.MEASUREMENTS}) - if concept is None: + irecist_response_concept = self.concepts.lookup_static("response_irecist", irecist, domains={OmopDomain.MEASUREMENTS}) + irecist_not_evaluable_concept = self.concepts.lookup_static("response_irecist", irecist, domains={OmopDomain.MEAS_VALUE}) + + if irecist_response_concept is None and irecist_not_evaluable_concept is None: log.warning("No response_irecist mapping for %r (patient %s)", irecist, patient.patient_id) - else: + + if irecist_response_concept: rows.append( MeasurementRow( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - 
date.strftime(format="%Y%m%d"), TumorAssessment.Fields.IRECIST_RESPONSE, + *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(concept.concept_id), + measurement_concept_id=irecist_response_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -426,23 +500,49 @@ def _build_tumor_assessment_rows( ) ) + if irecist_not_evaluable_concept: + irecist_concept = self.concepts.lookup_structural("response_irecist") + if irecist_concept is None: + log.warning("No structural concept found for response_irecist") + else: + rows.append( + MeasurementRow( + measurement_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.TUMOR_ASSESSMENTS, + TumorAssessment.Fields.IRECIST_RESPONSE, + *tumor_assessments.natural_key(), + ), + person_id=person_id, + measurement_concept_id=irecist_concept.concept_id, + value_as_concept_id=irecist_not_evaluable_concept.concept_id, + measurement_date=date, + measurement_datetime=datetime_value, + measurement_type_concept_id=ecrf_concept, + visit_occurrence_id=visit_occurrence_id, + measurement_source_value=irecist[:50], + ) + ) + rano = tumor_assessments.rano_response if rano is not None: - concept = self.concepts.lookup_static("response_rano", rano, domains={OmopDomain.MEASUREMENTS}) - if concept is None: + rano_response_concept = self.concepts.lookup_static("response_rano", rano, domains={OmopDomain.MEASUREMENTS}) + rano_not_evaluable_concept = self.concepts.lookup_static("response_rano", rano, domains={OmopDomain.MEAS_VALUE}) + + if rano_response_concept is None and rano_not_evaluable_concept is None: log.warning("No response_rano mapping for %r (patient %s)", rano, patient.patient_id) - else: + + if rano_response_concept: rows.append( MeasurementRow( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - str(tumor_assessments.event_id), - date.strftime(format="%Y%m%d"), 
TumorAssessment.Fields.RANO_RESPONSE, + *tumor_assessments.natural_key(), ), person_id=person_id, - measurement_concept_id=int(concept.concept_id), + measurement_concept_id=rano_response_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -451,6 +551,30 @@ def _build_tumor_assessment_rows( ) ) + if rano_not_evaluable_concept: + rano_concept = self.concepts.lookup_structural("response_rano") + if rano_concept is None: + log.warning("No structural concept found for response_rano") + else: + rows.append( + MeasurementRow( + measurement_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.TUMOR_ASSESSMENTS, + TumorAssessment.Fields.RANO_RESPONSE, + *tumor_assessments.natural_key(), + ), + person_id=person_id, + measurement_concept_id=rano_concept.concept_id, + value_as_concept_id=rano_not_evaluable_concept.concept_id, + measurement_date=date, + measurement_datetime=datetime_value, + measurement_type_concept_id=ecrf_concept, + visit_occurrence_id=visit_occurrence_id, + measurement_source_value=rano[:50], + ) + ) + return rows def _build_c30_rows( @@ -508,12 +632,12 @@ def _build_c30_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.C30_COLLECTION, - str(c30.event_name), - date.strftime(format="%Y%m%d"), + test_concept.concept_id, + *c30.natural_key(), f"q{n}", ), person_id=person_id, - measurement_concept_id=int(test_concept.concept_id), + measurement_concept_id=test_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -573,12 +697,11 @@ def _build_eq5d_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.EQ5D_COLLECTION, - str(eq5d.event_name), - date.strftime(format="%Y%m%d"), + *eq5d.natural_key(), f"q{n}", ), person_id=person_id, - measurement_concept_id=int(answer_concept.concept_id), + measurement_concept_id=answer_concept.concept_id, 
measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -598,12 +721,12 @@ def _build_eq5d_rows( measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.EQ5D_COLLECTION, - str(eq5d.event_name), - date.strftime(format="%Y%m%d"), + *eq5d.natural_key(), + vas_concept.concept_id, EQ5D.Fields.QOL_METRIC, ), person_id=person_id, - measurement_concept_id=int(vas_concept.concept_id), + measurement_concept_id=vas_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -680,15 +803,16 @@ def _build_medical_history_rows( return [ MeasurementRow( + # same concept id produces multiple rows, so need concept_id and q_id in UID measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.MEDICAL_HISTORIES, - str(mh.sequence_id), - str(m_concept.concept_id), - str(q_id), + *mh.natural_key(), + q_id, + m_concept.concept_id, ), person_id=person_id, - measurement_concept_id=int(m_concept.concept_id), + measurement_concept_id=m_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, @@ -768,16 +892,16 @@ def _build_adverse_event_rows( return [ MeasurementRow( + # same concept id produces multiple rows, so need concept_id and q_id in UID measurement_id=self.generate_row_id( patient.patient_id, Patient.Collections.ADVERSE_EVENTS, - term, - date.strftime(format="%Y%m%d"), - str(m_concept.concept_id), - str(q_id), + *ae.natural_key(), + q_id, + m_concept.concept_id, ), person_id=person_id, - measurement_concept_id=int(m_concept.concept_id), + measurement_concept_id=m_concept.concept_id, measurement_date=date, measurement_datetime=datetime_value, measurement_type_concept_id=ecrf_concept, diff --git a/src/omop_etl/omop/builders/observation.py b/src/omop_etl/omop/builders/observation.py new file mode 100644 index 0000000..240e7ac --- /dev/null +++ 
b/src/omop_etl/omop/builders/observation.py @@ -0,0 +1,466 @@ +import datetime as dt +from logging import getLogger +from typing import ClassVar + +from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent +from omop_etl.harmonization.models.domain.followup import FollowUp +from omop_etl.harmonization.models.patient import Patient +from omop_etl.omop.builders.base import BuildContext, OmopBuilder +from omop_etl.omop.models.rows import ObservationRow +from omop_etl.semantic_mapping.core.models import OmopDomain + +log = getLogger(__name__) + + +class ObservationBuilder(OmopBuilder[ObservationRow]): + """ + Builds observation rows from patient scalars, the lost-to-followup singleton, + and adverse-event-derived facts (outcome, was_serious, turned_serious_date). + All observation_concept_id domains must not be in Condition, Procedure, Drug, + Specimen, Measurement, or Device. + + + There are three patterns used: + + 1. For evaluable_for_efficacy_analysis, has_clinical_benfit_at_week_* + and end_of_treatment_reason there is no observation_concept_id, + it's set to 0. The source field name is tracked in observation_source_value, + and value_as_concept_id, value_as_string and value_source_value has + the raw and normalized source values. + + 2. For lost_to_followup the observation_concept_id is mapped, + observation_source_value has the field name, and value_as_concept_id has + the result (answer). + + 3. For AE-derived fields, AE outcome, AE was_serious and AE turned_serious_date, + the same occurs as the first two patterns, but they are linked back to the + source AE record from ConditionOccurrenceBuilder, + using FKs stored in observation_event_id and obs_event_field_concept_id, + produced by BuildContext.condition_id_by_ae_sequence_id. + + A row is only skipped when the source value or a required date is missing. 
+ When a concept lookup misses, the row is still emitted with concept_id=0, + and the raw literal is stored in value_source_value or observation_source_value. + """ + + table_name: ClassVar[str] = "observation" + + def build(self, ctx: BuildContext) -> list[ObservationRow]: + """ + Emit observation rows for the patient. Order: scalar attributes + (evaluable, clinical_benefit, eot_reason), the lost_to_followup + singleton, then per-AE rows (outcome, was_serious, turned_serious_date). + observation_type_concept_id is the ecrf Type Concept, raises if the + structural entry is missing. + """ + patient = ctx.patient + person_id = ctx.person_id + + ecrf = self.concepts.lookup_structural("ecrf", domains={"Type Concept"}) + if ecrf is None: + raise RuntimeError("Missing ecrf concept in structural mapping") + + observation_type_concept_id = ecrf.concept_id + + rows: list[ObservationRow] = [] + rows.extend(self._build_evaluable(patient, person_id, observation_type_concept_id)) + rows.extend(self._build_clinical_benefit(patient, person_id, observation_type_concept_id)) + rows.extend(self._build_eot_reason(patient, person_id, observation_type_concept_id)) + rows.extend(self._build_lost_to_followup(patient, person_id, observation_type_concept_id)) + + for idx, ae in enumerate(patient.adverse_events): + rows.extend(self._build_ae_outcome(patient, person_id, observation_type_concept_id, ae, idx, ctx)) + rows.extend(self._build_ae_was_serious(patient, person_id, observation_type_concept_id, ae, idx, ctx)) + rows.extend(self._build_ae_turned_serious(patient, person_id, observation_type_concept_id, ae, idx, ctx)) + + return rows + + def _yes_no_concept_id(self, value: bool) -> int: + """ + Resolve True to Yes and False to No via the structural Meas Value + concepts. Returns 0 when the mapping is missing. 
+ """ + concept = self.concepts.lookup_structural("yes" if value else "no", domains={OmopDomain.MEAS_VALUE}) + return concept.concept_id if concept else 0 + + def _bool_observation( + self, + *, + observation_id: int, + person_id: int, + field_name: str, + value: bool, + date: dt.date, + observation_type_concept_id: int, + observation_concept_id: int = 0, + observation_event_id: int | None = None, + obs_event_field_concept_id: int | None = None, + ) -> ObservationRow: + """ + Compose a boolean observation row. Standardizes the source-value + encoding for all boolean fields (evaluable, clinical_benefit, + lost_to_followup, AE was_serious) so the columns can't drift + between callsites. + """ + return ObservationRow( + observation_id=observation_id, + person_id=person_id, + observation_concept_id=observation_concept_id, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=self._yes_no_concept_id(value), + observation_source_value=field_name, + observation_source_concept_id=0, + value_source_value=str(value).lower(), + observation_event_id=observation_event_id, + obs_event_field_concept_id=obs_event_field_concept_id, + ) + + def _build_evaluable( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + """ + Unmapped source attribute: observation_concept_id = 0, + observation_source_value = field name, value_as_concept_id = Yes/No. + Dated to treatment_start_date (no clearer event date exists; the + evaluability decision is informed by treatment activity since start). 
+ """ + value = patient.evaluable_for_efficacy_analysis + date = patient.treatment_start_date + if value is None: + return [] + if date is None: + log.warning( + "Skipping evaluable_for_efficacy_analysis for %s: missing treatment_start_date", + patient.patient_id, + ) + return [] + + return [ + self._bool_observation( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Scalars.EVALUABLE_FOR_EFFICACY_ANALYSIS, + ), + person_id=person_id, + field_name=Patient.Scalars.EVALUABLE_FOR_EFFICACY_ANALYSIS, + value=value, + date=date, + observation_type_concept_id=observation_type_concept_id, + ) + ] + + def _build_clinical_benefit( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + """ + Clinical benefit at a source-specific timepoint. Read from the + ClinicalBenefit singleton, date is authoritative (no fallback). + observation_source_value encodes the week + (e.g. "has_clinical_benefit_at_week_16") so downstream queries + can filter by timepoint. 
+ """ + cb = patient.clinical_benefit + if cb is None: + return [] + has_benefit = cb.has_benefit + date = cb.date + week = cb.week + if has_benefit is None: + return [] + if date is None: + log.warning( + "Skipping clinical_benefit for %s: ClinicalBenefit singleton has no date", + patient.patient_id, + ) + return [] + if week is None: + log.warning( + "Skipping clinical_benefit for %s: ClinicalBenefit singleton has no week", + patient.patient_id, + ) + return [] + + field_name = f"has_clinical_benefit_at_week_{week}" + return [ + self._bool_observation( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Singletons.CLINICAL_BENEFIT, + *cb.natural_key(), + ), + person_id=person_id, + field_name=field_name, + value=has_benefit, + date=date, + observation_type_concept_id=observation_type_concept_id, + ) + ] + + def _build_eot_reason( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + """ + Unmapped source attribute: observation_concept_id = 0, + observation_source_value = field name, value_as_concept_id = mapped + reason (or 0 if unmapped), value_as_string and value_source_value + preserve the raw reason text. 
+ """ + reason = patient.end_of_treatment_reason + date = patient.end_of_treatment_date + if reason is None: + return [] + if date is None: + log.warning("Skipping end_of_treatment_reason for %s: missing end_of_treatment_date", patient.patient_id) + return [] + + concept = self.concepts.lookup_static("eot_reason", reason) + + return [ + ObservationRow( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Scalars.END_OF_TREATMENT_REASON, + ), + person_id=person_id, + observation_concept_id=0, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=concept.concept_id if concept else 0, + value_as_string=reason[:60], + observation_source_value=Patient.Scalars.END_OF_TREATMENT_REASON, + observation_source_concept_id=0, + value_source_value=reason[:50], + ) + ] + + def _build_lost_to_followup( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ) -> list[ObservationRow]: + """ + observation_concept_id is the "Lost to follow-up" concept from the + lost_to_followup static value set, falls back to 0 when the mapping is missing. + value_as_concept_id is the Yes/No concept, observation_source_value is the field name and + value_source_value carries the boolean literal. Date is date_lost_to_followup. 
+ """ + followup = patient.lost_to_followup + if followup is None: + return [] + + value = followup.lost_to_followup + date = followup.date_lost_to_followup + if value is None: + return [] + if date is None: + log.warning("Skipping lost_to_followup for %s: missing date_lost_to_followup", patient.patient_id) + return [] + + concept = self.concepts.lookup_static(FollowUp.Fields.LOST_TO_FOLLOWUP, str(value)) + observation_concept_id = concept.concept_id if concept else 0 + + return [ + self._bool_observation( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Singletons.LOST_TO_FOLLOWUP, + *followup.natural_key(), + ), + person_id=person_id, + field_name=Patient.Singletons.LOST_TO_FOLLOWUP, + value=value, + date=date, + observation_type_concept_id=observation_type_concept_id, + observation_concept_id=observation_concept_id, + ) + ] + + def _build_ae_outcome( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ae: AdverseEvent, + index: int, + ctx: BuildContext, + ) -> list[ObservationRow]: + """ + Topic concept is the structural lookup for adverse_event_outcome, + answer concept is the static lookup for adverse_event_outcome values. + Both lookups fall back to 0 when missing and the row is still emitted as + long as outcome and start_date are present, with the raw value + preserved in value_source_value, linked to Condition AE record. 
+ """ + raw_outcome = ae.outcome + date = ae.start_date + if raw_outcome is None: + return [] + if date is None: + log.warning("Skipping AE %d outcome for %s: missing start_date", index, patient.patient_id) + return [] + + topic_concept = self.concepts.lookup_structural("adverse_event_outcome") + outcome_concept = self.concepts.lookup_static("adverse_event_outcome", raw_outcome) + + event_id, field_concept_id = self._ae_fk(ae, patient, index, ctx) + + return [ + ObservationRow( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.ADVERSE_EVENTS, + *ae.natural_key(), + AdverseEvent.Fields.OUTCOME, + ), + person_id=person_id, + observation_concept_id=topic_concept.concept_id if topic_concept else 0, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=outcome_concept.concept_id if outcome_concept else 0, + observation_source_value=AdverseEvent.Fields.OUTCOME, + observation_source_concept_id=0, + value_source_value=str(raw_outcome)[:50], + observation_event_id=event_id, + obs_event_field_concept_id=field_concept_id, + ) + ] + + def _build_ae_was_serious( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ae: AdverseEvent, + index: int, + ctx: BuildContext, + ) -> list[ObservationRow]: + """ + Unmapped source attribute and AE FK: observation_concept_id = 0, + observation_source_value = "was_serious", value_as_concept_id = Yes/No concept, + observation_event_id and obs_event_field_concept_id point at the + AE's condition_occurrence row. Emits for both True and False so the + explicit assessment is preserved. Dated is AE.start_date. 
+ """ + was_serious = ae.was_serious + if was_serious is None: + return [] + date = ae.start_date + if date is None: + log.warning("Skipping AE %d was_serious for %s: missing start_date", index, patient.patient_id) + return [] + + event_id, field_concept_id = self._ae_fk(ae, patient, index, ctx) + + return [ + self._bool_observation( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.ADVERSE_EVENTS, + *ae.natural_key(), + AdverseEvent.Fields.WAS_SERIOUS, + ), + person_id=person_id, + field_name=AdverseEvent.Fields.WAS_SERIOUS, + value=was_serious, + date=date, + observation_type_concept_id=observation_type_concept_id, + observation_event_id=event_id, + obs_event_field_concept_id=field_concept_id, + ) + ] + + def _build_ae_turned_serious( + self, + patient: Patient, + person_id: int, + observation_type_concept_id: int, + ae: AdverseEvent, + index: int, + ctx: BuildContext, + ) -> list[ObservationRow]: + """ + AE turned-serious flag. Encoded as a Yes observation on + turned_serious_date, value_source_value carries the ISO date so + consumers can reconstruct the event without re-querying. + Not using _bool_observation because value_source_value differs + (date string, not "true"). 
+ """ + date = ae.turned_serious_date + if date is None: + return [] + + event_id, field_concept_id = self._ae_fk(ae, patient, index, ctx) + + return [ + ObservationRow( + observation_id=self.generate_row_id( + patient.patient_id, + Patient.Collections.ADVERSE_EVENTS, + *ae.natural_key(), + AdverseEvent.Fields.TURNED_SERIOUS_DATE, + ), + person_id=person_id, + observation_concept_id=0, + observation_date=date, + observation_type_concept_id=observation_type_concept_id, + value_as_concept_id=self._yes_no_concept_id(True), + observation_source_value=AdverseEvent.Fields.TURNED_SERIOUS_DATE, + observation_source_concept_id=0, + value_source_value=date.isoformat(), + observation_event_id=event_id, + obs_event_field_concept_id=field_concept_id, + ) + ] + + def _ae_fk( + self, + ae: AdverseEvent, + patient: Patient, + index: int, + ctx: BuildContext, + ) -> tuple[int | None, int | None]: + """ + Resolve (observation_event_id, obs_event_field_concept_id) for an + AE-derived observation row. Returns (None, None) when the AE has no + sequence_id or no published condition_occurrence row. Raises if the + `cdm_field` static entry for condition_occurrence.condition_occurrence_id + is missing, this is required for AE-attributed observations. 
+ """ + sequence_id = ae.sequence_id + if sequence_id is None: + log.warning( + "AE %d for %s missing sequence_id: cannot link observation to condition_occurrence", + index, + patient.patient_id, + ) + return None, None + + event_id = ctx.condition_id_by_ae_sequence_id.get(sequence_id) + if event_id is None: + log.warning( + "AE %d for %s missing event_id: cannot link observation to condition_occurrence", + index, + patient.patient_id, + ) + return None, None + + field_concept = self.concepts.lookup_static( + "cdm_field", + "condition_occurrence.condition_occurrence_id", + domains={"Metadata"}, + ) + if field_concept is None: + raise RuntimeError("Missing cdm_field mapping for condition_occurrence.condition_occurrence_id") + + return event_id, field_concept.concept_id diff --git a/src/omop_etl/omop/builders/procedure_occurrence.py b/src/omop_etl/omop/builders/procedure_occurrence.py index 396d947..cccd855 100644 --- a/src/omop_etl/omop/builders/procedure_occurrence.py +++ b/src/omop_etl/omop/builders/procedure_occurrence.py @@ -2,7 +2,7 @@ from logging import getLogger from omop_etl.harmonization.models.patient import Patient -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.medical_history import MedicalHistory from omop_etl.omop.builders.base import OmopBuilder, BuildContext from omop_etl.omop.models.rows import ProcedureOccurrenceRow @@ -46,7 +46,7 @@ def _build_previous_treatment_main_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, procedure_type_concept_id: int, ) -> list[ProcedureOccurrenceRow]: @@ -56,7 +56,7 @@ def _build_previous_treatment_main_rows( matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + 
(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), index, domains={OmopDomain.PROCEDURE}, ) @@ -68,12 +68,12 @@ def _build_previous_treatment_main_rows( procedure_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - str(prev.treatment_sequence_number), - PreviousTreatments.Fields.TREATMENT, - str(concept.concept_id), + PreviousTreatment.Fields.TREATMENT, + *prev.natural_key(), + concept.concept_id, ), person_id=person_id, - procedure_concept_id=int(concept.concept_id), + procedure_concept_id=concept.concept_id, procedure_date=start_date, procedure_end_date=prev.end_date, procedure_type_concept_id=procedure_type_concept_id, @@ -86,7 +86,7 @@ def _build_previous_treatment_additional_rows( self, patient: Patient, person_id: int, - prev: PreviousTreatments, + prev: PreviousTreatment, index: int, procedure_type_concept_id: int, ) -> list[ProcedureOccurrenceRow]: @@ -96,7 +96,7 @@ def _build_previous_treatment_additional_rows( matches = self.concepts.lookup_semantic( patient.patient_id, - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), index, domains={OmopDomain.PROCEDURE}, ) @@ -108,12 +108,12 @@ def _build_previous_treatment_additional_rows( procedure_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.PREVIOUS_TREATMENTS, - str(prev.treatment_sequence_number), - PreviousTreatments.Fields.ADDITIONAL_TREATMENT, - str(concept.concept_id), + *prev.natural_key(), + PreviousTreatment.Fields.ADDITIONAL_TREATMENT, + concept.concept_id, ), person_id=person_id, - procedure_concept_id=int(concept.concept_id), + procedure_concept_id=concept.concept_id, procedure_date=start_date, procedure_end_date=prev.end_date, procedure_type_concept_id=procedure_type_concept_id, @@ -148,11 +148,11 @@ def _build_medical_history_rows( 
procedure_occurrence_id=self.generate_row_id( patient.patient_id, Patient.Collections.MEDICAL_HISTORIES, - str(mh.sequence_id), - str(concept.concept_id), + *mh.natural_key(), + concept.concept_id, ), person_id=person_id, - procedure_concept_id=int(concept.concept_id), + procedure_concept_id=concept.concept_id, procedure_date=start_date, procedure_end_date=mh.end_date, procedure_type_concept_id=procedure_type_concept_id, diff --git a/src/omop_etl/omop/builders/visit_occurrence.py b/src/omop_etl/omop/builders/visit_occurrence.py index b89628b..9c8742e 100644 --- a/src/omop_etl/omop/builders/visit_occurrence.py +++ b/src/omop_etl/omop/builders/visit_occurrence.py @@ -93,7 +93,7 @@ def _build_baseline_row( row_id = self.generate_row_id( patient.patient_id, Patient.Singletons.TUMOR_ASSESSMENT_BASELINE, - date.strftime(format="%Y%m%d"), + *baseline.natural_key(), ) return VisitOccurrenceRow( @@ -120,7 +120,7 @@ def _build_assessment_row( row_id = self.generate_row_id( patient.patient_id, Patient.Collections.TUMOR_ASSESSMENTS, - date.strftime(format="%Y%m%d"), + *assessment.natural_key(), ) return VisitOccurrenceRow( diff --git a/src/omop_etl/omop/models/rows.py b/src/omop_etl/omop/models/rows.py index 41af420..cd3bf8c 100644 --- a/src/omop_etl/omop/models/rows.py +++ b/src/omop_etl/omop/models/rows.py @@ -243,3 +243,31 @@ class MeasurementRow: def validate(self): validate_required_fields(self) + + +@pd_dataclass(frozen=True, slots=True) +class ObservationRow: + observation_id: int + person_id: int + observation_concept_id: int + observation_date: dt.date + observation_type_concept_id: int + observation_datetime: dt.datetime | None = None + value_as_number: float | None = None + value_as_string: Annotated[str | None, pd_field(max_length=60)] = None + value_as_concept_id: int | None = None + qualifier_concept_id: int | None = None + unit_concept_id: int | None = None + provider_id: int | None = None + visit_occurrence_id: int | None = None + visit_detail_id: int | None = 
None + observation_source_value: Annotated[str | None, pd_field(max_length=50)] = None + observation_source_concept_id: int | None = None + unit_source_value: Annotated[str | None, pd_field(max_length=50)] = None + qualifier_source_value: Annotated[str | None, pd_field(max_length=50)] = None + value_source_value: Annotated[str | None, pd_field(max_length=50)] = None + observation_event_id: int | None = None + obs_event_field_concept_id: int | None = None + + def validate(self): + validate_required_fields(self) diff --git a/src/omop_etl/omop/service.py b/src/omop_etl/omop/service.py index e6b570b..bcb6c09 100644 --- a/src/omop_etl/omop/service.py +++ b/src/omop_etl/omop/service.py @@ -5,6 +5,7 @@ from omop_etl.omop.builders.base import OmopBuilder, BuildContext from omop_etl.omop.builders.condition_occurrence import ConditionOccurrenceBuilder from omop_etl.omop.builders.measurement import MeasurementBuilder +from omop_etl.omop.builders.observation import ObservationBuilder from omop_etl.omop.builders.person import PersonBuilder from omop_etl.omop.builders.observation_period import ObservationPeriodBuilder from omop_etl.omop.builders.cdm_source import CdmSourceBuilder @@ -37,6 +38,7 @@ def __init__(self, concepts: ConceptLookupService): ConditionOccurrenceBuilder(concepts), ProcedureOccurrenceBuilder(concepts), MeasurementBuilder(concepts), + ObservationBuilder(concepts), ] def build(self, patients: Sequence[Patient]) -> OmopTables: diff --git a/src/omop_etl/resources/cohort_normalization/harmonized_target_biomarkers.csv b/src/omop_etl/resources/cohort_normalization/harmonized_target_biomarkers.csv new file mode 100644 index 0000000..6f80d2b --- /dev/null +++ b/src/omop_etl/resources/cohort_normalization/harmonized_target_biomarkers.csv @@ -0,0 +1,172 @@ +source_biomarker_name,frequency,harmonized_biomarker_name,comment +4q12 amplicon,1,4q12, +9p24.1 amplicon,1,9p24, +A775_G776INSYVMA,1,ERBB2, +ALK fusion,3,ALK fusion, +ALK FUSION IGFBP5,1,ALK fusion, +ALK 
fusions,10,ALK fusion, +ALK mut,1,ALK mut, +ALK translocation,1,ALK translocation,"Might be a fusion, the trial must confirm" +ATM-ATR,14,ATM-ATR, +BRAF,10,BRAFV600, +BRAF fusion,5,BRAF fusion, +BRAF fusions,1,BRAF fusion, +BRAF non-V600,2,BRAFnonV600, +BRAF Non-V600mut,7,BRAFnonV600, +BRAF nonV600 (BRAF D594G),1,BRAFnonV600, +BRAF nonV600E,1,BRAFnonV600, +BRAF nonV600E (15exon),1,BRAFnonV600, +BRAF nonV600E (BRAF G469A),2,BRAFnonV600, +BRAF V600,14,BRAFV600, +BRAF V600 (3) + BRAF non-V600 (4),1,BRAFV600, +BRAF V600 activating mutations,3,BRAFV600, +BRAF v600E,1,BRAFV600, +BRAF V600E,10,BRAFV600, +BRAF V600E activating mutations,20,BRAFV600, +BRAF V600Emut,14,BRAFV600, +BRAF V600G,1,BRAFV600, +BRAF VAL600 GLU,1,BRAFV600, +BRCA,11,BRCA12, +BRCA1 2 biallelic inactivation,1,BRCA12, +BRCA2 biallelic loss,2,BRCA12, +C.2313_2324DUP P.(TYR772_ALA775DUP),1,ERBB2, +Constitutional mismatch repair deficiency disposition (CMMRD),1,CMMRD, +CMMrd,,CMMRD, +Double hit in PIK3CA/PTEN,1,PIK3CA, +Drug screen,1,Drug screen,"The trial must elaborate: might be organoid-based drug screening and not target-drug-match. Think we should keep them separated from other cohorts." +EBB2 amplification,12,ERBB2 ampl, +EGFR amp,2,EGFR ampl, +EGFR mut,1,EGFR mut, +ERBB2 - L755P,1,ERBB2 mut, +ERBB2 (HER2) Amplification,22,ERBB2 ampl, +ERBB2 (HER2) G776V mutation,1,ERBB2 mut, +ERBB2 (HER2) R678Q mutation,1,ERBB2 mut, +ERBB2 (HER2) S310F mutation,2,ERBB2 mut, +ERBB2 (HER2) S310F MUTATION AND HER2 AMPLIFICATION,1,ERBB2 mut/ampl, +ERBB2 (HER2) S310Y mutation,1,ERBB2 mut, +ERBB2 A775_G776INSYVMA,1,ERBB2 INSYVMA, +ERBB2 C. 
2313_2324DUP P.(TYR 772_ALA775DUP) VARIANT DETECTED,1,ERBB2, +ERBB2 C.2313_2324DUP ; P.(TYR772_ALA775DUP) 16%,1,ERBB2, +ERBB2 EXON 20 ACTIVATING VARIANT C.2262_2269DELINSCCCGA P.(LEU755_GLU757DELINSPROLYS),1,ERBB2, +ERBB2 EXON 20 MUTATION C.2326_2327INSTTGTGATGGCTG P.ALA775_GLY776INSVALVALMETALA).,1,ERBB2, +ERBB2 G776>VC,1,ERBB2, +ERBB2 GLY766DELINSVALCYS,1,ERBB2, +ERBB2 mutation,4,ERBB2 mut, +ERBB2 MUTATION A775_G776INSYVMA,1,ERBB2 mut, +ERBB2 MUTATION G778S AND AMPLIFICATION,1,ERBB2 mut/ampl, +ERBB2 P.(GLY776DELINSVALCYS),1,ERBB2, +ERBB2 P.(ILE767MET),1,ERBB2, +ERBB2 P.Y772 A775DUP (EXON 20),1,ERBB2, +ERBB2 P.Y772_A775DUP,1,ERBB2, +ERBB2 TYR772_ALA775DUP,1,ERBB2, +"ERBB2, P.Y772_A775DUP",1,ERBB2, +ERBB2. C2313_2324DUP P.(TYR772_ALA775DUP),1,ERBB2, +ERBB2ampl,10,ERBB2 ampl, +ERBB2mut,3,ERBB2 mut, +ERBB3 mutation,1,ERBB3 mut, +EXON 20 ERB B2 INSERTION (C.2313_2324DUP P.(TYR772_ALA775DUP).PDL1 10%,1,ERBB2, +FGFampl,6,FGF ampl, +FGFR1 ampl,3,FGFR ampl, +FGFR1 amplification,1,FGFR ampl, +FGFR1 double hit,1,FGFR mut/fusion, +FGFR1 fusion,1,FGFR mut/fusion, +FGFR1 fusions,1,FGFR mut/fusion, +FGFR1 mut,1,FGFR mut/fusion, +FGFR2 fusion,2,FGFR mut/fusion, +FGFR2 mut,6,FGFR mut/fusion, +FGFR3 fusion,4,FGFR mut/fusion, +FGFR3 mut,2,FGFR mut/fusion, +FGFR3mut,1,FGFR mut/fusion, +FGFRampl,6,FGFR ampl, +FGFRfusion,8,FGFR mut/fusion, +FGFRmut,9,FGFR mut/fusion, +FRFR2 fusion (ATE1fusion),1,FGFR mut/fusion, +GNA11,1,GNA11, +GNA11 mut,1,GNA11 mut, +GNAQ mut,1,GNAQ mut, +GNAS mut,2,GNAS mut, +HER2 amp,7,ERBB2 ampl, +HER2 ampl,11,ERBB2 ampl, +HER2 ampl/mut,2,ERBB2 mut/ampl, +HER2 AMPLIFICATION (COPY NUMBER 84) AND POINT MUTATION S310Y,1,ERBB2 mut/ampl, +HER2 EXON 20 INSERTION MUTATION,1,ERBB2 mut, +HER2 mut,11,ERBB2 mut, +HER2 postive,1,ERBB2 overexprression, +HER2amp,6,ERBB2 ampl, +HER2exp,2,ERBB2 overexprression, +HER2mut,5,ERBB2 mut, +High tumour mutational burden (TMB),20,TMB high, +HRAS G12D,1,HRAS mut, +HRAS mut,1,HRAS mut, +HRD,11,HRD, +HRR alterations,4,HRR, +HRR 
defiency,1,HRR, +Hypocellular AML,1,AML, +KRAS (G12V),1,KRAS mut, +KRAS G12D,1,KRAS mut, +KRAS G12S,1,KRAS mut, +KRAS G12V,1,KRAS mut, +KRAS mut,1,KRAS mut, +KRAS mut (G12V and G12D),1,KRAS mut, +"KRAS, NRAS, BRAF",1,RAS-RAF-pathway,"RAS/RAF-pathway, may keep them separated from the KRAS-cohorts" +LTK high,1,LTK high, +MAP2,1,MAP2, +MAP2K1 (MEK1),1,MAP2K1, +"MAP2K1 (MEK1), MAP2K2 (MEK2) or NRAS",1,"MAP2K1, MAP2K2, NRAS", +MAP2K4 loss_mut,1,MAP2K4 mut, +MAP2K4 mut,1,MAP2K4 mut, +MAP3K1 loss_mut,1,MAP3K1 mut, +MAP3K1 mut,1,MAP3K1 mut, +MET amp,3,MET ampl, +MET amplification,7,MET ampl, +MET deletions EXON 14,1,MET exon 14 skip, +MET Exon 14,1,MET exon 14 skip, +MET fusion,5,MET fusion, +Microsatellite instability high (MSI),2,MSI high, +MSI-high,8,MSI high, +MSI-high incl. res,4,MSI high, +MSI-high incl.res,1,MSI high, +MSIhigh,8,MSI high, +NF1,1,NF1, +NF1 amplification,1,NF1 ampl, +NF1 loss,1,NF1, +NF1 mutation,7,NF1 mut, +NF1loss_mut,1,NF1 mut, +NF1mut,1,NF1 mut, +NPM1 mut AML,1,NPM1 mut, +NRAS amp,2,NRAS ampl, +NRAS mut,8,NRAS mut, +NRAS mutation,3,NRAS mut, +NRAS mutation (Q61R),1,NRAS mut, +Other (AN IN-FRAME INSERTION WITHIN ERBB2 EXON 20),1,ERBB2, +Other (ERBB2 (HER2)),1,ERBB2, +Other (ERBB2 EXON 20),1,ERBB2 mut, +Other (ERBB2C.2313_2324DUP),1,ERBB2 mut, +PBRM1,1,PBRM1, +PD-L1,2,PD-L1, +PDGFRA,3,PDGFRA, +PIK3CA DoubleHit incl res,1,PIK3CA, +PIK3CA mut,10,PIK3CA mut, +PIK3CA mut/ampl,1,PIK3CA mut/ampl, +PIK3CAmut,2,PIK3CA mut, +PIK3R1 mut,1,PIK3R1 mut, +PIK3R2 mut,1,PIK3R2 mut, +POLE mut,1,POLE mut, +PTEN loss_mut,3,PTEN loss/mut, +PTENloss,5,PTEN loss, +PTENloss/mut,1,PTEN loss/mut, +ROS1 fusion,1,ROS1 fusion, +SHH-pathway,6,SHH-pathway, +STRN-ALK FUSION,1,ALK fusion, +TMB,5,TMB high, +TMB >= 16,1,TMB high, +TMB >=16,17,TMB high, +TMB >=16 eval MRI,1,TMB high, +TMB >=16 incl. 
res,1,TMB high, +TMB >=16 incl.res,1,TMB high, +TMB >=16 TMZ ind,3,TMB high, +TMB >=16 TMZ ind incl res,1,TMB high, +TMB>=16,2,TMB high, +TMBhigh,16,TMB high, +TML >=140,2,TML, +V600 MUTATION,1,BRAFV600, \ No newline at end of file diff --git a/src/omop_etl/resources/cohort_normalization/harmonized_tumor_types.csv b/src/omop_etl/resources/cohort_normalization/harmonized_tumor_types.csv new file mode 100644 index 0000000..4e13596 --- /dev/null +++ b/src/omop_etl/resources/cohort_normalization/harmonized_tumor_types.csv @@ -0,0 +1,215 @@ +source_tumor_type_name,frequency,harmonized_tumor_type_name,tumor_subtype,general_tumor_type,comment +"79F, METASTATIC CARCINOMA OF ANUS, HER2 AMPLIFIED",1,Anal cancer,,Anal cancer +acinic cell carsinoma,1,Acinic cell carcinoma,Acinic cell carcinoma,Salivary gland cancer +Adenoic cystic carcinoma,1,Adenoid cystic carcinoma,,Adenoid cystic carcinoma +Adrenocortical carcinoma,1,Adrenocortical carcinoma,,Adrenocortical carcinoma +ALK-POSITIVE LARGE B-CELL LYMPHOMA,1,Large B-cell lymphoma,Large B-cell lymphoma,Haematological cancer +ameloblastoma,1,Ameloblastoma,,Ameloblastoma +Ameloblastoma,1,Ameloblastoma,,Ameloblastoma +AML,1,AML,AML,Haematological cancer +anal,1,Anal cancer,,Anal cancer +anal canal squamous cell ca,1,Anal cancer,,Anal cancer +Anal cancer,1,Anal cancer,,Anal cancer +Anal carcinoma,2,Anal cancer,,Anal cancer +ANAPLASTIC ALK-POSITIVE LARGE CELL LYMPHOMA,1,Anaplastic large cell lymphoma,Anaplastic large cell lymphoma,Haematological cancer +Anaplastic large cell lymphoma,2,Anaplastic large cell lymphoma,Anaplastic large cell lymphoma,Haematological cancer +anaplastic thyroid cancer,1,Anaplastic thyroid cancer,Anaplastic thyroid cancer,Thyroid cancer +Apocrine carcinoma,1,Apocrine carcinoma,,Apocrine carcinoma +Astrocytoma,1,Astrocytoma,Astrocytoma,CNS +bile duct & gall bladder,1,Cholangiocarcinoma,,Cholangiocarcinoma +bile duct & gallbladder,5,Cholangiocarcinoma,,Cholangiocarcinoma +bile duct and 
gallbladder,1,Cholangiocarcinoma,,Cholangiocarcinoma +Biliary tract carcinoma,2,Cholangiocarcinoma,,Cholangiocarcinoma +Biliary tract carcinoma/galbladder carcinoma,1,Cholangiocarcinoma,,Cholangiocarcinoma +bladder & urinary tract,6,Urothelial cancer,,Urothelial cancer +Bladder / urinary tract cancer,2,Urothelial cancer,,Urothelial cancer +Bladder cancer,3,Urothelial cancer,,Urothelial cancer +Bladder/Urinary Tract Cancer,2,Urothelial cancer,,Urothelial cancer +brain ependymoma,1,Ependymoma,Ependymoma,CNS +breast,6,Breast cancer,,Breast cancer +Breast cancer,9,Breast cancer,,Breast cancer +Breast Cancer,1,Breast cancer,,Breast cancer +"breast, non-TNBC",1,Breast cancer,,Breast cancer +"breast, TNBC",1,Breast cancer,,Breast cancer +Central Nervous System/Brain,4,CNS,,CNS +cervical cancer,2,Cervical cancer,Cervical cancer,Gynaecological cancer +Cervical cancer,5,Cervical cancer,Cervical cancer,Gynaecological cancer +Cervical Cancer,2,Cervical cancer,Cervical cancer,Gynaecological cancer +cervix,1,Cervical cancer,Cervical cancer,Gynaecological cancer +Cholangio carcinoma,5,Cholangiocarcinoma,,Cholangiocarcinoma +Cholangiocarcicoma,1,Cholangiocarcinoma,,Cholangiocarcinoma +Cholangiocarcinoma,16,Cholangiocarcinoma,,Cholangiocarcinoma +chordoma,1,Chordoma,,Chordoma +CLEAR CELL ADENOCARCINOMA OF THE LEFT URETER,1,Urothelial cancer,,Urothelial cancer +Clear cell odontogenic carcinoma,1,Clear cell odontogenic carcinoma,,Clear cell odontogenic carcinoma +CLL,1,CLL,CLL,Haematological cancer +CNS tumor,1,CNS,,CNS +Colorectal Cancer,5,Colorectal cancer,Colorectal cancer,Intestinal cancer +conjuctival melanoma (eye),1,Conjunctival melanoma,Conjunctival melanoma,Ocular melanoma +CRC,21,Colorectal cancer,Colorectal cancer,Intestinal cancer +CRPC,2,Prostate cancer,,Prostate cancer +CUP,11,CUP,,CUP +Duodenal carcinoma,1,Duodenal carcinoma,Duodenal carcinoma,Intestinal cancer +Eccrine carcinoma,1,Eccrine carcinoma,,Eccrine carcinoma +endometrial,2,Endometrial cancer,Endometrial 
cancer,Gynaecological cancer +Endometrial caccer,1,Endometrial cancer,Endometrial cancer,Gynaecological cancer +Endometrial cancer,6,Endometrial cancer,Endometrial cancer,Gynaecological cancer +Endometrial/Uterine Cancer,2,Endometrial cancer,Endometrial cancer,Gynaecological cancer +endometroid ovary cancer,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Erdheim Chester,1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +Erdheim Chester / Histiocytic,1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +"ERDHEIM CHESTER DISEASE - HISTIOCYTOSIS",1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +"ERDHEIM CHESTER DISEASE (ORBITAL, CARDIAC AND BONE INVOLVEMENT)",1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +"ERDHEIM CHESTER DISEASE, BRAF V600E MUTATION",1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +esophageal,1,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Esophageal,1,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Esophageal cancer,2,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Esophageal_Gastric cancer,1,Esophageal or gastric cancer,Esophageal or gastric cancer,Upper GI-cancer +esophagus,3,Esophageal cancer,Esophageal cancer,Upper GI-cancer +esophagus ca,1,Esophageal cancer,Esophageal cancer,Upper GI-cancer +Fallopian tube cancer,1,Fallopian tube cancer,Fallopian tube cancer,Gynaecological cancer +Fallopian Tube Cancer,1,Fallopian tube cancer,Fallopian tube cancer,Gynaecological cancer +FALLOPIAN TUBE CANCER,1,Fallopian tube cancer,Fallopian tube cancer,Gynaecological cancer +gallbladder ca,1,Cholangiocarcinoma,,Cholangiocarcinoma +gastric,1,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +Gastric cancer,1,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +Gastroesophageal Cancer,3,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +GBM,6,Glioblastoma,Glioblastoma,CNS 
+GEJ,5,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +GEJ carcinoma,1,Gastroesophageal junction cancer,Gastroesophageal junction cancer,Upper GI-cancer +Germ cell cancer,1,Germ cell tumor,,Germ cell tumor +germ cell tumor,2,Germ cell tumor,,Germ cell tumor +gist,1,GIST,,GIST +GIST,2,GIST,,GIST +glioblastoma,1,Glioblastoma,Glioblastoma,CNS +Glioblastoma Multiforme,1,Glioblastoma,Glioblastoma,CNS +glioma (high grade),2,High-grade glioma,High-grade glioma,CNS +Glioneuronal_Neuronal Tumor,1,Glioneuronal tumor,Glioneuronal tumor,CNS +Goblet cell carcinoma,1,Goblet cell carcinoma,Goblet cell carcinoma,Intestinal cancer +Grade 3 glioma,1,Grade 3 glioma,Grade 3 glioma,CNS +grade II glioma,1,Grade 2 glioma,Grade 2 glioma,CNS +grade III glioma,3,Grade 3 glioma,Grade 3 glioma,CNS +Gynaecological tumors,2,Gynaecological cancer,,Gynaecological cancer +Hairy cell leukaemia,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +HAIRY CELL LEUKAEMIA,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +Hairy cell leukemia,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +HAIRY CELL LEUKEMIA,1,Hairy cell leukemia,Hairy cell leukemia,Haematological cancer +HCC,1,Hepatocellular carcinoma,,Hepatocellular carcinoma +head & neck,4,Head and neck cancer,,Head and neck cancer +Head and Neck Cancer,5,Head and neck cancer,,Head and neck cancer +Hepatobiliary Cancer,6,Cholangiocarcinoma,,Cholangiocarcinoma +Hidradenocarcinoma,1,Hidradenocarcinoma,,Hidradenocarcinoma +High grade glioma,11,High-grade glioma,High-grade glioma,CNS +HIGH GRADE GLIOMA,1,High-grade glioma,High-grade glioma,CNS +high grade serose ovary ca,4,Ovarian cancer,Ovarian cancer,Gynaecological cancer +HNnonSCC,1,Head and neck cancer,,Head and neck cancer +HNSCC,3,Head and neck cancer,,Head and neck cancer +intestinal,5,Intestinal cancer,,Intestinal cancer +intestinal adecoca,1,Intestinal cancer,,Intestinal cancer +Kidney Cancer,1,Kidney cancer,,Kidney 
cancer +Langerhans Cell Histiocytosis,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +LANGERHANS CELL HISTIOCYTOSIS,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +LANGERHANS CELL HISTO HISTIOCYTOSIS,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +Langerhans cells histocytosis,1,Langerhans cell histiocytosis,Langerhans cell histiocytosis,Histiocytosis +LGSOC,2,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Low grade glioma,1,Low-grade glioma,Low-grade glioma,CNS +low grade ovarian serose ca,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +lung ca,4,Lung cancer,,Lung cancer +Lymphoma,1,Lymphoma,Lymphoma,Haematological cancer +Malignant Pleural Mesothelioma,1,Mesothelioma,,Mesothelioma +MALIGNANT TUMOUR OF UNKNOWN ORIGIN,1,CUP,,CUP +MDS,2,MDS,MDS,Haematological cancer +Melanocytic tumor,1,Melanocytic tumor,,Melanocytic tumor +melanoma,3,Melanoma,Melanoma,Skin cancer +Melanoma,3,Melanoma,Melanoma,Skin cancer +mesothelioma,1,Mesothelioma,,Mesothelioma +Metastatic adenocarcinoma of left parotid,1,Parotid gland carcinoma,Parotid gland carcinoma,Salivary gland cancer +METASTATIC ADENOCARCINOMA OF LIKELY PANCREATIC ORIGIN,1,Pancreatic cancer,,Pancreatic cancer +METASTATIC AMPULLARY ADENOCARCINOMA,1,Ampullary cancer,Ampullary cancer,Intestinal cancer +METASTATIC ANAL CANCER,1,Anal cancer,,Anal cancer +METASTATIC BASALOID PSEUDO-CRIBIFORM EPITHELOID CARCINOMA,1,Epitheloid carcinoma,,Epitheloid carcinoma +METASTATIC LUNG ADENOCARCINOMA,1,Lung cancer,,Lung cancer +METASTATIC PERITONEAL MESOTHELIOMA,1,Mesothelioma,,Mesothelioma +metastatic ventricular ca,1,Ventricular cancer,Ventricular cancer,CNS +MM,1,Multiple Myeloma,Multiple Myeloma,Haematological cancer +MPM,1,Mesothelioma,,Mesothelioma +Multifocal glioma,1,Glioma,Glioma,CNS +NEC,11,Neuro-endocrine carcinoma,,Neuro-endocrine carcinoma +NEC (gastrointestinal),1,Neuro-endocrine carcinoma,,Neuro-endocrine carcinoma 
+NET,7,Neuro-endocrine tumor,,Neuro-endocrine tumor +Neuroendocrine and Adrenal Tumour,2,Neuro-endocrine tumor,,Neuro-endocrine tumor +NON-HODGKIN'S LYMPHOMA,1,Non-Hodgkin lymphoma,Non-Hodgkin lymphoma,Haematological cancer +NON-LANGERHANS HISTIOCYTIC DISORDER (ERDHEIM-CHESTER DISEASE),1,Erdheim-Chester Disease,Erdheim-Chester Disease,Histiocytosis +Non-Small Cell Lung Cancer,27,NSCLC,NSCLC,Lung cancer +NSCLC,24,NSCLC,NSCLC,Lung cancer +NUD,1,CUP,,CUP +Occult Primary or Cancer of Unknown Primary,2,CUP,,CUP +ORAL CAVITY SQUAMOUS CARCINOMA,1,Head and neck cancer,,Head and neck cancer +ovarian,7,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovarian ca,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovarian cancer,12,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovarian Cancer,2,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovarian mucinous carsinoma,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovario ca,1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovario high grade clear cell carcinoma (HGCC),1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +Ovary high grade serose cancer (HGSC),1,Ovarian cancer,Ovarian cancer,Gynaecological cancer +ovary low grade serous cancer,3,Ovarian cancer,Ovarian cancer,Gynaecological cancer +pancreas,5,Pancreatic cancer,,Pancreatic cancer +pancreatic cancer,2,Pancreatic cancer,,Pancreatic cancer +Pancreatic cancer,8,Pancreatic cancer,,Pancreatic cancer +Pancreatic Cancer,3,Pancreatic cancer,,Pancreatic cancer +Papillary craniofaryngeoma,1,Papillary craniofaryngeoma,Papillary craniofaryngeoma,CNS +paraganglioma,1,Paraganglioma,Paraganglioma,CNS +penile,2,Penile cancer,,Penile cancer +Penile cancer,1,Penile cancer,,Penile cancer +peritoneal mesothelioma,1,Mesothelioma,,Mesothelioma +PERITONEAL MESOTHELIOMA,1,Mesothelioma,,Mesothelioma +pleural mesothelioma,1,Mesothelioma,,Mesothelioma +Primary brain tumors,2,CNS,,CNS +prostate,4,Prostate cancer,,Prostate cancer +Prostate cancer,10,Prostate 
cancer,,Prostate cancer +Prostate Cancer,4,Prostate cancer,,Prostate cancer +Prostate carcinoma,1,Prostate cancer,,Prostate cancer +PTO,1,CUP,,CUP +RCC,2,Renal cell carcinoma,,Renal cell carcinoma +RELAPSED METASTATIC SALIVARY DUCT CARCINOMA (SDC),1,Salivary duct carcinoma,Salivary duct carcinoma,Salivary gland cancer +Renal Cell Carcinoma,1,Renal cell carcinoma,,Renal cell carcinoma +salivary duct ca,1,Salivary duct carcinoma,Salivary duct carcinoma,Salivary gland cancer +Salivary duct carcinoma,2,Salivary duct carcinoma,Salivary duct carcinoma,Salivary gland cancer +salivary gland,2,Salivary gland cancer,,Salivary gland cancer +Salivary gland adenocarcinoma,1,Salivary gland cancer,,Salivary gland cancer +Salivary gland cancer,7,Salivary gland cancer,,Salivary gland cancer +Salivary gland carcinoma,2,Salivary gland cancer,,Salivary gland cancer +sarcoma,1,Sarcoma,,Sarcoma +Sarcoma,4,Sarcoma,,Sarcoma +SCLC,3,SCLC,SCLC,Lung cancer +sigma adenocarcinoma (2xquick PD),1,Left-sided adenocarcinoma,,Left-sided adenocarcinoma +Skin cancer,2,Skin cancer,,Skin cancer +Small Bowel Carcinoma,3,Small intestine cancer,Small intestine cancer,Intestinal cancer +Small intestine cancer,4,Small intestine cancer,Small intestine cancer,Intestinal cancer +soft tissue sarcoma,1,Soft tissue sarcoma,Soft tissue sarcoma,Sarcoma +stomach adenocarsinoma,1,Gastric cancer,Gastric cancer,Upper GI-cancer +submandibular cland adenocystic carsinoma,1,Adenoid cystic carcinoma,,Adenoid cystic carcinoma +synovial sarcoca,1,Synovial sarcoma,Synovial sarcoma,Sarcoma +thymic,1,Thymoma,,Thymoma +thyroid,1,Thyroid cancer,,Thyroid cancer +thyroid cancer,2,Thyroid cancer,,Thyroid cancer +Thyroid cancer,3,Thyroid cancer,,Thyroid cancer +Thyroid Cancer,4,Thyroid cancer,,Thyroid cancer +Thyroid carcinoma,1,Thyroid cancer,,Thyroid cancer +tongue ca,1,Tongue cancer,,Tongue cancer +Tumor-agnostic,14,Tumor-agnostic,,Tumor-agnostic +Tumor agnostic,12,Tumor-agnostic,,Tumor-agnostic +UCC,1,Urothelial 
cancer,,Urothelial cancer +unknown primary adenoid cystic carsinoma,1,Adenoid cystic carcinoma,,Adenoid cystic carcinoma +Upper GI-tumors,1,Upper GI-cancer,,Upper GI-cancer +Upper rectal adenocarcinoma,1,Colorectal cancer,Colorectal cancer,Intestinal cancer +Urothelial cancer,3,Urothelial cancer,,Urothelial cancer +Uterine cancer,1,Uterine cancer,Uterine cancer,Gynaecological cancer +uterus endometrial adenocarcinoma,1,Endometrial cancer,Endometrial cancer,Gynaecological cancer +Uveal melanoma,2,Uveal melanoma,Uveal melanoma,Ocular melanoma +vaginal adeno carsinoma,2,Vaginal cancer,Vaginal cancer,Gynaecological cancer +Vaginal cancer,1,Vaginal cancer,Vaginal cancer,Gynaecological cancer +Vaginal Cancer,1,Vaginal cancer,Vaginal cancer,Gynaecological cancer +Vulvar Cancer,1,Vulvar cancer,Vulvar cancer,Gynaecological cancer +Vulvarian cancer,1,Vulvar cancer,Vulvar cancer,Gynaecological cancer diff --git a/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv b/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv index 922ff2b..cc45b7e 100644 --- a/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv +++ b/src/omop_etl/resources/semantic_mapped/braf_non-v600_mapped.csv @@ -170,10 +170,10 @@ c0d469b8-2d83-5ef8-a060-3cb5b0f2eef2,CM_CMTRT,Ibuprofen,1,1177480,5640,ibuprofen 98cea1ff-2517-56c3-af61-bb37270630e4,CM_CMTRT,Kaliumklorid,1,19049105,8591,potassium chloride,Ingredient,Standard,Valid,Drug,RxNorm Extension 4000804f-b2bb-52b8-859e-8747b20d8124,CM_CMTRT,Fiasp (Insulin),1,1567198,51428,"insulin aspart, human",Ingredient,Standard,Valid,Drug,RxNorm fc4de2f6-c822-542e-8392-53cf03842961,CM_CMTRT,Cimetidin,1,997276,2541,cimetidine,Ingredient,Standard,Valid,Drug,RxNorm -9261aa9a-b652-5d50-b109-b5c92b058ae2,COH_COHCTN,BRAF Non-V600 activating mutations,12,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC 
+9261aa9a-b652-5d50-b109-b5c92b058ae2,COH_COHCTN,BRAF Non-V600 activating mutations,12,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC a683d456-51bb-52f1-8dcb-b88cfe79add6,COH_COHCTN,BRAF Non-V600activating mutations,1,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC 8d9cba99-2d8f-5254-8226-cc57d315adc7,COH_COHTMN,BRAF Non-V600 activating mutations,13,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC -4bf9b6d6-5f7e-5c8e-b424-16b4885158df,COH_GENMUT1,BRAF activating mutations,11,13,3039156,53844-7,"BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test",Standard,Valid,Measurement,LOINC +4bf9b6d6-5f7e-5c8e-b424-16b4885158df,COH_GENMUT1,BRAF activating mutations,13,3039156,53844-7,BRAF gene targeted mutation analysis in Blood or Tissue by Molecular genetics method,Lab Test,Standard,Valid,Measurement,LOINC 2e020752-c463-5744-9859-8dd3e864d528,COH_GENMUT1,Other,2,4205432,55446002,Genetic mutation,Clinical Finding,Standard,Valid,Condition,SNOMED 16634bc9-9c93-5cd8-b149-34d0894a2e4b,COH_COHTT,Pancreatic cancer,3,4180793,363418001,Malignant tumor of pancreas,Disorder,Standard,Valid,Condition,SNOMED 88b3fa87-2b35-5bee-adf1-39ed0e10c828,COH_COHTT,Cholangiocarcinoma,3,4208660,312104005,Cholangiocarcinoma of biliary tract,Disorder,Standard,Valid,Condition,SNOMED diff --git a/src/omop_etl/resources/static_mapped/static_mapping.csv b/src/omop_etl/resources/static_mapped/static_mapping.csv index 25c37ec..8930eb5 100644 --- a/src/omop_etl/resources/static_mapped/static_mapping.csv +++ b/src/omop_etl/resources/static_mapped/static_mapping.csv @@ -1,5 +1,6 @@
value_set,local_value,omop_concept_id,omop_concept_code,omop_concept_name,omop_concept_class,omop_standard_concept,omop_validity,omop_domain,omop_vocab lost_to_followup,True,4163894,399307001,Lost to follow-up,Clinical Finding,Standard,Valid,Observation,SNOMED +cdm_field,condition_occurrence.condition_occurrence_id,1147127,CDM183,condition_occurrence.condition_occurrence_id,Field,Standard,Valid,Metadata,CDM sex,f,8532,F,FEMALE,Gender,Standard,Valid,Gender,Gender sex,m,8507,M,MALE,Gender,Standard,Valid,Gender,Gender sex,female,8532,F,FEMALE,Gender,Standard,Valid,Gender,Gender @@ -21,15 +22,24 @@ ecog_code,5,45880868,LA9627-6,Dead,Answer,Standard,Valid,Meas Value,LOINC tumor_assessment_baseline,iRecist 1.1,734318,iRECIST,iRECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier tumor_assessment_baseline,RECIST 1.1,734317,RECIST,RECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier tumor_assessment_baseline,RANO (for Glioblastoma),734345,RANO,RANO finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier +response_recist,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_recist,Not evaluated,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_recist,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_recist,Stable Disease (SD),1634680,1.1_RECIST-SD,RECIST 1.1: stable disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Progressive Disease (PD),1633597,1.1_RECIST-PD,RECIST 1.1: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Complete Response (CR),1634772,1.1_RECIST-CR,RECIST 1.1: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_recist,Partial Response (PR),1633368,1.1_RECIST-PR,RECIST 1.1: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer 
Modifier -response_irecist,iComplete Response (CR),1633954,iRECIST-CR,iRECIST: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier -response_irecist,iPartial Response (PR),1635284,iRECIST-PR,iRECIST: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier +response_irecist,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_irecist,Not evaluated,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_irecist,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_irecist,iComplete Response,1633954,iRECIST-CR,iRECIST: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier +response_irecist,iPartial Response,1635284,iRECIST-PR,iRECIST: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iStable disease,1635887,iRECIST-SD,iRECIST: stable disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iConfirmed progressive disease,1633423,iRECIST-PD,iRECIST: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_irecist,iUnconfirmed progressive disease,1633423,iRECIST-PD,iRECIST: progressive disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier +response_rano,Not evaluable,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_rano,Not evaluated,45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC +response_rano,Not evaluable (NE),45878793,LA4479-7,Status Not Evaluable,Answer,Standard,Valid,Meas Value,LOINC response_rano,Complete Response (CR),1634853,RANO-CR,RANO: complete response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_rano,Partial Response (PR),1634574,RANO-PR,RANO: partial response,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier response_rano,Stable Disease 
(SD),1633447,RANO-SD,RANO: stable disease,Staging/Grading,Standard,Valid,Measurement,Cancer Modifier @@ -122,16 +132,16 @@ eq5d_q5_answer_code,3,742368,OMOP5181578,EuroQol five dimension five level anxie eq5d_q5_answer_code,4,742369,OMOP5181579,EuroQol five dimension five level anxiety depression score: 4 (I am severely anxious or depressed),Precoordinated pair,Standard,Valid,Measurement,OMOP Extension eq5d_q5_answer_code,5,742370,OMOP5181580,EuroQol five dimension five level anxiety depression score: 5 (I am extremely anxious or depressed),Precoordinated pair,Standard,Valid,Measurement,OMOP Extension eot_reason,Disease progression,1617595,97509-4,Cancer disease progression,Clinical Observation,Standard,Valid,Observation,LOINC -eot_reason,Normal completion according to cohort-specific manual,45884335,LA4511-7,Treatment Completed,Answer,Standard,Valid,Meas Value,LOINC -eot_reason,Other,35821954,100418-5,Other reason,Answer,Standard,Valid,Observation,UK Biobank -eot_reason,Adverse event/Toxicity,441207,62014003,Adverse reaction to drug,Disorder,Standard,Valid,Observation,SNOMED +eot_reason,Normal completion according to cohort-specific manual,44788181,15501000000100,Completed successfully,Qualifier Value,Standard,Valid,Observation,SNOMED +eot_reason,Other,9177,74964007,Other,Qualifier Value,Standard,Valid,Meas Value,SNOMED +eot_reason,Adverse event/Toxicity,45884383,LA7266-5,Adverse event,Answer,Standard,Valid,Meas Value,LOINC eot_reason,Symptomatic deterioration,4111347,285384003,General health deterioration,Clinical Finding,Standard,Valid,Observation,SNOMED -eot_reason,Patient refusal,45773084,703427001,Refusal of treatment by patient against dental advice,Context-dependent,Standard,Valid,Observation,SNOMED +eot_reason,Patient refusal,45878680,LA4389-8,Refused,Answer,Standard,Valid,Meas Value,LOINC eot_reason,Withdrawn by PI,44810920,871401000000109,Withdrawn from research study,Clinical Finding,Standard,Valid,Observation,SNOMED 
eot_reason_code,2,1617595,97509-4,Cancer disease progression,Clinical Observation,Standard,Valid,Observation,LOINC -eot_reason_code,7,45884335,LA4511-7,Treatment Completed,Answer,Standard,Valid,Meas Value,LOINC -eot_reason_code,88,35821954,100418-5,Other reason,Answer,Standard,Valid,Observation,UK Biobank -eot_reason_code,1,441207,62014003,Adverse reaction to drug,Disorder,Standard,Valid,Observation,SNOMED +eot_reason_code,7,44788181,15501000000100,Completed successfully,Qualifier Value,Standard,Valid,Observation,SNOMED +eot_reason_code,88,9177,74964007,Other,Qualifier Value,Standard,Valid,Meas Value,SNOMED +eot_reason_code,1,45884383,LA7266-5,Adverse event,Answer,Standard,Valid,Meas Value,LOINC eot_reason_code,6,4111347,285384003,General health deterioration,Clinical Finding,Standard,Valid,Observation,SNOMED -eot_reason_code,3,45773084,703427001,Refusal of treatment by patient against dental advice,Context-dependent,Standard,Valid,Observation,SNOMED +eot_reason_code,3,45878680,LA4389-8,Refused,Answer,Standard,Valid,Meas Value,LOINC eot_reason_code,5,44810920,871401000000109,Withdrawn from research study,Clinical Finding,Standard,Valid,Observation,SNOMED \ No newline at end of file diff --git a/src/omop_etl/resources/static_mapped/structural_mapping.csv b/src/omop_etl/resources/static_mapped/structural_mapping.csv index db3bb2b..1378cdd 100644 --- a/src/omop_etl/resources/static_mapped/structural_mapping.csv +++ b/src/omop_etl/resources/static_mapped/structural_mapping.csv @@ -1,7 +1,14 @@ value_set,omop_concept_id,omop_concept_code,omop_concept_name,omop_concept_class,omop_standard_concept,omop_validity,omop_domain,omop_vocab +no,4188540,373067005,No,Qualifier Value,Standard,Valid,Meas Value,SNOMED +yes,4188539,373066001,Yes,Qualifier Value,Standard,Valid,Meas Value,SNOMED +adverse_event_outcome,4231813,405533003,Adverse incident outcome,Clinical Finding,Standard,Valid,Observation,SNOMED ecog,36305384,89247-1,ECOG Performance Status score,Clinical 
Observation,Standard,Valid,Measurement,LOINC -lesion_size,4084390,246116008,Lesion size,Observable Entity,Standard,Valid,Measurement,SNOMED +lesion_size,36768664,OMOP4998340,Dimension of Tumor,Dimension,Standard,Valid,Measurement,Cancer Modifier +millimeter,8588,mm,millimeter,Unit,Standard,Valid,Unit,UCUM number_of_lesions,4085855,246206008,Number of lesions,Observable Entity,Standard,Valid,Observation,SNOMED +response_recist,734317,RECIST,RECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier +response_irecist,734318,iRECIST,iRECIST finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier +response_rano,734345,RANO,RANO finding,Staging/Grading,Classification,Valid,Measurement,Cancer Modifier c30_q1,701340,OMOP5117524,"EORTC Quality of Life Questionnaire: Do you have any trouble doing strenuous activities, like carrying a heavy shopping bag or a suitcase?",Staging / Scales,Standard,Valid,Measurement,OMOP Extension c30_q2,701341,OMOP5117525,EORTC Quality of Life Questionnaire: Do you have any trouble taking a long walk?,Staging / Scales,Standard,Valid,Measurement,OMOP Extension c30_q3,701342,OMOP5117526,EORTC Quality of Life Questionnaire: Do you have any trouble taking a short walk outside of the house?,Staging / Scales,Standard,Valid,Measurement,OMOP Extension diff --git a/src/omop_etl/semantic_mapping/core/semantic_config.py b/src/omop_etl/semantic_mapping/core/semantic_config.py index d66ebcc..b35880a 100644 --- a/src/omop_etl/semantic_mapping/core/semantic_config.py +++ b/src/omop_etl/semantic_mapping/core/semantic_config.py @@ -4,7 +4,7 @@ from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from 
omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_type import TumorType @@ -33,13 +33,13 @@ # previous treatments FieldConfig( name="previous_treatments.treatment", - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), target=QueryTarget(domains={OmopDomain.PROCEDURE, OmopDomain.DRUG}), tags={"previous_treatments", "term"}, ), FieldConfig( name="previous_treatments.additional_treatment", - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), target=QueryTarget(domains={OmopDomain.PROCEDURE, OmopDomain.DRUG}), tags={"previous_treatments", "additional_term"}, ), diff --git a/tests/concept_mapping/test_service.py b/tests/concept_mapping/test_service.py index 94ad9e4..5a8d272 100644 --- a/tests/concept_mapping/test_service.py +++ b/tests/concept_mapping/test_service.py @@ -116,14 +116,25 @@ def test_domain_filter_is_case_insensitive(self, static_index): result = service.lookup_static("sex", "M", domains={domain_filter}) assert result is not None, f"filter {domain_filter!r} should match" - def test_domain_filter_miss_is_case_insensitive(self, static_index): - """Wrong-domain filter misses regardless of case.""" + def test_domain_filter_rejects_regardless_of_case(self, static_index): + """Wrong-domain filter returns None regardless of case.""" service = ConceptLookupService(static_index=static_index) - result = service.lookup_static("sex", "M", domains={"Procedure"}) + for domain_filter in ("procedure", "PROCEDURE", "Procedure"): + result = 
service.lookup_static("sex", "M", domains={domain_filter}) + assert result is None, f"filter {domain_filter!r} should reject" + + def test_filter_reject_is_not_recorded_as_miss(self, static_index): + """ + An entry exists in the index but the requested filter rejects it. + This is a caller-side flow event, a builder is asking the wrong domain question, + and not a data-quality gap, so it must not be recorded in the missed-lookup log. + """ + service = ConceptLookupService(static_index=static_index) - assert result is None - assert len(service.result.missed["static"]) == 1 + service.lookup_static("sex", "M", domains={"Procedure"}) + + assert len(service.result.missed["static"]) == 0 def test_vocab_filter_matches(self, static_index): service = ConceptLookupService(static_index=static_index) diff --git a/tests/harmonization/conftest.py b/tests/harmonization/conftest.py index b9a85ec..d8308a0 100644 --- a/tests/harmonization/conftest.py +++ b/tests/harmonization/conftest.py @@ -212,6 +212,7 @@ class StudyDrugsRow: COH_COHALLO2__2CD: str | None = None COH_COHALLO2__3: str | None = None COH_COHALLO2__3CD: str | None = None + COH_EventDate: str | None = None @pytest.fixture @@ -223,6 +224,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1__2CD="31", COH_COHALLO2__2="Tafinlar", COH_COHALLO2__2CD="10", + COH_EventDate="2021-06-01", ), StudyDrugsRow( "sd1_match_sd2_match", @@ -230,6 +232,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1CD="99", COH_COHALLO2="some drug 2", COH_COHALLO2CD="1", + COH_EventDate="2022-06-01", ), StudyDrugsRow( "sd1_mismatch1_sd2_mismatch1_2", @@ -237,6 +240,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1CD="10", COH_COHALLO2__2="mismatch_1_2", COH_COHALLO2__2CD="12", + COH_EventDate="2023-06-01", ), StudyDrugsRow( "sd1_mismatch2_sd2_mismatch2_1", @@ -244,6 +248,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO1__2CD="50", COH_COHALLO2="mismatch_2_1", COH_COHALLO2CD="60", + COH_EventDate="2024-06-01", ), 
StudyDrugsRow( "sd_collision", @@ -253,6 +258,7 @@ def study_drugs_fixture() -> pl.DataFrame: COH_COHALLO2__2CD="5", COH_COHALLO2__3="some_drug_3_2", COH_COHALLO2__3CD="999", + COH_EventDate="2025-06-01", ), ] @@ -1459,6 +1465,7 @@ class AdverseEventRow: FU_FUPDEDAT: str | None = None TR_TRNAME: str | None = None TR_TRTNO: int | None = None + AE_AESPID: int | None = None @pytest.fixture @@ -1484,6 +1491,7 @@ def adverse_events_fixture() -> pl.DataFrame: AE_AETRT2="Drug B", TR_TRNAME="Regimen X", TR_TRTNO="1", # type: ignore + AE_AESPID=1, ), AdverseEventRow( "serious_fill_end_from_death", @@ -1499,6 +1507,7 @@ def adverse_events_fixture() -> pl.DataFrame: FU_FUPDEDAT="1900-02-01", TR_TRNAME="Regimen Y", TR_TRTNO=2, + AE_AESPID=1, ), AdverseEventRow( "multi", @@ -1509,6 +1518,7 @@ def adverse_events_fixture() -> pl.DataFrame: AE_AESERCD=0, AE_AEREL1CD=2, AE_AEREL2CD=4, + AE_AESPID=1, ), AdverseEventRow( "multi", @@ -1519,6 +1529,7 @@ def adverse_events_fixture() -> pl.DataFrame: AE_SAEEXP1CD=2, AE_SAEEXP2CD=1, AE_AEREL2CD=1, + AE_AESPID=2, ), ] @@ -1695,42 +1706,50 @@ class ClinicalBenefitRow: RNRSP_RNRSPCLCD: int | None = None RNRSP_EventId: str | None = None RA_EventId: str | None = None + RA_EventDate: str | None = None + RNRSP_EventDate: str | None = None @pytest.fixture -def has_clinical_benefit_at_week_16_fixture() -> pl.DataFrame: +def clinical_benefit_fixture() -> pl.DataFrame: rows: List[ClinicalBenefitRow] = [ ClinicalBenefitRow( "recist_le3", RA_RATIMRESCD=3, RA_EventId="V03", + RA_EventDate="2023-04-01", ), ClinicalBenefitRow( "recist_gt3", RA_RATIMRESCD=4, RA_EventId="V03", + RA_EventDate="2023-04-02", ), ClinicalBenefitRow( "irecist_le3", RA_RAiMODCD=2, RA_EventId="V03", + RA_EventDate="2023-04-03", ), ClinicalBenefitRow( "rano_le3", RNRSP_RNRSPCLCD=3, RNRSP_EventId="V03", + RNRSP_EventDate="2023-04-04", ), ClinicalBenefitRow( "both_present", RA_RATIMRESCD=4, RA_RAiMODCD=3, RA_EventId="V03", + RA_EventDate="2023-04-05", ), 
ClinicalBenefitRow("v03_no_codes", RA_EventId="V03"), ClinicalBenefitRow( "not_v03", RA_RATIMRESCD=2, RA_EventId="V02", + RA_EventDate="2023-04-06", ), ] diff --git a/tests/harmonization/harmonizers/test_base.py b/tests/harmonization/harmonizers/test_base.py index 067be89..8006e1b 100644 --- a/tests/harmonization/harmonizers/test_base.py +++ b/tests/harmonization/harmonizers/test_base.py @@ -491,6 +491,80 @@ def builder(pid, row): assert [i.name for i in all_built] == ["a", "b"] +class TestNaturalKeyConflictDetection: + """Uses MedicalHistory (NATURAL_KEY_FIELDS = (start_date, sequence_id)) as the test domain.""" + + def _mh_row(self, *, start_date, sequence_id, term="hypertension", end_date=None, status=None, status_code=None): # noqa + return { + "term": term, + "sequence_id": sequence_id, + "start_date": start_date, + "end_date": end_date, + "status": status, + "status_code": status_code, + } + + def _packed(self, items): # noqa + return pl.DataFrame({"SubjectId": ["p1"], "items": [items]}) + + def test_identical_duplicates_pass_silently(self, caplog): + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [ + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1), + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1), + ] + with caplog.at_level("WARNING", logger="omop_etl.harmonization.harmonizers.base"): + BaseHarmonizer.hydrate_collection_field( + self._packed(items), + item_type=MedicalHistory, + patients=patients, + ) + assert not any("natural-key conflict" in r.message for r in caplog.records) + + def test_conflicting_data_logs_warning_by_default(self, caplog): + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [ + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="hypertension"), + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="diabetes"), + ] + with caplog.at_level("WARNING", logger="omop_etl.harmonization.harmonizers.base"): + BaseHarmonizer.hydrate_collection_field( 
+ self._packed(items), + item_type=MedicalHistory, + patients=patients, + ) + warnings = [r for r in caplog.records if "natural-key conflict" in r.message] + assert len(warnings) == 1 + assert "p1" in warnings[0].message + assert "term" in warnings[0].message + + def test_conflicting_data_raises_under_error_policy(self): + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [ + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="hypertension"), + self._mh_row(start_date=dt.date(2024, 1, 1), sequence_id=1, term="diabetes"), + ] + with pytest.raises(ValueError, match="natural-key conflict"): + BaseHarmonizer.hydrate_collection_field( + self._packed(items), + item_type=MedicalHistory, + patients=patients, + on_natural_key_conflict="error", + ) + + def test_empty_natural_key_skips_check(self, mock_simple_domain_attr, caplog): + """Domains without NATURAL_KEY_FIELDS bypass the conflict check entirely.""" + patients = {"p1": Patient(patient_id="p1", trial_id="test")} + items = [{"name": "a", "value": 1}, {"name": "a", "value": 2}] + with caplog.at_level("WARNING", logger="omop_etl.harmonization.harmonizers.base"): + BaseHarmonizer.hydrate_collection_field( + pl.DataFrame({"SubjectId": ["p1"], "items": [items]}), + item_type=SimpleDomain, + patients=patients, + ) + assert not any("natural-key conflict" in r.message for r in caplog.records) + + class TestHydrateScalar: def test_sets_scalar_attribute(self): """Scalar value should be set on patient.""" diff --git a/tests/harmonization/harmonizers/test_impress.py b/tests/harmonization/harmonizers/test_impress.py index ce94787..39fb88f 100644 --- a/tests/harmonization/harmonizers/test_impress.py +++ b/tests/harmonization/harmonizers/test_impress.py @@ -750,9 +750,9 @@ def test_returns_expected_columns(self, previous_treatment_fixture): df = h._process_previous_treatments() assert df is not None - from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments + from 
omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment - expected_cols = {"SubjectId"} | set(PreviousTreatments.data_fields()) + expected_cols = {"SubjectId"} | set(PreviousTreatment.data_fields()) assert set(df.columns) == expected_cols def test_extracts_treatment_values(self, previous_treatment_fixture): @@ -1581,35 +1581,45 @@ def test_irecist_ne_maps_to_96(self, best_overall_response_fixture): assert row.item(0, "date") == dt.date(1900, 4, 1) -class TestProcessClinicalBenefitAtWeek16: - def test_returns_expected_columns(self, has_clinical_benefit_at_week_16_fixture): - h = ImpressHarmonizer(data=has_clinical_benefit_at_week_16_fixture, trial_id="T") - df = h._process_has_clinical_benefit_at_week_16() +class TestProcessClinicalBenefit: + def test_returns_expected_columns(self, clinical_benefit_fixture): + h = ImpressHarmonizer(data=clinical_benefit_fixture, trial_id="T") + df = h._process_clinical_benefit() assert df is not None - assert "SubjectId" in df.columns - assert "has_clinical_benefit_at_week_16" in df.columns + assert df.columns == ["SubjectId", "week", "has_benefit", "date"] + + def test_week_is_16_for_impress(self, clinical_benefit_fixture): + h = ImpressHarmonizer(data=clinical_benefit_fixture, trial_id="T") + df = h._process_clinical_benefit() + assert df is not None + assert df["week"].unique().to_list() == [16] @pytest.mark.parametrize( - "sid, expected", + "sid, expected_benefit, expected_date", [ - pytest.param("recist_le3", True, id="single criterion: RECIST <=3"), - pytest.param("recist_gt3", False, id="single criterion: RECIST >3"), - pytest.param("irecist_le3", True, id="single criterion: iRECIST <=3"), - pytest.param("rano_le3", True, id="single criterion: RANO <=3"), - pytest.param("both_present", True, id="multi criterion present"), - pytest.param("v03_no_codes", False, id="V03 visit but no benefit codes"), - pytest.param("not_v03", None, id="non-V03 visit -> filtered out"), + pytest.param("recist_le3", True, 
dt.date(2023, 4, 1), id="RECIST <=3: RA_EventDate"), + pytest.param("recist_gt3", False, dt.date(2023, 4, 2), id="RECIST >3: fallback to RA_EventDate"), + pytest.param("irecist_le3", True, dt.date(2023, 4, 3), id="iRECIST <=3: RA_EventDate"), + pytest.param("rano_le3", True, dt.date(2023, 4, 4), id="RANO <=3: RNRSP_EventDate"), + pytest.param("both_present", True, dt.date(2023, 4, 5), id="multi criterion present: RA_EventDate"), + pytest.param("v03_no_codes", False, None, id="V03 visit but no benefit codes, no dates"), + pytest.param("not_v03", None, None, id="non-V03 visit: filtered out"), ], ) - def test_clinical_benefit__at_week_16_values(self, has_clinical_benefit_at_week_16_fixture, sid, expected): - h = ImpressHarmonizer(data=has_clinical_benefit_at_week_16_fixture, trial_id="T") - df = h._process_has_clinical_benefit_at_week_16() + def test_clinical_benefit_values_and_dates(self, clinical_benefit_fixture, sid, expected_benefit, expected_date): + h = ImpressHarmonizer(data=clinical_benefit_fixture, trial_id="T") + df = h._process_clinical_benefit() assert df is not None row = df.filter(pl.col("SubjectId") == sid) - actual = None if row.height == 0 else row.item(0, "has_clinical_benefit_at_week_16") - assert actual is expected + if expected_benefit is None: + assert row.height == 0 + return + actual_benefit = row.item(0, "has_benefit") + actual_date = row.item(0, "date") + assert actual_benefit is expected_benefit + assert actual_date == expected_date class TestProcessEotReason: @@ -1676,13 +1686,13 @@ class TestImpressSpecContracts: "treatment_start_last_cycle": "last_treatment_start_fixture", "treatment_start_date": "treatment_start_fixture", "evaluable_for_efficacy_analysis": "evaluability_fixture", - "has_clinical_benefit_at_week_16": "has_clinical_benefit_at_week_16_fixture", "end_of_treatment_reason": "end_of_treatment_reason_fixture", "end_of_treatment_date": "treatment_stop_fixture", # singletons "tumor_type": "tumor_type_fixture", "study_drugs": 
"study_drugs_fixture", "biomarkers": "biomarkers_fixture", + "clinical_benefit": "clinical_benefit_fixture", "lost_to_followup": "lost_to_followup_fixture", "ecog_baseline": "ecog_fixture", "baseline_tumor_assessment": "baseline_tumor_assessment_fixture", diff --git a/tests/harmonization/models/test_collection_sorting.py b/tests/harmonization/models/test_collection_sorting.py new file mode 100644 index 0000000..64f0dfc --- /dev/null +++ b/tests/harmonization/models/test_collection_sorting.py @@ -0,0 +1,101 @@ +import datetime as dt + +from omop_etl.harmonization.models.patient import Patient +from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent +from omop_etl.harmonization.models.domain.medical_history import MedicalHistory + + +PATIENT_ID = "P001" + + +def _ae(start_date: dt.date | None, sequence_id: int | None, term: str = "nausea") -> AdverseEvent: + e = AdverseEvent(PATIENT_ID) + e.term = term + e.start_date = start_date + e.sequence_id = sequence_id + return e + + +class TestCollectionSorting: + def test_sorted_by_natural_key_on_assignment(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + unsorted = [ + _ae(dt.date(2024, 3, 1), 2), + _ae(dt.date(2024, 1, 1), 1), + _ae(dt.date(2024, 2, 1), 5), + ] + p.adverse_events = unsorted + + ordered = [(e.start_date, e.sequence_id) for e in p.adverse_events] + assert ordered == [ + (dt.date(2024, 1, 1), 1), + (dt.date(2024, 2, 1), 5), + (dt.date(2024, 3, 1), 2), + ] + + def test_tiebreak_by_secondary_natural_key_field(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + same_date = dt.date(2024, 1, 1) + p.adverse_events = [ + _ae(same_date, 3), + _ae(same_date, 1), + _ae(same_date, 2), + ] + assert [e.sequence_id for e in p.adverse_events] == [1, 2, 3] + + def test_none_values_sort_last(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + p.adverse_events = [ + _ae(None, 1), + _ae(dt.date(2024, 2, 1), 2), + _ae(dt.date(2024, 1, 1), 3), + ] + ordered = [(e.start_date, 
e.sequence_id) for e in p.adverse_events] + assert ordered == [ + (dt.date(2024, 1, 1), 3), + (dt.date(2024, 2, 1), 2), + (None, 1), + ] + + def test_none_in_secondary_field_sorts_last_within_group(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + same_date = dt.date(2024, 1, 1) + p.adverse_events = [ + _ae(same_date, None), + _ae(same_date, 2), + _ae(same_date, 1), + ] + assert [e.sequence_id for e in p.adverse_events] == [1, 2, None] + + def test_all_none_natural_key_is_stable(self): + """All-None keys produce equal sort keys, the order is stable.""" + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + a = _ae(None, None, term="A") + b = _ae(None, None, term="B") + c = _ae(None, None, term="C") + p.adverse_events = [a, b, c] + assert [e.term for e in p.adverse_events] == ["A", "B", "C"] + + def test_empty_collection(self): + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + p.adverse_events = [] + assert p.adverse_events == () + + def test_works_across_domain_types(self): + """Same mechanism works on a different collection with the same NK shape.""" + p = Patient(patient_id=PATIENT_ID, trial_id="T1") + mh1 = MedicalHistory(PATIENT_ID) + mh1.term = "hypertension" + mh1.start_date = dt.date(2024, 3, 1) + mh1.sequence_id = 1 + + mh2 = MedicalHistory(PATIENT_ID) + mh2.term = "diabetes" + mh2.start_date = dt.date(2024, 1, 1) + mh2.sequence_id = 2 + + p.medical_histories = [mh1, mh2] + assert [m.start_date for m in p.medical_histories] == [ + dt.date(2024, 1, 1), + dt.date(2024, 3, 1), + ] diff --git a/tests/harmonization/models/test_schema_validation.py b/tests/harmonization/models/test_schema_validation.py index d3b824e..92f90ed 100644 --- a/tests/harmonization/models/test_schema_validation.py +++ b/tests/harmonization/models/test_schema_validation.py @@ -9,12 +9,13 @@ from omop_etl.harmonization.models.domain.best_overall_response import BestOverallResponse from omop_etl.harmonization.models.domain.biomarkers import Biomarkers from 
omop_etl.harmonization.models.domain.c30 import C30 +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from omop_etl.harmonization.models.domain.ecog_baseline import EcogBaseline from omop_etl.harmonization.models.domain.eq5d import EQ5D from omop_etl.harmonization.models.domain.followup import FollowUp from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.study_drugs import StudyDrugs from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment import TumorAssessment @@ -34,10 +35,13 @@ def _get_all_constant_values(*sections) -> set[str]: return values +_DOMAIN_BASE_PROPERTIES = {name for name, attr in vars(DomainBase).items() if isinstance(attr, property)} + + def _get_data_properties(cls) -> set[str]: props = set() for name in dir(cls): - if name.startswith("_"): + if name.startswith("_") or name in _DOMAIN_BASE_PROPERTIES: continue attr = getattr(cls, name, None) if isinstance(attr, property): @@ -68,12 +72,13 @@ def test_no_extra_constants(self): BestOverallResponse, Biomarkers, C30, + ClinicalBenefit, ConcomitantMedication, EcogBaseline, EQ5D, FollowUp, MedicalHistory, - PreviousTreatments, + PreviousTreatment, StudyDrugs, TreatmentCycleComponent, TumorAssessment, diff --git a/tests/omop/builders/test_condition_occurrence.py b/tests/omop/builders/test_condition_occurrence.py index 97fe929..27a4937 100644 --- a/tests/omop/builders/test_condition_occurrence.py +++ b/tests/omop/builders/test_condition_occurrence.py @@ -426,3 +426,250 @@ def test_row_ids_are_deterministic(self, static_index, 
structural_index): rows_b = ConditionOccurrenceBuilder(concepts).build(create_build_context(patient, PERSON_ID)) assert rows_a[0].condition_occurrence_id == rows_b[0].condition_occurrence_id + + +class TestAdverseEventFKLinkage: + """ + CDM 5.4 observation_event_id linkage: ConditionOccurrenceBuilder publishes + each AE's sequence_id, condition_occurrence_id into BuildContext so + ObservationBuilder can attribute was_serious & turned_serious_date back to + the AE's condition row. + """ + + def _ae_semantic(self, leaf_index: int, concept_id: int, name: str) -> SemanticEntry: # noqa + return SemanticEntry( + patient_id=PID, + field_path=(Patient.Collections.ADVERSE_EVENTS, AdverseEvent.Fields.TERM), + leaf_index=leaf_index, + concept_id=concept_id, + name=name, + domain="condition", + ) + + def test_publishes_link_when_sequence_id_set(self, static_index, structural_index): + semantic = create_semantic_index(self._ae_semantic(0, 437663, "fever")) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 42 + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 1 + assert ctx.condition_id_by_ae_sequence_id == {42: rows[0].condition_occurrence_id} + + def test_no_link_when_sequence_id_missing_but_row_still_emitted(self, static_index, structural_index, caplog): + """AE without sequence_id: row is emitted, but produces no FK entry and warns.""" + import logging + + semantic = create_semantic_index(self._ae_semantic(0, 437663, "fever")) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + patient.adverse_events = [ae] + ctx = 
create_build_context(patient, PERSON_ID) + + with caplog.at_level(logging.WARNING): + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 1, "AE row must still be emitted when sequence_id is missing" + assert ctx.condition_id_by_ae_sequence_id == {} + assert any("missing sequence_id" in rec.message for rec in caplog.records) + + def test_no_link_when_no_semantic_match(self, static_index, structural_index): + """AE with sequence_id but no semantic match emits no row and no FK entry.""" + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "UnmappedTerm" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 7 + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert rows == [] + assert ctx.condition_id_by_ae_sequence_id == {} + + def test_multi_ae_each_linked_by_sequence_id(self, static_index, structural_index): + """Multiple AEs get their own FK entry keyed by their sequence_id.""" + semantic = create_semantic_index( + self._ae_semantic(0, 437663, "fever"), + self._ae_semantic(1, 4329847, "nausea"), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + + ae1 = AdverseEvent(patient_id=PID) + ae1.term = "Fever" + ae1.start_date = dt.date(2023, 3, 1) + ae1.sequence_id = 1 + + ae2 = AdverseEvent(patient_id=PID) + ae2.term = "Nausea" + ae2.start_date = dt.date(2023, 4, 1) + ae2.sequence_id = 2 + + patient.adverse_events = [ae1, ae2] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 2 + # both sequence_ids present and pointing to existing row ids + emitted_ids = {r.condition_occurrence_id for r in rows} + assert set(ctx.condition_id_by_ae_sequence_id.keys()) == {1, 2} + 
assert set(ctx.condition_id_by_ae_sequence_id.values()).issubset(emitted_ids) + + def test_mixed_seq_id_present_and_missing(self, static_index, structural_index): + """One AE with sequence_id and one without: only the first is linked, both emit rows.""" + semantic = create_semantic_index( + self._ae_semantic(0, 437663, "fever"), + self._ae_semantic(1, 4329847, "nausea"), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + + ae1 = AdverseEvent(patient_id=PID) + ae1.term = "Fever" + ae1.start_date = dt.date(2023, 3, 1) + ae1.sequence_id = 1 + + ae2 = AdverseEvent(patient_id=PID) + ae2.term = "Nausea" + ae2.start_date = dt.date(2023, 4, 1) + + patient.adverse_events = [ae1, ae2] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 2 + assert set(ctx.condition_id_by_ae_sequence_id.keys()) == {1} + + def test_multi_concept_ae_links_to_first_row(self, static_index, structural_index): + """When one AE term maps to multiple condition concepts, FK links to the first emitted row.""" + semantic = create_semantic_index( + self._ae_semantic(0, 437663, "fever"), + self._ae_semantic(0, 999999, "alternative fever concept"), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 99 + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert len(rows) == 2 + # one FK entry: pointing to the first emitted row + assert ctx.condition_id_by_ae_sequence_id == {99: rows[0].condition_occurrence_id} + + def test_fk_publication_deterministic(self, static_index, structural_index): + """Two independent builds of the same patient produce identical FK state.""" + 
semantic = create_semantic_index(self._ae_semantic(0, 437663, "fever")) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 3, 1) + ae.sequence_id = 5 + patient.adverse_events = [ae] + + ctx_a = create_build_context(patient, PERSON_ID) + ctx_b = create_build_context(patient, PERSON_ID) + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_a) + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_b) + + assert ctx_a.condition_id_by_ae_sequence_id == ctx_b.condition_id_by_ae_sequence_id + assert ctx_a.condition_id_by_ae_sequence_id != {} + + +class TestPrimaryCancerFKPublication: + """ + Oncology CDM guideline: cancer-modifier Measurement rows (dimensions, + biomarkers, optional future metastasis/node/stage) should link back to the + primary cancer's condition_occurrence_id. ConditionOccurrenceBuilder + publishes that id from the tumor_type emission. 
+ """ + + @staticmethod + def _tumor_semantic(concept_id: int) -> SemanticEntry: + return SemanticEntry( + patient_id=PID, + field_path=(Patient.Singletons.TUMOR_TYPE, TumorType.Fields.ICD10_CODE), + leaf_index=None, + concept_id=concept_id, + name="neoplasm", + domain="condition", + ) + + def test_publishes_primary_cancer_id_from_tumor_type(self, static_index, structural_index): + semantic = create_semantic_index(self._tumor_semantic(4000)) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + tumor = TumorType(patient_id=PID) + tumor.icd10_code = "C50.9" + tumor.date = dt.date(2022, 6, 1) + patient.tumor_type = tumor + ctx = create_build_context(patient, PERSON_ID) + + rows = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + tumor_row = next(r for r in rows if r.condition_concept_id == 4000) + assert ctx.condition_id_primary_cancer == tumor_row.condition_occurrence_id + + def test_no_primary_cancer_id_when_tumor_type_absent(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ctx = create_build_context(patient, PERSON_ID) + + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert ctx.condition_id_primary_cancer is None + + def test_no_primary_cancer_id_when_tumor_unmapped(self, static_index, structural_index): + """Tumor type present but no semantic match: no row and no FK published.""" + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + tumor = TumorType(patient_id=PID) + tumor.icd10_code = "C99.99" + tumor.date = dt.date(2022, 6, 1) + patient.tumor_type = tumor + ctx = create_build_context(patient, PERSON_ID) + + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx) + + assert ctx.condition_id_primary_cancer is None + + def test_multi_concept_tumor_picks_first_row_deterministically(self, static_index, 
structural_index): + """Two semantic matches for the tumor: two rows, FK is first row's id.""" + semantic = create_semantic_index( + self._tumor_semantic(4000), + self._tumor_semantic(4001), + ) + concepts = ConceptLookupService(static_index, structural_index, semantic) + patient = create_patient(PID, TRIAL) + tumor = TumorType(patient_id=PID) + tumor.icd10_code = "C50.9" + tumor.date = dt.date(2022, 6, 1) + patient.tumor_type = tumor + ctx_a = create_build_context(patient, PERSON_ID) + ctx_b = create_build_context(patient, PERSON_ID) + + rows_a = ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_a) + ConditionOccurrenceBuilder(concepts).build_and_populate(ctx_b) + + assert len(rows_a) == 2 + assert ctx_a.condition_id_primary_cancer == rows_a[0].condition_occurrence_id + assert ctx_a.condition_id_primary_cancer == ctx_b.condition_id_primary_cancer diff --git a/tests/omop/builders/test_drug_exposure_builder.py b/tests/omop/builders/test_drug_exposure_builder.py index 14f5f49..963925b 100644 --- a/tests/omop/builders/test_drug_exposure_builder.py +++ b/tests/omop/builders/test_drug_exposure_builder.py @@ -2,7 +2,7 @@ from omop_etl.concept_mapping.service import ConceptLookupService from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.patient import Patient from omop_etl.omop.builders.drug_exposure import DrugExposureBuilder @@ -337,7 +337,7 @@ def test_additional_treatment_maps_to_drug(self, static_index, structural_index) semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + 
field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=1524674, name="zoledronic acid", @@ -346,7 +346,7 @@ def test_additional_treatment_maps_to_drug(self, static_index, structural_index) ) ) patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Chemotherapy" prev.additional_treatment = "Zometa" prev.start_date = dt.date(2022, 6, 1) @@ -366,7 +366,7 @@ def test_main_treatment_maps_to_drug(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=1304850, name="letrozole", @@ -375,7 +375,7 @@ def test_main_treatment_maps_to_drug(self, static_index, structural_index): ) ) patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Letrozole" prev.additional_treatment = "Additional" prev.start_date = dt.date(2022, 1, 1) @@ -391,7 +391,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=100, name="drug a", @@ -400,7 +400,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i ), SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=200, 
name="drug b", @@ -409,7 +409,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i ), ) patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Drug A" prev.additional_treatment = "Drug B" prev.start_date = dt.date(2022, 1, 1) @@ -425,7 +425,7 @@ def test_both_fields_map_to_drug_emits_two_rows(self, static_index, structural_i def test_no_drug_mapping_produces_no_row(self, static_index, structural_index): """Treatment fields without drug domain mappings produce no rows.""" patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "paracetamol" prev.start_date = dt.date(2022, 6, 1) patient.previous_treatments = [prev] @@ -436,7 +436,7 @@ def test_no_drug_mapping_produces_no_row(self, static_index, structural_index): def test_missing_start_date_skips(self, static_index, structural_index): patient = create_patient(PID, "test") - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Chemotherapy" prev.additional_treatment = "Zometa" patient.previous_treatments = [prev] @@ -556,7 +556,7 @@ def test_ids_unique_across_sources(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=1524674, name="zoledronic acid", @@ -571,7 +571,7 @@ def test_ids_unique_across_sources(self, static_index, structural_index): cycle.start_date = dt.date(2023, 1, 1) patient.treatment_cycles = [cycle] - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Chemotherapy" prev.additional_treatment = "Zometa" prev.start_date = 
dt.date(2022, 1, 1) diff --git a/tests/omop/builders/test_measurement.py b/tests/omop/builders/test_measurement.py index f77e20c..05a9460 100644 --- a/tests/omop/builders/test_measurement.py +++ b/tests/omop/builders/test_measurement.py @@ -13,8 +13,12 @@ from omop_etl.omop.builders.measurement import MeasurementBuilder from omop_etl.omop.builders.visit_occurrence import VisitOccurrenceBuilder from omop_etl.omop.core.id_generator import sha1_bigint +import pytest + from tests.omop.conftest import ( SemanticEntry, + _static, + _structural, create_build_context, create_patient, create_semantic_index, @@ -302,7 +306,7 @@ def test_irecist_with_divergent_source_string(self, static_index, structural_ind def test_unmapped_response_is_skipped(self, static_index, structural_index): patient = create_patient(PID, TRIAL) patient.tumor_assessments = [ - _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="Not Evaluable (NE)"), + _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="invalid"), ] rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(create_build_context(patient, PERSON_ID)) @@ -312,6 +316,47 @@ def test_unmapped_response_is_skipped(self, static_index, structural_index): assert rows[0].measurement_concept_id == 4084390 assert rows[0].value_as_number == 28.987 + def test_only_not_evaluable(self, static_index, structural_index): + patient = create_patient(PID, TRIAL) + patient.tumor_assessments = [ + _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="Not Evaluable (NE)"), + ] + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(create_build_context(patient, PERSON_ID)) + + # size row and Not Evaluable row + assert len(rows) == 2 + assert rows[0].measurement_concept_id == 4084390 # lesion size + assert rows[0].value_as_number == 28.987 + assert rows[1].measurement_concept_id == 734317 # RECIST structural + assert 
rows[1].value_as_concept_id == 45878793 # NE qualifier + assert rows[1].value_as_number is None + + def test_not_evaluable_and_evaluable_produce_four_rows(self, static_index, structural_index): + patient = create_patient(PID, TRIAL) + patient.tumor_assessments = [ + _make_tumor_assessments(dt.date(2040, 11, 22), "V05", size=28.987, recist="Stable Disease (SD)"), + _make_tumor_assessments(dt.date(2040, 12, 22), "V06", size=300.0, recist="Not Evaluable"), + ] + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(create_build_context(patient, PERSON_ID)) + + # each TumorAssessment produces its own size row + its recist row: 4 total + assert len(rows) == 4 + + # V05: size & precoordinated SD response + assert rows[0].measurement_concept_id == 4084390 # lesion size + assert rows[0].value_as_number == 28.987 + assert rows[1].measurement_concept_id == 1634680 # RECIST SD precoordinated + assert rows[1].value_as_concept_id is None + + # V06: size & NE (structural RECIST and NE qualifier) + assert rows[2].measurement_concept_id == 4084390 + assert rows[2].value_as_number == 300.0 + assert rows[3].measurement_concept_id == 734317 + assert rows[3].value_as_concept_id == 45878793 + assert rows[3].value_as_number is None + def test_missing_date_returns_empty_for_instance(self, static_index, structural_index): patient = create_patient(PID, TRIAL) ta = TumorAssessment(PID) @@ -589,7 +634,7 @@ def test_dimension_uses_precoordinated_concept(self, static_index, structural_in assert row_1.person_id == PERSON_ID assert row_1.measurement_date == dt.date(2040, 5, 1) assert row_1.measurement_datetime == dt.datetime(2040, 5, 1) - assert row_1.measurement_id == 3952701007853139582 + assert row_1.measurement_id == 5607913108096982206 # fixme: assert on expected hash from collection's natural key instead # q2 level 5 row_2 = rows[1] @@ -1448,3 +1493,185 @@ def test_row_id_deterministic(self, static_index, structural_index): rows_2 = 
MeasurementBuilder(ConceptLookupService(static_index, structural_index, semantic)).build(context) assert rows_1[0].measurement_id == rows_2[0].measurement_id + + +CDM_FIELD_CID = 1147127 +UNIT_MM_CID = 8588 + + +def _with_cdm_field(static_index: dict) -> dict: + """Add the cdm_field static entry used to identify the FK target field.""" + static_index[("cdm_field", "condition_occurrence.condition_occurrence_id")] = _static( + "cdm_field", + "condition_occurrence.condition_occurrence_id", + CDM_FIELD_CID, + "metadata", + ) + return static_index + + +def _with_millimeter(structural_index: dict) -> dict: + """Add the millimeter unit concept (UCUM) so lesion-size rows can populate unit_concept_id.""" + structural_index["millimeter"] = _structural("millimeter", UNIT_MM_CID, "unit") + return structural_index + + +class TestPrimaryCancerFKConsumption: + """ + MeasurementBuilder consumes BuildContext.condition_id_primary_cancer + (published by ConditionOccurrenceBuilder) to set + measurement_event_id + meas_event_field_concept_id on lesion-size + (TumorAssessmentBaseline + TumorAssessment) and biomarker rows. + + Cancer modifier rows (lesion size as Dimension of Tumor, biomarkers) link + back to the primary cancer condition via measurement_event_id + + meas_event_field_concept_id, per oncology CDM guidelines. 
+ """ + + @staticmethod + def _baseline_patient() -> Patient: + patient = create_patient(PID, TRIAL) + baseline = TumorAssessmentBaseline(PID) + baseline.target_lesion_size = 41 + baseline.target_lesion_measurement_date = dt.date(2040, 4, 19) + patient.tumor_assessment_baseline = baseline + return patient + + def test_baseline_lesion_size_links_to_primary_cancer(self, static_index, structural_index): + _with_cdm_field(static_index) + _with_millimeter(structural_index) + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 12345 + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.measurement_event_id == 12345 + assert row.meas_event_field_concept_id == CDM_FIELD_CID + assert row.unit_concept_id == UNIT_MM_CID + + def test_baseline_lesion_size_no_fk_when_primary_cancer_not_published(self, static_index, structural_index): + _with_millimeter(structural_index) + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) + # ctx.condition_id_primary_cancer left as None + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + assert rows[0].measurement_event_id is None + assert rows[0].meas_event_field_concept_id is None + # unit_concept_id is independent of FK linkage: still populated + assert rows[0].unit_concept_id == UNIT_MM_CID + + def test_baseline_lesion_size_unit_missing_falls_back_to_none(self, static_index, structural_index): + """structural index without millimeter: unit_concept_id is None.""" + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + assert rows[0].unit_concept_id is None + + def 
test_baseline_lesion_size_raises_when_primary_cancer_published_but_cdm_field_missing(self, static_index, structural_index): + """If a primary cancer condition is published, the cdm_field entry is required""" + _with_millimeter(structural_index) + patient = self._baseline_patient() + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 12345 + + with pytest.raises(RuntimeError, match="cdm_field"): + MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + def test_tumor_assessment_lesion_size_links_to_primary_cancer(self, static_index, structural_index): + _with_cdm_field(static_index) + _with_millimeter(structural_index) + patient = create_patient(PID, TRIAL) + patient.tumor_assessments = [ + _make_tumor_assessments(dt.date(2040, 6, 14), "V03", size=20.5), + ] + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 67890 + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + size_rows = [r for r in rows if r.measurement_concept_id == 4084390] + assert len(size_rows) == 1 + assert size_rows[0].measurement_event_id == 67890 + assert size_rows[0].meas_event_field_concept_id == CDM_FIELD_CID + assert size_rows[0].unit_concept_id == UNIT_MM_CID + + def test_biomarker_links_to_primary_cancer(self, static_index, structural_index): + _with_cdm_field(static_index) + semantic = create_semantic_index( + SemanticEntry( + patient_id=PID, + field_path=(Patient.Singletons.BIOMARKERS, Biomarkers.Fields.COHORT_TARGET_MUTATION), + leaf_index=None, + concept_id=4000, + name="braf non-v600", + domain="measurement", + ) + ) + patient = create_patient(PID, TRIAL) + biomarkers = Biomarkers(PID) + biomarkers.cohort_target_mutation = "BRAF non-V600" + biomarkers.date = dt.date(2040, 1, 1) + patient.biomarkers = biomarkers + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 77777 + + rows = 
MeasurementBuilder(ConceptLookupService(static_index, structural_index, semantic)).build(ctx) + + assert len(rows) == 1 + assert rows[0].measurement_event_id == 77777 + assert rows[0].meas_event_field_concept_id == CDM_FIELD_CID + + def test_biomarker_no_fk_when_primary_cancer_not_published(self, static_index, structural_index): + semantic = create_semantic_index( + SemanticEntry( + patient_id=PID, + field_path=(Patient.Singletons.BIOMARKERS, Biomarkers.Fields.COHORT_TARGET_MUTATION), + leaf_index=None, + concept_id=4000, + name="braf non-v600", + domain="measurement", + ) + ) + patient = create_patient(PID, TRIAL) + biomarkers = Biomarkers(PID) + biomarkers.cohort_target_mutation = "BRAF non-V600" + biomarkers.date = dt.date(2040, 1, 1) + patient.biomarkers = biomarkers + ctx = create_build_context(patient, PERSON_ID) + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index, semantic)).build(ctx) + + assert len(rows) == 1 + assert rows[0].measurement_event_id is None + assert rows[0].meas_event_field_concept_id is None + + def test_non_cancer_modifier_rows_have_no_fk(self, static_index, structural_index): + """ + ECOG, C30, EQ5D, AE-measurement, MH-measurement rows are not + cancer modifiers and should not link to primary cancer. + Verified here on ECOG (AE/MH rows are tested elsewhere). 
+ """ + _with_cdm_field(static_index) + patient = create_patient(PID, TRIAL) + ecog = EcogBaseline(PID) + ecog.grade = 1 + ecog.date = dt.date(2040, 1, 1) + patient.ecog_baseline = ecog + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_primary_cancer = 99999 + + rows = MeasurementBuilder(ConceptLookupService(static_index, structural_index)).build(ctx) + + assert len(rows) == 1 + # ECOG rows are not cancer modifiers: no FK + assert rows[0].measurement_event_id is None + assert rows[0].meas_event_field_concept_id is None diff --git a/tests/omop/builders/test_observation_builder.py b/tests/omop/builders/test_observation_builder.py new file mode 100644 index 0000000..d452526 --- /dev/null +++ b/tests/omop/builders/test_observation_builder.py @@ -0,0 +1,724 @@ +import datetime as dt +import logging +import pytest + +from omop_etl.concept_mapping.service import ConceptLookupService +from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent +from omop_etl.harmonization.models.domain.clinical_benefit import ClinicalBenefit +from omop_etl.harmonization.models.domain.followup import FollowUp +from omop_etl.harmonization.models.patient import Patient +from omop_etl.omop.builders.observation import ObservationBuilder +from omop_etl.omop.core.id_generator import sha1_bigint +from tests.omop.conftest import ( + _static, + _structural, + create_build_context, + create_patient, +) + +PID = "p1" +TRIAL = "test" +PERSON_ID = sha1_bigint("person", PID) + +YES_CID = 4188539 +NO_CID = 4188540 +CDM_FIELD_CID = 1147127 +AE_OUTCOME_TOPIC_CID = 4231813 + + +def _with_yes_no(structural_index: dict) -> dict: + """Yes/No structural Meas Value concepts (OHDSI ETL convention for booleans).""" + structural_index["yes"] = _structural("yes", YES_CID, "meas value") + structural_index["no"] = _structural("no", NO_CID, "meas value") + return structural_index + + +def _with_cdm_field(static_index: dict) -> dict: + """cdm_field static entry for AE: 
condition_occurrence FK linkage.""" + static_index[("cdm_field", "condition_occurrence.condition_occurrence_id")] = _static( + "cdm_field", + "condition_occurrence.condition_occurrence_id", + CDM_FIELD_CID, + "metadata", + ) + return static_index + + +def _with_ae_outcome_topic(structural_index: dict) -> dict: + structural_index["adverse_event_outcome"] = _structural("adverse_event_outcome", AE_OUTCOME_TOPIC_CID, "observation") + return structural_index + + +class TestObservationBuilder: + def test_table_name(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + assert ObservationBuilder(concepts).table_name == "observation" + + def test_empty_patient_returns_empty(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + +class TestEvaluableForEfficacy: + """ + Pattern 1 (unmapped source attribute): + concept_id=0, source_value=field name, value_source_value=lowercase literal. 
+ """ + + def test_true_emits_row_with_yes_value(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + evaluable_for_efficacy_analysis=True, + treatment_start_date=dt.date(2023, 1, 10), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 1, 10) + assert row.observation_type_concept_id == 32817 + assert row.observation_source_value == "evaluable_for_efficacy_analysis" + assert row.observation_source_concept_id == 0 + assert row.value_as_concept_id == YES_CID + assert row.value_source_value == "true" + + def test_false_emits_row_with_no_value(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + evaluable_for_efficacy_analysis=False, + treatment_start_date=dt.date(2023, 1, 10), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].value_as_concept_id == NO_CID + assert rows[0].value_source_value == "false" + + def test_yes_no_missing_falls_back_to_zero(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + evaluable_for_efficacy_analysis=True, + treatment_start_date=dt.date(2023, 1, 10), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].value_as_concept_id == 0 + + def test_skipped_when_value_is_none(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, treatment_start_date=dt.date(2023, 1, 10)) + + 
rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_skipped_when_treatment_start_date_missing(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, evaluable_for_efficacy_analysis=True) + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("treatment_start_date" in rec.message for rec in caplog.records) + + +class TestClinicalBenefit: + """ + Pattern 1 (no topic concept): concept_id=0, observation_source_value + encodes the week as `has_clinical_benefit_at_week_{week}`, value_as_concept_id + is Yes/No concepts, observation_date is the singleton's date. + """ + + @staticmethod + def _make_singleton( + *, + has_benefit: bool | None, + week: int | None = 16, + date: dt.date | None = dt.date(2023, 4, 20), + ) -> ClinicalBenefit: + cb = ClinicalBenefit(patient_id=PID) + cb.week = week + cb.has_benefit = has_benefit + cb.date = date + return cb + + def test_emits_row_with_yes_for_true(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=True) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 4, 20) + assert row.observation_source_value == "has_clinical_benefit_at_week_16" + assert row.observation_source_concept_id == 0 + assert row.value_as_concept_id == YES_CID + assert row.value_source_value == "true" + + def test_emits_row_with_no_for_false(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts =
ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=False) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].value_as_concept_id == NO_CID + assert rows[0].value_source_value == "false" + + def test_source_value_encodes_week_for_other_timepoints(self, static_index, structural_index): + """Week 24 from another source produces: `has_clinical_benefit_at_week_24`.""" + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton( + has_benefit=True, + week=24, + date=dt.date(2023, 6, 1), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].observation_source_value == "has_clinical_benefit_at_week_24" + assert rows[0].observation_date == dt.date(2023, 6, 1) + + def test_singleton_absent_returns_empty(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_skipped_when_has_benefit_is_none(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=None) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_skipped_when_date_is_none(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=True, date=None) + + with 
caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("no date" in rec.message for rec in caplog.records) + + def test_skipped_when_week_is_none(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + patient.clinical_benefit = self._make_singleton(has_benefit=True, week=None) + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("no week" in rec.message for rec in caplog.records) + + +class TestEndOfTreatmentReason: + """ + Pattern 1: concept_id=0, field name in source_value, mapped reason concept + in value_as_concept_id (or 0 if unmapped), raw reason preserved in both + value_as_string and value_source_value. + """ + + def test_mapped_reason_emits_row(self, static_index, structural_index): + static_index[("eot_reason", "disease progression")] = _static("eot_reason", "disease progression", 1617595, "observation") + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + end_of_treatment_reason="Disease progression", + end_of_treatment_date=dt.date(2023, 8, 1), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 8, 1) + assert row.observation_source_value == "end_of_treatment_reason" + assert row.observation_source_concept_id == 0 + assert row.value_as_concept_id == 1617595 + assert row.value_as_string == "Disease progression" + assert row.value_source_value == "Disease progression" + + def test_unmapped_reason_emits_row_with_value_concept_zero(self, static_index, structural_index): + """ + No static mapping: row still emits, 
value_as_concept_id=0, raw + reason preserved in value_as_string and value_source_value. + """ + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + end_of_treatment_reason="Some new reason not in mapping", + end_of_treatment_date=dt.date(2023, 8, 1), + ) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.value_as_concept_id == 0 + assert row.value_as_string == "Some new reason not in mapping" + assert row.value_source_value == "Some new reason not in mapping"[:50] + assert row.observation_source_value == "end_of_treatment_reason" + + def test_skipped_without_eot_date(self, static_index, structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL, end_of_treatment_reason="Disease progression") + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("end_of_treatment_date" in rec.message for rec in caplog.records) + + +class TestLostToFollowup: + """ + Pattern 2 (mapped topic): concept_id=Lost-to-follow-up, + source_value=field name, value_source_value=lowercase literal. 
+ """ + + def test_lost_to_followup_true_emits_row(self, static_index, structural_index): + _with_yes_no(structural_index) + static_index[("lost_to_followup", "true")] = _static("lost_to_followup", "true", 4163894, "observation") + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = True + followup.date_lost_to_followup = dt.date(2023, 12, 1) + patient.lost_to_followup = followup + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 4163894 + assert row.observation_date == dt.date(2023, 12, 1) + assert row.value_as_concept_id == YES_CID + assert row.observation_source_value == "lost_to_followup" + assert row.observation_source_concept_id == 0 + assert row.value_source_value == "true" + + def test_lost_to_followup_false_emits_row_with_no_value(self, static_index, structural_index): + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = False + followup.date_lost_to_followup = dt.date(2023, 12, 1) + patient.lost_to_followup = followup + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert len(rows) == 1 + assert rows[0].observation_concept_id == 0 + assert rows[0].value_as_concept_id == NO_CID + assert rows[0].observation_source_value == "lost_to_followup" + assert rows[0].value_source_value == "false" + + def test_singleton_absent_returns_empty(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_missing_date_skips(self, static_index, 
structural_index, caplog): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = True + patient.lost_to_followup = followup + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + assert any("date_lost_to_followup" in rec.message for rec in caplog.records) + + +class TestAdverseEventOutcome: + """Pattern 2 and 3: topic concept (structural adverse_event_outcome) and + answer concept (static adverse_event_outcome, {outcome}). Either lookup + can miss and the row still emits with concept_id=0 fallback, as long + as outcome and start_date are present. FK-linked.""" + + def _make_patient(self, outcome: str | None) -> Patient: # noqa + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.outcome = outcome + ae.sequence_id = 1 + patient.adverse_events = [ae] + return patient + + def test_mapped_outcome_emits_row(self, static_index, structural_index): + _with_ae_outcome_topic(structural_index) + _with_cdm_field(static_index) + static_index[("adverse_event_outcome", "recovering/resolving")] = _static("adverse_event_outcome", "recovering/resolving", 1074213, "observation") + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Recovering/resolving") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == AE_OUTCOME_TOPIC_CID + assert row.observation_date == dt.date(2023, 5, 1) + assert row.value_as_concept_id == 1074213 + assert row.observation_source_value == "outcome" + assert row.observation_source_concept_id == 0 + assert row.value_source_value == "Recovering/resolving" +
assert row.observation_event_id == 999 + assert row.obs_event_field_concept_id == CDM_FIELD_CID + + def test_topic_structural_missing_falls_back_to_zero(self, static_index, structural_index): + """ + No topic structural: concept_id=0 but row still emits with mapped + value and raw outcome preserved in value_source_value. + """ + _with_cdm_field(static_index) + static_index[("adverse_event_outcome", "recovering/resolving")] = _static("adverse_event_outcome", "recovering/resolving", 1074213, "observation") + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Recovering/resolving") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.value_as_concept_id == 1074213 + assert row.value_source_value == "Recovering/resolving" + + def test_value_static_missing_falls_back_to_zero(self, static_index, structural_index): + """ + No static mapping for the outcome text: value_as_concept_id=0, + row still emits with topic concept and raw outcome preserved. + """ + _with_ae_outcome_topic(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Some unmapped outcome") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == AE_OUTCOME_TOPIC_CID + assert row.value_as_concept_id == 0 + assert row.value_source_value == "Some unmapped outcome" + + def test_both_lookups_missing_emits_zero_row_with_raw_value(self, static_index, structural_index): + """ + Worst case: no mappings at all, row still emits with both + concept ids 0 and value_source_value preserving the raw text. 
+ """ + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient("Some outcome") + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 999 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.value_as_concept_id == 0 + assert row.value_source_value == "Some outcome" + assert row.observation_source_value == "outcome" + + def test_outcome_none_emits_nothing(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(None) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + +class TestAdverseEventWasSerious: + """ + Pattern 3: concept_id=0 and FK linkage. Emits for both True and False + (records the assessment either way). + """ + + def _make_patient(self, was_serious: bool | None) -> Patient: # noqa + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.sequence_id = 42 + ae.was_serious = was_serious + patient.adverse_events = [ae] + return patient + + def test_was_serious_true_emits_row_with_fk(self, static_index, structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(True) + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[42] = 123456789 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 5, 1) + assert row.value_as_concept_id == YES_CID + assert row.observation_source_value == "was_serious" + assert row.observation_source_concept_id == 0 + assert 
row.value_source_value == "true" + assert row.observation_event_id == 123456789 + assert row.obs_event_field_concept_id == CDM_FIELD_CID + + def test_was_serious_false_emits_row_with_no_value(self, static_index, structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(False) + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[42] = 123456789 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + assert rows[0].value_as_concept_id == NO_CID + assert rows[0].value_source_value == "false" + + def test_was_serious_none_emits_nothing(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(None) + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + def test_no_fk_when_no_condition_row_published(self, static_index, structural_index, caplog): + """ + AE with sequence_id but no published condition_occurrence row: + observation still emits, FK fields left blank, warning logged. 
+ """ + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(True) + ctx = create_build_context(patient, PERSON_ID) + # condition_id_by_ae_sequence_id stays empty + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + assert rows[0].observation_event_id is None + assert rows[0].obs_event_field_concept_id is None + assert any("missing event_id" in rec.message for rec in caplog.records) + + def test_no_fk_when_ae_missing_sequence_id(self, static_index, structural_index, caplog): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.was_serious = True + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + + with caplog.at_level(logging.WARNING): + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + assert rows[0].observation_event_id is None + assert rows[0].obs_event_field_concept_id is None + assert any("missing sequence_id" in rec.message for rec in caplog.records) + + def test_raises_when_cdm_field_missing_but_fk_resolvable(self, static_index, structural_index): + """ + cdm_field is required: builder raises. 
+ """ + _with_yes_no(structural_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = self._make_patient(True) + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[42] = 987654321 + + with pytest.raises(RuntimeError, match="cdm_field"): + ObservationBuilder(concepts).build(ctx) + + +class TestAdverseEventTurnedSerious: + def test_emits_row_on_turned_serious_date(self, static_index, structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.sequence_id = 7 + ae.turned_serious_date = dt.date(2023, 5, 5) + patient.adverse_events = [ae] + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[7] = 555 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 1 + row = rows[0] + assert row.observation_concept_id == 0 + assert row.observation_date == dt.date(2023, 5, 5) + assert row.value_as_concept_id == YES_CID + assert row.observation_source_value == "turned_serious_date" + assert row.value_source_value == "2023-05-05" + assert row.observation_event_id == 555 + assert row.obs_event_field_concept_id == CDM_FIELD_CID + + def test_skipped_when_turned_serious_date_unset(self, static_index, structural_index): + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.sequence_id = 7 + patient.adverse_events = [ae] + + rows = ObservationBuilder(concepts).build(create_build_context(patient, PERSON_ID)) + + assert rows == [] + + +class TestCombinedSources: + def test_multi_source_uniqueness_and_determinism(self, static_index, structural_index): + _with_yes_no(structural_index) + 
_with_cdm_field(static_index) + _with_ae_outcome_topic(structural_index) + static_index[("eot_reason", "other")] = _static("eot_reason", "other", 35821954, "observation") + static_index[("lost_to_followup", "true")] = _static("lost_to_followup", "true", 4163894, "observation") + static_index[("adverse_event_outcome", "fatal")] = _static("adverse_event_outcome", "fatal", 4236718, "observation") + + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient( + PID, + TRIAL, + treatment_start_date=dt.date(2023, 1, 10), + evaluable_for_efficacy_analysis=True, + end_of_treatment_reason="Other", + end_of_treatment_date=dt.date(2023, 8, 1), + ) + + cb = ClinicalBenefit(patient_id=PID) + cb.week = 16 + cb.has_benefit = False + cb.date = dt.date(2023, 4, 25) + patient.clinical_benefit = cb + + followup = FollowUp(patient_id=PID) + followup.lost_to_followup = True + followup.date_lost_to_followup = dt.date(2023, 9, 1) + patient.lost_to_followup = followup + + ae = AdverseEvent(patient_id=PID) + ae.term = "Fever" + ae.start_date = dt.date(2023, 5, 1) + ae.outcome = "Fatal" + ae.was_serious = True + ae.turned_serious_date = dt.date(2023, 5, 5) + ae.sequence_id = 11 + patient.adverse_events = [ae] + + ctx_a = create_build_context(patient, PERSON_ID) + ctx_a.condition_id_by_ae_sequence_id[11] = 42 + ctx_b = create_build_context(patient, PERSON_ID) + ctx_b.condition_id_by_ae_sequence_id[11] = 42 + + rows_a = ObservationBuilder(concepts).build(ctx_a) + rows_b = ObservationBuilder(concepts).build(ctx_b) + + # 4 scalars/singleton (evaluable, clinical_benefit, eot, lost_to_followup) + # and 3 AE-derived (outcome, was_serious, turned_serious) = 7 rows + assert len(rows_a) == 7 + ids = [r.observation_id for r in rows_a] + assert len(ids) == len(set(ids)), "All observation_ids must be unique" + + ids_b = sorted(r.observation_id for r in rows_b) + assert sorted(ids) == ids_b + + def test_multiple_adverse_events_each_independent(self, static_index, 
structural_index): + _with_yes_no(structural_index) + _with_cdm_field(static_index) + concepts = ConceptLookupService(static_index, structural_index) + patient = create_patient(PID, TRIAL) + + ae1 = AdverseEvent(patient_id=PID) + ae1.term = "Fever" + ae1.start_date = dt.date(2023, 5, 1) + ae1.sequence_id = 1 + ae1.was_serious = True + + ae2 = AdverseEvent(patient_id=PID) + ae2.term = "Nausea" + ae2.start_date = dt.date(2023, 6, 1) + ae2.sequence_id = 2 + ae2.was_serious = True + + patient.adverse_events = [ae1, ae2] + ctx = create_build_context(patient, PERSON_ID) + ctx.condition_id_by_ae_sequence_id[1] = 100 + ctx.condition_id_by_ae_sequence_id[2] = 200 + + rows = ObservationBuilder(concepts).build(ctx) + + assert len(rows) == 2 + by_event_id = {r.observation_event_id: r for r in rows} + assert set(by_event_id.keys()) == {100, 200} + assert by_event_id[100].observation_date == dt.date(2023, 5, 1) + assert by_event_id[200].observation_date == dt.date(2023, 6, 1) diff --git a/tests/omop/builders/test_procedure_occurrence.py b/tests/omop/builders/test_procedure_occurrence.py index 0274613..9db8d2c 100644 --- a/tests/omop/builders/test_procedure_occurrence.py +++ b/tests/omop/builders/test_procedure_occurrence.py @@ -2,7 +2,7 @@ from omop_etl.concept_mapping.service import ConceptLookupService from omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.patient import Patient from omop_etl.omop.builders.procedure_occurrence import ProcedureOccurrenceBuilder from omop_etl.omop.core.id_generator import sha1_bigint @@ -37,7 +37,7 @@ def test_all_fields(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, 
PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -46,7 +46,7 @@ def test_all_fields(self, static_index, structural_index): ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) prev.end_date = dt.date(2021, 3, 1) @@ -66,7 +66,7 @@ def test_all_fields(self, static_index, structural_index): def test_no_procedure_match_skips(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] @@ -78,7 +78,7 @@ def test_no_procedure_match_skips(self, static_index, structural_index): def test_missing_start_date_skips(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" patient.previous_treatments = [prev] @@ -90,7 +90,7 @@ def test_end_date_can_be_none(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -99,7 +99,7 @@ def test_end_date_can_be_none(self, static_index, structural_index): ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = 
create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] @@ -115,7 +115,7 @@ def test_additional_treatment_produces_row(self, static_index, structural_index) semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=4061650, name="hormone therapy", @@ -124,7 +124,7 @@ def test_additional_treatment_produces_row(self, static_index, structural_index) ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Other" prev.additional_treatment = "Hormone therapy" prev.start_date = dt.date(2021, 5, 1) @@ -139,7 +139,7 @@ def test_additional_treatment_produces_row(self, static_index, structural_index) def test_no_match_skips(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Other" prev.additional_treatment = "Something unmapped" prev.start_date = dt.date(2021, 5, 1) @@ -154,7 +154,7 @@ def test_both_fields_produce_separate_rows(self, static_index, structural_index) semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -162,7 +162,7 @@ def 
test_both_fields_produce_separate_rows(self, static_index, structural_index) ), SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.ADDITIONAL_TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.ADDITIONAL_TREATMENT), leaf_index=0, concept_id=4061650, name="hormone therapy", @@ -171,7 +171,7 @@ def test_both_fields_produce_separate_rows(self, static_index, structural_index) ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.additional_treatment = "Hormone therapy" prev.start_date = dt.date(2021, 3, 1) @@ -247,7 +247,7 @@ def test_all_sources_combined(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -265,7 +265,7 @@ def test_all_sources_combined(self, static_index, structural_index): concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] @@ -286,7 +286,7 @@ def test_row_ids_are_deterministic(self, static_index, structural_index): semantic = create_semantic_index( SemanticEntry( patient_id=PID, - field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + field_path=(Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), leaf_index=0, concept_id=4301351, name="surgical procedure", @@ -295,7 +295,7 
@@ def test_row_ids_are_deterministic(self, static_index, structural_index): ) concepts = ConceptLookupService(static_index, structural_index, semantic) patient = create_patient(PID, TRIAL) - prev = PreviousTreatments(patient_id=PID) + prev = PreviousTreatment(patient_id=PID) prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev] diff --git a/tests/omop/conftest.py b/tests/omop/conftest.py index cf04d13..1fbb208 100644 --- a/tests/omop/conftest.py +++ b/tests/omop/conftest.py @@ -131,6 +131,9 @@ def structural_index() -> dict[str, StructuralConcept]: "c30_q29": _structural("c30_q29", 701367, "measurement"), # EQ5D VAS "eq5d_qol_score": _structural("eq5d_qol_score", 42537274, "measurement"), + "response_recist": _structural("response_recist", 734317, "measurement"), + "response_irecist": _structural("response_irecist", 734318, "measurement"), + "response_rano": _structural("response_rano", 734345, "measurement"), } @@ -165,16 +168,25 @@ def static_index() -> dict[tuple[str, str], StaticConcept]: ("eq5d_q2_answer_code", "3"): _static("eq5d_q2_answer_code", "3", 742353, "measurement"), ("eq5d_q2_answer_code", "4"): _static("eq5d_q2_answer_code", "4", 742354, "measurement"), ("eq5d_q2_answer_code", "5"): _static("eq5d_q2_answer_code", "5", 742355, "measurement"), - # tumor-response scales + # tumor-response scales: + # recist + ("response_recist", "not evaluable"): _static("response_recist", "Not evaluable", 45878793, "Meas value"), + ("response_recist", "not evaluable (ne)"): _static("response_recist", "Not evaluable (NE)", 45878793, "Meas value"), ("response_recist", "complete response (cr)"): _static("response_recist", "complete response (cr)", 1634772, "measurement"), ("response_recist", "partial response (pr)"): _static("response_recist", "partial response (pr)", 1633368, "measurement"), ("response_recist", "stable disease (sd)"): _static("response_recist", "stable disease (sd)", 1634680, "measurement"),
("response_recist", "progressive disease (pd)"): _static("response_recist", "progressive disease (pd)", 1633597, "measurement"), + # irecist + ("response_irecist", "not evaluable"): _static("response_irecist", "Not evaluable", 45878793, "Meas value"), + ("response_irecist", "not evaluable (ne)"): _static("response_irecist", "Not evaluable (NE)", 45878793, "Meas value"), ("response_irecist", "icomplete response (cr)"): _static("response_irecist", "icomplete response (cr)", 1633954, "measurement"), ("response_irecist", "ipartial response (pr)"): _static("response_irecist", "ipartial response (pr)", 1635284, "measurement"), ("response_irecist", "istable disease"): _static("response_irecist", "istable disease", 1635887, "measurement"), ("response_irecist", "iconfirmed progressive disease"): _static("response_irecist", "iconfirmed progressive disease", 1633423, "measurement"), ("response_irecist", "iunconfirmed progressive disease"): _static("response_irecist", "iunconfirmed progressive disease", 1633423, "measurement"), + # rano + ("response_rano", "not evaluable"): _static("response_rano", "Not evaluable", 45878793, "Meas value"), + ("response_rano", "not evaluable (ne)"): _static("response_rano", "Not evaluable (NE)", 45878793, "Meas value"), ("response_rano", "complete response (cr)"): _static("response_rano", "complete response (cr)", 1634853, "measurement"), ("response_rano", "partial response (pr)"): _static("response_rano", "partial response (pr)", 1634574, "measurement"), ("response_rano", "stable disease (sd)"): _static("response_rano", "stable disease (sd)", 1633447, "measurement"), diff --git a/tests/omop/test_service.py b/tests/omop/test_service.py index 86e5845..69ee0a7 100644 --- a/tests/omop/test_service.py +++ b/tests/omop/test_service.py @@ -4,7 +4,7 @@ from omop_etl.harmonization.models.domain.adverse_event import AdverseEvent from omop_etl.harmonization.models.domain.concomitant_medication import ConcomitantMedication from 
omop_etl.harmonization.models.domain.medical_history import MedicalHistory -from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatments +from omop_etl.harmonization.models.domain.previous_treatments import PreviousTreatment from omop_etl.harmonization.models.domain.treatment_cycle_component import TreatmentCycleComponent from omop_etl.harmonization.models.domain.tumor_assessment_baseline import TumorAssessmentBaseline from omop_etl.harmonization.models.domain.tumor_type import TumorType @@ -83,7 +83,7 @@ def test_all_builders_produce_output(self, static_index, structural_index): ), SemanticEntry( "p1", - (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatments.Fields.TREATMENT), + (Patient.Collections.PREVIOUS_TREATMENTS, PreviousTreatment.Fields.TREATMENT), 0, 4301351, "surgery", @@ -127,7 +127,7 @@ def test_all_builders_produce_output(self, static_index, structural_index): concom.sequence_id = 1 patient.concomitant_medications = [concom] - prev = PreviousTreatments(patient_id="p1") + prev = PreviousTreatment(patient_id="p1") prev.treatment = "Surgery" prev.start_date = dt.date(2021, 3, 1) patient.previous_treatments = [prev]