Albrja/mic-5971/lbwsg-exposure-data (#524)

albrja · web-flow · commit e5505e2ca313 · 2025-06-11T11:27:54.000-07:00
Albrja/mic-5971/lbwsg-exposure-data Update LBWSG components to use birth_exposure or exposure artifact keys depending on simulant age - *Category*: Feature - *JIRA issue*: https://jira.ihme.washington.edu/browse/MIC-5971 Changes and notes - update LBWSG risk to determine which artifact key/pipeline to use depending upon the simulants' age_end - update LBWSG Distribution component to create lookup tables for both birth_exposure and exposure artifact keys and use the provided table name to source the exposure_parameter pipeline ### Testing
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,3 +1,7 @@
+**4.2.0 - 06/11/25**
+
+  - Feature: Update LBWSG to use exposure data based on simulant age
+
 **4.1.1 - 05/23/25**
 
   - Feature: Update Observer to use super class get_configuration method
diff --git a/src/vivarium_public_health/risks/implementations/low_birth_weight_and_short_gestation.py b/src/vivarium_public_health/risks/implementations/low_birth_weight_and_short_gestation.py
@@ -14,33 +14,98 @@
 
 import numpy as np
 import pandas as pd
+from layered_config_tree import ConfigurationError
+from loguru import logger
 from vivarium.framework.engine import Builder
 from vivarium.framework.lifecycle import LifeCycleError
 from vivarium.framework.population import SimulantData
 from vivarium.framework.resource import Resource
 from vivarium.framework.values import Pipeline
 
 from vivarium_public_health.risks import Risk, RiskEffect
-from vivarium_public_health.risks.data_transformations import get_exposure_post_processor
+from vivarium_public_health.risks.data_transformations import (
+    get_exposure_post_processor,
+    pivot_categorical,
+)
 from vivarium_public_health.risks.distributions import PolytomousDistribution
-from vivarium_public_health.utilities import get_lookup_columns, to_snake_case
+from vivarium_public_health.utilities import EntityString, get_lookup_columns, to_snake_case
 
 CATEGORICAL = "categorical"
 BIRTH_WEIGHT = "birth_weight"
 GESTATIONAL_AGE = "gestational_age"
 
 
 class LBWSGDistribution(PolytomousDistribution):
+    @property
+    def categories(self) -> list[str]:
+        # These need to be sorted so the cumulative sum is in the correct order of categories
+        # and results are therefore reproducible and correct
+        return sorted(self.lookup_tables[self.exposure_key].value_columns)
 
     #################
     # Setup methods #
     #################
 
+    def __init__(
+        self,
+        risk: EntityString,
+        distribution_type: str,
+        exposure_data: int | float | pd.DataFrame | None = None,
+    ) -> None:
+        super().__init__(risk, distribution_type, exposure_data)
+        self.exposure_key = "birth_exposure"
+
     # noinspection PyAttributeOutsideInit
     def setup(self, builder: Builder) -> None:
         super().setup(builder)
         self.category_intervals = self.get_category_intervals(builder)
 
+    def build_all_lookup_tables(self, builder: Builder) -> None:
+        try:
+            birth_exposure_data = self.get_data(
+                builder, self.configuration["data_sources"]["birth_exposure"]
+            )
+            birth_exposure_value_columns = self.get_exposure_value_columns(
+                birth_exposure_data
+            )
+
+            if isinstance(birth_exposure_data, pd.DataFrame):
+                birth_exposure_data = pivot_categorical(
+                    builder, self.risk, birth_exposure_data, "parameter"
+                )
+
+            self.lookup_tables["birth_exposure"] = self.build_lookup_table(
+                builder, birth_exposure_data, birth_exposure_value_columns
+            )
+        except ConfigurationError:
+            logger.warning("Birth exposure data for LBWSG is missing from the simulation")
+        try:
+            super().build_all_lookup_tables(builder)
+        except ConfigurationError:
+            logger.warning("The data for LBWSG exposure is missing from the simulation.")
+
+        if (
+            "birth_exposure" not in self.lookup_tables
+            and "exposure" not in self.lookup_tables
+        ):
+            raise ConfigurationError(
+                "The LBWSG distribution requires either 'birth_exposure' or 'exposure' data to be "
+                "available in the simulation."
+            )
+
+    def get_exposure_parameter_pipeline(self, builder: Builder) -> Pipeline:
+        lookup_columns = []
+        if "exposure" in self.lookup_tables:
+            lookup_columns.extend(get_lookup_columns([self.lookup_tables["exposure"]]))
+        if "birth_exposure" in self.lookup_tables:
+            lookup_columns.extend(get_lookup_columns([self.lookup_tables["birth_exposure"]]))
+        return builder.value.register_value_producer(
+            self.parameters_pipeline_name,
+            source=lambda index: self.lookup_tables[self.exposure_key](index),
+            component=self,
+            required_resources=list(set(lookup_columns)),
+        )
+
     def get_category_intervals(self, builder: Builder) -> dict[str, dict[str, pd.Interval]]:
         """Gets the intervals for each category.
 
@@ -203,8 +268,9 @@ def get_exposure_column_name(axis: str) -> str:
     @property
     def configuration_defaults(self) -> dict[str, Any]:
         configuration_defaults = super().configuration_defaults
+        # Add birth exposure data source
         configuration_defaults[self.name]["data_sources"][
-            "exposure"
+            "birth_exposure"
         ] = f"{self.risk}.birth_exposure"
         configuration_defaults[self.name]["distribution_type"] = "lbwsg"
         return configuration_defaults
@@ -224,6 +290,7 @@ def __init__(self):
     def setup(self, builder: Builder) -> None:
         super().setup(builder)
         self.birth_exposures = self.get_birth_exposure_pipelines(builder)
+        self.configuration_age_end = builder.configuration.population.initialization_age_max
 
     #################
     # Setup methods #
@@ -242,7 +309,7 @@ def get_birth_exposure_pipelines(self, builder: Builder) -> dict[str, Pipeline]:
             self.exposure_distribution.lookup_tables.values()
         )
 
-        def get_pipeline(axis_: str):
+        def get_pipeline(axis_: str) -> Pipeline:
             return builder.value.register_value_producer(
                 self.birth_exposure_pipeline_name(axis_),
                 source=lambda index: self.get_birth_exposure(axis_, index),
@@ -260,12 +327,23 @@ def get_pipeline(axis_: str):
     ########################
 
     def on_initialize_simulants(self, pop_data: SimulantData) -> None:
-        birth_exposures = {
-            self.get_exposure_column_name(axis): self.birth_exposures[
-                self.birth_exposure_pipeline_name(axis)
-            ](pop_data.index)
-            for axis in self.AXES
-        }
+        if pop_data.user_data.get("age_end", self.configuration_age_end) == 0:
+            self.exposure_distribution.exposure_key = "birth_exposure"
+        else:
+            self.exposure_distribution.exposure_key = "exposure"
+
+        try:
+            birth_exposures = {
+                self.get_exposure_column_name(axis): self.birth_exposures[
+                    self.birth_exposure_pipeline_name(axis)
+                ](pop_data.index)
+                for axis in self.AXES
+            }
+        except KeyError:
+            raise ConfigurationError(
+                f"{self.exposure_distribution.exposure_key} data for {self.name} is missing from the "
+                "simulation. Simulants cannot be initialized."
+            )
         self.population_view.update(pd.DataFrame(birth_exposures))
 
     ##################################
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,8 +1,10 @@
-from collections.abc import Callable
+from collections.abc import Callable, Generator
 from pathlib import Path
 
 import pytest
+from _pytest.logging import LogCaptureFixture
 from layered_config_tree import LayeredConfigTree
+from loguru import logger
 from vivarium.framework.configuration import build_simulation_configuration
 from vivarium_testing_utils import FuzzyChecker
 
@@ -69,3 +71,10 @@ def fuzzy_checker() -> FuzzyChecker:
     yield checker
     test_dir = Path(__file__).resolve().parent
     checker.save_diagnostic_output(test_dir)
+
+
+@pytest.fixture
+def caplog(caplog: LogCaptureFixture) -> Generator[LogCaptureFixture, None, None]:
+    handler_id = logger.add(caplog.handler, format="{message}")
+    yield caplog
+    logger.remove(handler_id)
diff --git a/tests/risks/test_low_birth_weight_and_short_gestation.py b/tests/risks/test_low_birth_weight_and_short_gestation.py
@@ -1,6 +1,9 @@
 import numpy as np
 import pandas as pd
 import pytest
+from layered_config_tree import ConfigurationError
+from vivarium import InteractiveContext
+from vivarium.testing_utilities import TestPopulation
 
 from tests.risks.test_effect import _setup_risk_effect_simulation
 from tests.test_utilities import make_age_bins
@@ -57,11 +60,15 @@ def test_lbwsg_risk_effect_rr_pipeline(base_config, base_plugins, mock_rr_interp
     # Have to match age bins and rr data to make age intervals
     rr_data = make_categorical_data(agees)
     # Exposure data used for risk component
-    exposure = make_categorical_data(agees)
+    birth_exposure = make_categorical_data(agees)
+    exposure = birth_exposure.copy()
+    exposure.loc[exposure["value"] == 0.75, "value"] = 0.65
+    exposure.loc[exposure["value"] == 0.25, "value"] = 0.35
 
     # Add data dict to add to artifact
     data = {
-        f"{risk.name}.birth_exposure": exposure,
+        f"{risk.name}.birth_exposure": birth_exposure,
+        f"{risk.name}.exposure": exposure,
         f"{risk.name}.relative_risk": rr_data,
         f"{risk.name}.population_attributable_fraction": 0,
         f"{risk.name}.categories": categories,
@@ -81,6 +88,16 @@ def test_lbwsg_risk_effect_rr_pipeline(base_config, base_plugins, mock_rr_interp
     )
     sim = _setup_risk_effect_simulation(base_config, base_plugins, risk, lbwsg_effect, data)
     pop = sim.get_population()
+    # Verify exposure is used instead of birth_exposure since age end is 1.0
+    # Check values of pipeline match exposure data (0.65 / 0.35), not birth exposure data
+    exposure_pipeline_values = sim.get_value(
+        "risk_factor.low_birth_weight_and_short_gestation.exposure_parameters"
+    )(pop.index)
+    assert isinstance(exposure_pipeline_values, pd.DataFrame)
+    assert "cat81" in exposure_pipeline_values.columns
+    assert "cat82" in exposure_pipeline_values.columns
+    assert (exposure_pipeline_values["cat81"] == 0.65).all()
+    assert (exposure_pipeline_values["cat82"] == 0.35).all()
 
     expected_pipeline_name = (
         f"effect_of_{lbwsg_effect.risk.name}_on_{lbwsg_effect.target.name}.relative_risk"
@@ -120,7 +137,8 @@ def map_age_groups(value):
                 assert (actual_rr == 1.0).all()
 
 
-def test_use_birth_exposure(base_config, base_plugins, mock_rr_interpolators):
+@pytest.mark.parametrize("age_end", [0.0, 1.0])
+def test_use_exposure(base_config, base_plugins, mock_rr_interpolators, age_end):
     risk = LBWSGRisk()
     lbwsg_effect = LBWSGRiskEffect("cause.test_cause.cause_specific_mortality_rate")
 
@@ -135,7 +153,7 @@ def test_use_birth_exposure(base_config, base_plugins, mock_rr_interpolators):
     # Have to match age bins and rr data to make age intervals
     rr_data = make_categorical_data(ages)
     # Format birth exposure data
-    exposure = pd.DataFrame(
+    birth_exposure = pd.DataFrame(
         {
             "sex": ["Male", "Female", "Male", "Female"],
             "year_start": [2021, 2021, 2021, 2021],
@@ -144,35 +162,123 @@ def test_use_birth_exposure(base_config, base_plugins, mock_rr_interpolators):
             "value": [0.75, 0.75, 0.25, 0.25],
         }
     )
+    exposure = birth_exposure.copy()
+    exposure["value"] = [0.65, 0.65, 0.35, 0.35]
 
     # Add data dict to add to artifact
     data = {
-        f"{risk.name}.birth_exposure": exposure,
+        f"{risk.name}.birth_exposure": birth_exposure,
+        f"{risk.name}.exposure": exposure,
         f"{risk.name}.relative_risk": rr_data,
         f"{risk.name}.population_attributable_fraction": 0,
         f"{risk.name}.categories": categories,
         f"{risk.name}.relative_risk_interpolator": mock_rr_interpolators,
     }
 
     # Only have neontal age groups
-    age_start = 0.0
-    age_end = 1.0
+    age_end = 0.0
     base_config.update(
         {
             "population": {
-                "initialization_age_start": age_start,
+                "initialization_age_start": 0.0,
                 "initialization_age_max": age_end,
-            }
+            },
         }
     )
     sim = _setup_risk_effect_simulation(base_config, base_plugins, risk, lbwsg_effect, data)
     pop = sim.get_population()
+    # Check values of pipeline match birth exposure data since age_end is 0.0
+    exposure_pipeline_values = sim.get_value(
+        "risk_factor.low_birth_weight_and_short_gestation.exposure_parameters"
+    )(pop.index)
+    assert isinstance(exposure_pipeline_values, pd.DataFrame)
+    assert "cat81" in exposure_pipeline_values.columns
+    assert "cat82" in exposure_pipeline_values.columns
+    exposure_values = {
+        0.0: {"cat81": 0.75, "cat82": 0.25},
+        1.0: {"cat81": 0.65, "cat82": 0.35},
+    }
+    assert (exposure_pipeline_values["cat81"] == exposure_values[age_end]["cat81"]).all()
+    assert (exposure_pipeline_values["cat82"] == exposure_values[age_end]["cat82"]).all()
 
     # Assert LBWSG birth exposure columns were created
     assert "birth_weight_exposure" in pop.columns
     assert "gestational_age_exposure" in pop.columns
 
 
+@pytest.mark.parametrize("exposure_key", ["birth_exposure", "exposure", "missing"])
+def test_lbwsg_exposure_data_logging(exposure_key, base_config, mocker, caplog) -> None:
+    risk = LBWSGRisk()
+
+    # Add mock data to artifact
+    # Format birth exposure data
+    exposure_data = pd.DataFrame(
+        {
+            "sex": ["Male", "Female", "Male", "Female"],
+            "year_start": [2021, 2021, 2021, 2021],
+            "year_end": [2022, 2022, 2022, 2022],
+            "parameter": ["cat81", "cat81", "cat82", "cat82"],
+            "value": [0.75, 0.75, 0.25, 0.25],
+        }
+    )
+
+    # Only have neonatal age groups; age_end is 0.0 for birth_exposure, else 1.0
+    if exposure_key == "birth_exposure":
+        age_end = 0.0
+    else:
+        age_end = 1.0
+
+    if exposure_key != "missing":
+        no_data_dict = {
+            "birth_exposure": "exposure",
+            "exposure": "birth_exposure",
+        }
+        no_data_key = no_data_dict[exposure_key]
+        override_config = {
+            "population": {
+                "initialization_age_start": 0.0,
+                "initialization_age_max": age_end,
+            },
+            risk.name: {
+                "data_sources": {
+                    exposure_key: exposure_data,
+                }
+            },
+        }
+    else:
+        override_config = {
+            "population": {
+                "initialization_age_start": 0.0,
+                "initialization_age_max": age_end,
+            },
+        }
+
+    # Patch get_category_intervals so we do not need the mock artifact
+    mocker.patch(
+        "vivarium_public_health.risks.implementations.low_birth_weight_and_short_gestation.LBWSGDistribution.get_category_intervals"
+    )
+    assert not caplog.records
+    if exposure_key != "missing":
+        missing_key = "exposure" if exposure_key == "birth_exposure" else "birth_exposure"
+        sim = InteractiveContext(
+            base_config,
+            components=[TestPopulation(), risk],
+            configuration=override_config,
+        )
+        assert f"The data for LBWSG {missing_key} is missing from the simulation"
+    else:
+        with pytest.raises(
+            ConfigurationError,
+            match="The LBWSG distribution requires either 'birth_exposure' or 'exposure' data to be "
+            "available in the simulation.",
+        ):
+            InteractiveContext(
+                base_config,
+                components=[TestPopulation(), risk],
+                configuration=override_config,
+            )
+
+
 def make_categorical_data(data: pd.DataFrame) -> pd.DataFrame:
     # Takes age gropus and adds sex, years, categories, and values
     dfs = []