Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c6d5237
increment version to 0.5.0-dev
NathanielBlairStahn Oct 15, 2025
6f1f093
add print_memory_usage and constant_categorical functions
NathanielBlairStahn Oct 17, 2025
32e9842
use .reindex in difference function to preserve Categoricals in index
NathanielBlairStahn Oct 17, 2025
58252d9
fix bug with unspecified dtype in constant_categorical
NathanielBlairStahn Oct 17, 2025
98f6f81
use constant categoricals for numerator and denominator measures
NathanielBlairStahn Oct 17, 2025
543ce19
allow passing the name of the column containing transition names in e…
NathanielBlairStahn Oct 17, 2025
a8aec29
add prefilter_query parameter to ratio function
NathanielBlairStahn Oct 17, 2025
2da907b
allow passing a VPHOperator object to VPHResults constructor
NathanielBlairStahn Oct 19, 2025
c167d82
re-order parameters of ratio function
NathanielBlairStahn Oct 25, 2025
a8230ec
add convert_to_categorical function
NathanielBlairStahn Oct 25, 2025
a46d3be
add location_col parameter to VPHOperator
NathanielBlairStahn Oct 25, 2025
ba275ad
add location_col parameter to VPHResults
NathanielBlairStahn Oct 25, 2025
f040c20
reorder cases; add 'measure' parameter to ratio function
NathanielBlairStahn Oct 27, 2025
f9190d9
use numpy array instead of list in constant_categorical
NathanielBlairStahn Oct 29, 2025
e121e98
add lower and upper functions
NathanielBlairStahn Nov 1, 2025
7c7d30d
write summarize_draws function as a generalized version of describe
NathanielBlairStahn Nov 1, 2025
3518d1c
change cause column to 'entity' in get_burden
NathanielBlairStahn Nov 1, 2025
04fbe48
group together functions dealing with categoricals
NathanielBlairStahn Nov 8, 2025
ae31502
fix typos in comments in averted function
NathanielBlairStahn Nov 8, 2025
47f3fbb
check whether location column is in index before adding it
NathanielBlairStahn Nov 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/vivarium_helpers/__about__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
__summary__ = "Tools maintained by the research side of the IHME simulation science team."
__uri__ = "https://github.com/ihmeuw/vivarium_helpers"

__version__ = "0.4.2"
__version__ = "0.5.0-dev"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should get this set up with https://pypi.org/project/setuptools-scm/

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I bet this is done by #11 😄


__author__ = "Nathaniel Blair-Stahn, Beatrix Haddock"
__email__ = "beatrixh@uw.com"
64 changes: 64 additions & 0 deletions src/vivarium_helpers/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import collections
import re
import pandas as pd
import numpy as np

class FrozenAttributeMapping(collections.abc.Mapping):
"""Implementation of the Mapping abstract base class that
Expand Down Expand Up @@ -138,13 +139,72 @@ def column_to_ordered_categorical(
else:
return df.assign(**{colname: categorical})

def convert_to_categorical(
    df,
    include_cols=(),
    exclude_cols=(),
    exclude_dtypes=('float', 'category'),
    inplace=False
):
    """Convert all columns to pandas Categoricals except those with
    dtypes listed in `exclude_dtypes` (default 'float' and 'category')
    or those with names listed in `exclude_cols`. One can force a
    specific column to be converted, even if its dtype is listed in
    `exclude_dtypes`, by passing the column name to `include_cols`. It
    is not allowed to pass a column name both to `include_cols` and
    `exclude_cols`. Dtype conversion is performed in place if
    `inplace=True`, in which case None is returned; otherwise a
    converted copy is returned and `df` is left unmodified.

    This method can save lots of memory, allowing loading and
    manipulating larger DataFrames.
    """
    if len(set(include_cols).intersection(exclude_cols)) != 0:
        raise ValueError("A column can't be both included and excluded!")
    if not inplace:
        # Work on a copy so the caller's DataFrame is untouched
        df = df.copy()
    for col in df:
        # NOTE: dtype exclusion relies on `dtype == entry` comparisons,
        # so 'float' matches float64 but NOT float32 -- pass 'float32'
        # explicitly in `exclude_dtypes` if needed.
        if (
            col in include_cols
            or (col not in exclude_cols
                and df[col].dtype not in exclude_dtypes)
        ):
            df[col] = df[col].astype('category')
    if not inplace:
        return df
    else:
        return None

def constant_categorical(value, length, dtype=None):
    """Create a pandas Categorical of the specified length with all
    values equal to `value`. Creates the Categorical directly from codes
    to avoid the unnecessarily large memory usage of creating a constant
    list or Series of `value` first.

    Parameters
    ----------
    value : the single value to repeat.
    length : int, the number of elements in the result.
    dtype : pd.CategoricalDtype, optional. If a CategoricalDtype is
        passed, its categories must contain `value` and are preserved in
        the result. Any other value (including None or the bare string
        'category') results in a dtype with `value` as the only
        category.
    """
    if not isinstance(dtype, pd.CategoricalDtype):
        # No concrete CategoricalDtype was passed; build one with a
        # single category. The previous `dtype != 'category'` check let
        # the bare string 'category' through (a CategoricalDtype -- and
        # the string itself -- compares equal to 'category'), which then
        # crashed below since strings have no `.categories` attribute.
        dtype = pd.CategoricalDtype([value])
    # Get the category code corresponding to value
    code = dtype.categories.get_loc(value)
    # NOTE: This could be made even more memory efficient by creating a
    # NumPy array with an integer dtype of the minimum necessary size --
    # that would require computing the minimum number of bits necessary
    # to represent the integers 0, 1, ..., len(dtype.categories) - 1.
    return pd.Categorical.from_codes(np.full(length, code), dtype=dtype)

def get_mean_lower_upper(described_data, colname_mapper=None):
    """
    Gets the mean, lower, and upper values from the `described_data`
    DataFrame, which is assumed to have the format resulting from a call
    to DataFrame.describe() (by default, columns named 'mean', '2.5%',
    and '97.5%').

    Parameters
    ----------
    described_data : pd.DataFrame with summary-statistic columns.
    colname_mapper : dict mapping existing column names to output names.
        Defaults to {'mean': 'mean', '2.5%': 'lower', '97.5%': 'upper'}.
        A None sentinel is used instead of a dict literal default to
        avoid the mutable-default-argument pitfall.
    """
    if colname_mapper is None:
        colname_mapper = {'mean': 'mean', '2.5%': 'lower', '97.5%': 'upper'}
    # Select the mapped columns, rename them, and move the index into
    # ordinary columns.
    return (
        described_data[list(colname_mapper.keys())]
        .rename(columns=colname_mapper)
        .reset_index()
    )

def lower(x, rank=0.025):
    """Return the `rank` quantile of `x` (default 2.5%), for use as a
    lower uncertainty bound when aggregating draws.
    """
    return x.quantile(rank)

def upper(x, rank=0.975):
    """Return the `rank` quantile of `x` (default 97.5%), for use as an
    upper uncertainty bound when aggregating draws.
    """
    return x.quantile(rank)

# Alternative strategy to the get_mean_lower_upper function above
def aggregate_mean_lower_upper(df_or_groupby, lower_rank=0.025, upper_rank=0.975):
"""Get mean, lower, and upper from a DataFrame or GroupBy object."""
Expand All @@ -157,3 +217,7 @@ def aggregate_with_join(strings, sep='|'):
sep.join(strings).
"""
return sep.join(strings)

def print_memory_usage(df, label=''):
    """Print the total (deep) memory usage of a dataframe in megabytes,
    followed by an optional label.
    """
    # Sum per-column usage (deep=True includes object contents), then
    # convert bytes to megabytes.
    total_bytes = df.memory_usage(deep=True).sum()
    print(total_bytes / 1e6, 'MB', label)
4 changes: 2 additions & 2 deletions src/vivarium_helpers/vph_output/cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def split_measure_and_transition_columns(transition_df):
.assign(measure='transition_count') # Name the measure 'transition_count' rather than 'event_count'
)

def extract_transition_states(transition_df):
def extract_transition_states(transition_df, transition_name_col='transition'):
"""Gets the 'from state' and 'to state' from the transitions in a transition count dataframe,
after the transition has been put in its own 'transition' column by the `split_measure_and_transition_columns`
function.
Expand All @@ -33,7 +33,7 @@ def extract_transition_states(transition_df):
# Renaming the 'susceptible_to' states is a hack to deal with the fact there's not a unique string
# separating the 'from' and 'to' states -- it should be '__to__' instead of '_to_' or something
states_df = (
transition_df['transition']
transition_df[transition_name_col]
.str.replace("susceptible_to", "without") # Remove word 'to' from all states so we can split transitions on '_to_'
.str.extract(states_from_transition_pattern) # Create dataframe with 'from_state' and 'to_state' columns
.apply(lambda col: col.str.replace("without", "susceptible_to")) # Restore original state names
Expand Down
15 changes: 11 additions & 4 deletions src/vivarium_helpers/vph_output/measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,24 @@ def __init__(
self,
mapping=(),
/,
ops: VPHOperator|None = None,
value_col=None,
draw_col=None,
scenario_col=None,
measure_col=None,
location_col: str|bool=False,
index_cols=None,
# record_dalys=True,
**kwargs,
):
super().__init__(mapping, **kwargs)
self.ops = VPHOperator(
self.ops = ops or VPHOperator(
value_col,
draw_col,
scenario_col,
measure_col,
index_cols
location_col,
index_cols,
)
self._clean_vph_output()
# if record_dalys:
Expand Down Expand Up @@ -104,12 +107,16 @@ def get_burden(self, measures=None):
if 'dalys' in measures and 'dalys' not in table_names:
ylls = burden.query("measure == 'ylls'")
ylds = burden.query("measure == 'ylds'")
# FIXME: Define the cause column somewhere else that makes
# sense, and deal with different column names in older
# Vivarium models
cause_col = 'entity' # Used to be 'cause'
Comment on lines +110 to +113
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like at the least we could make this an argument to the function?

# If we have comorbidity-adjusted all-cause YLDs, ensure
# that we also have all-cause YLLs so all-cause DALYs will
# be correct
if 'all_causes' in np.setdiff1d(ylds['cause'], ylls['cause']):
if 'all_causes' in np.setdiff1d(ylds[cause_col], ylls[cause_col]):
all_cause_ylls = self.ops.marginalize(
ylls.assign(cause='all_causes'), [])
ylls.assign(**{cause_col: 'all_causes'}), [])
burden = pd.concat([burden, all_cause_ylls])
# print('yll causes:', ylls.cause.unique())
burden = self.ops.aggregate_categories(
Expand Down
Loading