Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
c6d5237
increment version to 0.5.0-dev
NathanielBlairStahn Oct 15, 2025
6f1f093
add print_memory_usage and constant_categorical functions
NathanielBlairStahn Oct 17, 2025
32e9842
use .reindex in difference function to preserve Categoricals in index
NathanielBlairStahn Oct 17, 2025
58252d9
fix bug with unspecified dtype in constant_categorical
NathanielBlairStahn Oct 17, 2025
98f6f81
use constant categoricals for numerator and denominator measures
NathanielBlairStahn Oct 17, 2025
543ce19
allow passing the name of the column containing transition names in e…
NathanielBlairStahn Oct 17, 2025
a8aec29
add prefilter_query parameter to ratio function
NathanielBlairStahn Oct 17, 2025
2da907b
allow passing a VPHOperator object to VPHResults constructor
NathanielBlairStahn Oct 19, 2025
c167d82
re-order parameters of ratio function
NathanielBlairStahn Oct 25, 2025
a8230ec
add convert_to_categorical function
NathanielBlairStahn Oct 25, 2025
a46d3be
add location_col parameter to VPHOperator
NathanielBlairStahn Oct 25, 2025
ba275ad
add location_col parameter to VPHResults
NathanielBlairStahn Oct 25, 2025
f040c20
reorder cases; add 'measure' parameter to ratio function
NathanielBlairStahn Oct 27, 2025
f9190d9
use numpy array instead of list in constant_categorical
NathanielBlairStahn Oct 29, 2025
e121e98
add lower and upper functions
NathanielBlairStahn Nov 1, 2025
7c7d30d
write summarize_draws function as a generalized version of describe
NathanielBlairStahn Nov 1, 2025
3518d1c
change cause column to 'entity' in get_burden
NathanielBlairStahn Nov 1, 2025
04fbe48
group together functions dealing with categoricals
NathanielBlairStahn Nov 8, 2025
ae31502
fix typos in comments in averted function
NathanielBlairStahn Nov 8, 2025
47f3fbb
check whether location column is in index before adding it
NathanielBlairStahn Nov 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/vivarium_helpers/__about__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
__summary__ = "Tools maintained by the research side of the IHME simulation science team."
__uri__ = "https://github.com/ihmeuw/vivarium_helpers"

__version__ = "0.4.2"
__version__ = "0.5.0-dev"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should get this set up with https://pypi.org/project/setuptools-scm/

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I bet this is done by #11 😄


__author__ = "Nathaniel Blair-Stahn, Beatrix Haddock"
__email__ = "beatrixh@uw.com"
64 changes: 64 additions & 0 deletions src/vivarium_helpers/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import collections
import re
import pandas as pd
import numpy as np

class FrozenAttributeMapping(collections.abc.Mapping):
"""Implementation of the Mapping abstract base class that
Expand Down Expand Up @@ -138,13 +139,72 @@ def column_to_ordered_categorical(
else:
return df.assign(**{colname: categorical})

def convert_to_categorical(
    df,
    include_cols=(),
    exclude_cols=(),
    exclude_dtypes=('float', 'category'),
    inplace=False
):
    """Convert all columns to pandas Categoricals except those with
    dtypes listed in `exclude_dtypes` (default 'float' and 'category')
    or those with names listed in `exclude_cols`. One can force a
    specific column to be converted, even if its dtype is listed in
    `exclude_dtypes`, by passing the column name to `include_cols`. It
    is not allowed to pass a column name both to `include_cols` and
    `exclude_cols`. Dtype conversion is performed in place if
    `inplace=True`, in which case None is returned; otherwise a
    converted copy is returned and `df` is left unmodified.

    This method can save lots of memory, allowing loading and
    manipulating larger DataFrames.
    """
    if len(set(include_cols).intersection(exclude_cols)) != 0:
        raise ValueError("A column can't be both included and excluded!")
    if not inplace:
        # Work on a copy so the caller's DataFrame is untouched
        df = df.copy()
    for col in df:
        # NOTE: dtype exclusion relies on `dtype == entry` comparisons,
        # so 'float' matches float64 but NOT float32 -- pass 'float32'
        # explicitly in `exclude_dtypes` if needed.
        if (
            col in include_cols
            or (col not in exclude_cols
                and df[col].dtype not in exclude_dtypes)
        ):
            df[col] = df[col].astype('category')
    if not inplace:
        return df
    else:
        return None

def constant_categorical(value, length, dtype=None):
    """Create a pandas Categorical of the specified length with all
    values equal to `value`. Creates the Categorical directly from codes
    to avoid the unnecessarily large memory usage of creating a constant
    list or Series of `value` first.

    Parameters
    ----------
    value : the single value to repeat.
    length : int, the number of elements in the result.
    dtype : pd.CategoricalDtype, optional. If a CategoricalDtype is
        passed, its categories must contain `value` and are preserved in
        the result. Any other value (including None or the bare string
        'category') results in a dtype with `value` as the only
        category.
    """
    if not isinstance(dtype, pd.CategoricalDtype):
        # No concrete CategoricalDtype was passed; build one with a
        # single category. The previous `dtype != 'category'` check let
        # the bare string 'category' through (a CategoricalDtype -- and
        # the string itself -- compares equal to 'category'), which then
        # crashed below since strings have no `.categories` attribute.
        dtype = pd.CategoricalDtype([value])
    # Get the category code corresponding to value
    code = dtype.categories.get_loc(value)
    # NOTE: This could be made even more memory efficient by creating a
    # NumPy array with an integer dtype of the minimum necessary size --
    # that would require computing the minimum number of bits necessary
    # to represent the integers 0, 1, ..., len(dtype.categories) - 1.
    return pd.Categorical.from_codes(np.full(length, code), dtype=dtype)

def get_mean_lower_upper(described_data, colname_mapper=None):
    """
    Gets the mean, lower, and upper values from the `described_data`
    DataFrame, which is assumed to have the format resulting from a call
    to DataFrame.describe() (by default, columns named 'mean', '2.5%',
    and '97.5%').

    Parameters
    ----------
    described_data : pd.DataFrame with summary-statistic columns.
    colname_mapper : dict mapping existing column names to output names.
        Defaults to {'mean': 'mean', '2.5%': 'lower', '97.5%': 'upper'}.
        A None sentinel is used instead of a dict literal default to
        avoid the mutable-default-argument pitfall.
    """
    if colname_mapper is None:
        colname_mapper = {'mean': 'mean', '2.5%': 'lower', '97.5%': 'upper'}
    # Select the mapped columns, rename them, and move the index into
    # ordinary columns.
    return (
        described_data[list(colname_mapper.keys())]
        .rename(columns=colname_mapper)
        .reset_index()
    )

def lower(x, rank=0.025):
    """Return the `rank` quantile of `x` (default 2.5%), for use as a
    lower uncertainty bound when aggregating draws.
    """
    return x.quantile(rank)

def upper(x, rank=0.975):
    """Return the `rank` quantile of `x` (default 97.5%), for use as an
    upper uncertainty bound when aggregating draws.
    """
    return x.quantile(rank)

# Alternative strategy to the get_mean_lower_upper function above
def aggregate_mean_lower_upper(df_or_groupby, lower_rank=0.025, upper_rank=0.975):
"""Get mean, lower, and upper from a DataFrame or GroupBy object."""
Expand All @@ -157,3 +217,7 @@ def aggregate_with_join(strings, sep='|'):
sep.join(strings).
"""
return sep.join(strings)

def print_memory_usage(df, label=''):
    """Print the total (deep) memory usage of a dataframe in megabytes,
    followed by an optional label.
    """
    # Sum per-column usage (deep=True includes object contents), then
    # convert bytes to megabytes.
    total_bytes = df.memory_usage(deep=True).sum()
    print(total_bytes / 1e6, 'MB', label)
4 changes: 2 additions & 2 deletions src/vivarium_helpers/vph_output/cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def split_measure_and_transition_columns(transition_df):
.assign(measure='transition_count') # Name the measure 'transition_count' rather than 'event_count'
)

def extract_transition_states(transition_df):
def extract_transition_states(transition_df, transition_name_col='transition'):
"""Gets the 'from state' and 'to state' from the transitions in a transition count dataframe,
after the transition has been put in its own 'transition' column by the `split_measure_and_transition_columns`
function.
Expand All @@ -33,7 +33,7 @@ def extract_transition_states(transition_df):
# Renaming the 'susceptible_to' states is a hack to deal with the fact there's not a unique string
# separating the 'from' and 'to' states -- it should be '__to__' instead of '_to_' or something
states_df = (
transition_df['transition']
transition_df[transition_name_col]
.str.replace("susceptible_to", "without") # Remove word 'to' from all states so we can split transitions on '_to_'
.str.extract(states_from_transition_pattern) # Create dataframe with 'from_state' and 'to_state' columns
.apply(lambda col: col.str.replace("without", "susceptible_to")) # Restore original state names
Expand Down
15 changes: 11 additions & 4 deletions src/vivarium_helpers/vph_output/measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,24 @@ def __init__(
self,
mapping=(),
/,
ops: VPHOperator|None = None,
value_col=None,
draw_col=None,
scenario_col=None,
measure_col=None,
location_col: str|bool=False,
index_cols=None,
# record_dalys=True,
**kwargs,
):
super().__init__(mapping, **kwargs)
self.ops = VPHOperator(
self.ops = ops or VPHOperator(
value_col,
draw_col,
scenario_col,
measure_col,
index_cols
location_col,
index_cols,
)
self._clean_vph_output()
# if record_dalys:
Expand Down Expand Up @@ -104,12 +107,16 @@ def get_burden(self, measures=None):
if 'dalys' in measures and 'dalys' not in table_names:
ylls = burden.query("measure == 'ylls'")
ylds = burden.query("measure == 'ylds'")
# FIXME: Define the cause column somewhere else that makes
# sense, and deal with different column names in older
# Vivarium models
cause_col = 'entity' # Used to be 'cause'
Comment on lines +110 to +113
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like at the least we could make this an argument to the function?

# If we have comorbidity-adjusted all-cause YLDs, ensure
# that we also have all-cause YLLs so all-cause DALYs will
# be correct
if 'all_causes' in np.setdiff1d(ylds['cause'], ylls['cause']):
if 'all_causes' in np.setdiff1d(ylds[cause_col], ylls[cause_col]):
all_cause_ylls = self.ops.marginalize(
ylls.assign(cause='all_causes'), [])
ylls.assign(**{cause_col: 'all_causes'}), [])
burden = pd.concat([burden, all_cause_ylls])
# print('yll causes:', ylls.cause.unique())
burden = self.ops.aggregate_categories(
Expand Down
Loading