Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@
`comparative=False` to use grouped-bar histograms (`ascii_plot_hist`) for
numeric variables instead, matching the style used for categorical variables.

## Code Quality & Refactoring

- **Moved dataset loading implementations out of `balance.datasets.__init__`**
- Refactored `load_sim_data`, `load_cbps_data`, and `load_data` into
`balance.datasets.loading_data` and re-exported them from
`balance.datasets` to preserve the public API while keeping module
responsibilities focused.

## Documentation

- **ASCII plot docstring examples and `library="balance"` docs**
Expand Down
192 changes: 2 additions & 190 deletions balance/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,194 +5,6 @@

# pyre-strict

from __future__ import annotations
from balance.datasets.loading_data import load_cbps_data, load_data, load_sim_data

import pathlib
from typing import Literal, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd

# TODO: move the functions from datasets/__init__.py to some other file (e.g.: datasets/loading_data.py),
# and then import the functions from that file in the init file (so the behavior would remain the same)

# This module provides data loading utilities for the simulated datasets used in the balance package.
# It supports loading simulation data with reproducible random seeds for testing and examples.


# TODO: add tests
def load_sim_data(
    version: str = "01",
) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
    """Load simulated data for target and sample of interest

    The data is generated on the fly from fixed random seeds, so repeated calls
    return identical DataFrames. Generation uses private ``np.random.RandomState``
    instances (seeded with the same values the function has always used), so the
    caller's global NumPy random state is left untouched.

    Version 01 returns two dataframes, each containing the columns:
        - id: consecutive integers stored as strings (starting at "100000" in target_df and at "0" in sample_df).
        - gender: "Male" or "Female", with some rows set to nan.
        - age_group: one of "18-24", "25-34", "35-44", "45+".
        - income: the square of a draw from a normal distribution.
        - happiness: a simulated outcome capped at 100 (there is no lower cap). It is higher on average
          for "Female" rows and for the "35-44" and "45+" age groups.

    The target_df DataFrame has 10000 rows, and sample_df has 1000 rows.

    The sample_df is imbalanced when compared to target_df, as is demonstrated in the examples/tutorials.

    If you want to see how this works, you can import balance and run this code:
        import inspect
        import balance
        print(inspect.getsource(balance.datasets.load_sim_data))

    Args:
        version (str, optional): The version of simulated data. Currently available is only "01". Defaults to "01".

    Returns:
        Tuple[pd.DataFrame | None, pd.DataFrame | None]: ``(target_df, sample_df)`` for version "01".
            Any other version returns ``(None, None)``.
    """

    def _create_outcome_happiness(
        df: pd.DataFrame, n: int, rng: np.random.RandomState
    ) -> npt.NDArray[np.floating]:
        """Draw one happiness score for each of the ``n`` rows of ``df`` using ``rng``."""
        # females are happier
        # older people are happier
        # NOTE: the last term is drawn from a fresh income-like distribution; it does
        # not read df.income, so happiness does not depend on the income column.
        # The order of the rng calls determines the generated values - do not reorder.
        out = (
            rng.normal(40, 10, size=n)
            + np.where(df.gender == "Female", 1, 0) * rng.normal(20, 1, size=n)
            + np.where(df.age_group == "35-44", 1, 0) * rng.normal(5, 1, size=n)
            + np.where(df.age_group == "45+", 1, 0) * rng.normal(20, 1, size=n)
            + rng.normal((rng.normal(3, 2, size=n) ** 2) / 20, 1, size=n)
        )
        # Truncate for max to be 100
        return np.where(out < 100, out, 100)

    if version != "01":
        # Unknown versions are signalled with a pair of Nones, not an exception.
        return None, None

    genders = ["Male", "Female"]
    age_groups = ["18-24", "25-34", "35-44", "45+"]

    # The seed expressions are integer subtractions (2003 and 2004), not dates.
    # They are kept verbatim because changing them would change the generated data.
    # RandomState(seed) produces the same stream as np.random.seed(seed) followed by
    # the module-level functions, without reseeding the process-wide generator.
    target_rng = np.random.RandomState(2022 - 11 - 8)  # for reproducibility
    n_target = 10000
    target_df = pd.DataFrame(
        {
            "id": (np.arange(n_target) + 100000).astype(str),
            "gender": target_rng.choice(
                genders, size=n_target, replace=True, p=[0.5, 0.5]
            ),
            "age_group": target_rng.choice(
                age_groups,
                size=n_target,
                replace=True,
                p=[0.20, 0.30, 0.30, 0.20],
            ),
            "income": target_rng.normal(3, 2, size=n_target) ** 2,
        }
    )
    target_df["happiness"] = _create_outcome_happiness(target_df, n_target, target_rng)
    # We also have missing values in gender. They are introduced after happiness is
    # computed, so the outcome still reflects the originally drawn gender.
    target_df.loc[3:900, "gender"] = np.nan

    sample_rng = np.random.RandomState(2023 - 5 - 14)  # for reproducibility
    n_sample = 1000
    sample_df = pd.DataFrame(
        {
            "id": np.arange(n_sample).astype(str),
            "gender": sample_rng.choice(
                genders, size=n_sample, replace=True, p=[0.7, 0.3]
            ),
            "age_group": sample_rng.choice(
                age_groups,
                size=n_sample,
                replace=True,
                p=[0.50, 0.30, 0.15, 0.05],
            ),
            "income": sample_rng.normal(2, 1.5, size=n_sample) ** 2,
        }
    )
    sample_df["happiness"] = _create_outcome_happiness(sample_df, n_sample, sample_rng)

    # We also have missing values in gender
    sample_df.loc[3:90, "gender"] = np.nan

    return target_df, sample_df


# TODO: add tests
def load_cbps_data() -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
    """Load simulated data for CBPS comparison with R

    The CBPS implementation in balance attempts to mimic the R package CBPS
    (https://cran.r-project.org/web/packages/CBPS/).

    The help page of the CBPS function in R (i.e.: `?CBPS::CBPS`) showcases the function on a simulated dataset.
    The output of that simulation is stored in balance (as sim_data_cbps.csv, next to this module) so that
    `balance` (Python) can be compared with `CBPS` (R).

    Rows of the stored dataset are split by the `treat` column: rows with `treat` equal to 0 belong to the sample,
    and rows with `treat` equal to 1 belong to the target. The `treat` column itself is dropped from both outputs.

    Returns:
        Tuple[pd.DataFrame | None, pd.DataFrame | None]: ``(target_df, sample_df)``, i.e. the target rows first
            and the sample rows second.

    Example:
        ::
            import balance
            target_df, sample_df = balance.datasets.load_data("sim_data_cbps")
            print(target_df.head())
            #           X1         X2        X3          X4  cbps_weights           y  id
            # 1   0.723769   9.911956  0.189488  383.759778      0.003937  199.817495   2
            # 3   0.347071   9.907768  0.096706  399.366071      0.003937  174.685348   4
            # 11  0.691174  10.725262  0.214618  398.313184      0.003937  189.578368  12
            # 12  0.779949   9.562130  0.181408  370.178863      0.003937  208.178724  13
            # 13  0.818348   9.801834  0.210592  434.453795      0.003937  214.277306  14

            print(target_df.info())
            # <class 'pandas.core.frame.DataFrame'>
            # Int64Index: 254 entries, 1 to 498
            # Data columns (total 7 columns):
            # #   Column        Non-Null Count  Dtype
            # ---  ------        --------------  -----
            # 0   X1            254 non-null    float64
            # 1   X2            254 non-null    float64
            # 2   X3            254 non-null    float64
            # 3   X4            254 non-null    float64
            # 4   cbps_weights  254 non-null    float64
            # 5   y             254 non-null    float64
            # 6   id            254 non-null    int64
            # dtypes: float64(6), int64(1)
            # memory usage: 15.9 KB
    """
    # NOTE: __file__ is used instead of importlib.resources because the latter changed its API in Python 3.11.
    # Resolving the path relative to __file__ works on 3.7-3.10 as well as 3.11, so it is the safer option.
    csv_path = pathlib.Path(__file__).parent / "sim_data_cbps.csv"
    full_df = pd.read_csv(csv_path)

    # Boolean row selectors for the two groups encoded in the `treat` column.
    is_target = full_df["treat"] == 1
    is_sample = full_df["treat"] == 0

    target_df = full_df.loc[is_target].drop(columns=["treat"])
    sample_df = full_df.loc[is_sample].drop(columns=["treat"])
    return target_df, sample_df


def load_data(
    source: Literal["sim_data_01", "sim_data_cbps"] = "sim_data_01",
) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
    """Return one of the bundled simulated datasets as a (target, sample) pair.

    To learn more about each dataset, please refer to their help pages:
    - sim_data_01: :func:`balance.datasets.load_sim_data`.
    - sim_data_cbps: :func:`balance.datasets.load_cbps_data`.

    Args:
        source (Literal["sim_data_01", "sim_data_cbps"]): The name of the data to return. Defaults to "sim_data_01".

    Returns:
        Tuple[pd.DataFrame | None, pd.DataFrame | None]: The first element holds the simulated "target" data and
            the second holds the simulated "sample" data. An unrecognized ``source`` yields ``(None, None)``.
    """
    # Each loader already returns a (target_df, sample_df) pair, so it is passed through as is.
    if source == "sim_data_01":
        return load_sim_data("01")
    if source == "sim_data_cbps":
        return load_cbps_data()

    # Unknown source names fall through to a pair of Nones.
    return None, None
# Names that make up the public API of this package; their implementations are
# imported from balance.datasets.loading_data.
__all__ = ["load_sim_data", "load_cbps_data", "load_data"]
161 changes: 161 additions & 0 deletions balance/datasets/loading_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

from __future__ import annotations

import pathlib
from typing import Literal, Tuple

import numpy as np
import numpy.typing as npt
import pandas as pd


# This module provides data loading utilities for simulated datasets used in the
# balance package. It supports loading simulation data with reproducible random
# seeds for testing and examples.


def load_sim_data(
    version: str = "01",
) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
    """Load simulated data for target and sample of interest.

    The data is generated on the fly from fixed random seeds, so repeated calls
    return identical DataFrames. Generation uses private ``np.random.RandomState``
    instances (seeded with the same values the function has always used), so the
    caller's global NumPy random state is left untouched.

    Version 01 returns two dataframes, each containing the columns:
        - id: consecutive integers stored as strings (starting at "100000" in target_df and at "0" in sample_df).
        - gender: "Male" or "Female", with some rows set to nan.
        - age_group: one of "18-24", "25-34", "35-44", "45+".
        - income: the square of a draw from a normal distribution.
        - happiness: a simulated outcome capped at 100 (there is no lower cap). It is higher on average
          for "Female" rows and for the "35-44" and "45+" age groups.

    The target_df DataFrame has 10000 rows, and sample_df has 1000 rows.

    The sample_df is imbalanced when compared to target_df, as is demonstrated in the examples/tutorials.

    If you want to see how this works, you can import balance and run this code:
        import inspect
        import balance
        print(inspect.getsource(balance.datasets.load_sim_data))

    Args:
        version (str, optional): The version of simulated data. Currently available is only "01". Defaults to "01".

    Returns:
        Tuple[pd.DataFrame | None, pd.DataFrame | None]: ``(target_df, sample_df)`` for version "01".
            Any other version returns ``(None, None)``.
    """

    def _create_outcome_happiness(
        df: pd.DataFrame, n: int, rng: np.random.RandomState
    ) -> npt.NDArray[np.floating]:
        """Draw one happiness score for each of the ``n`` rows of ``df`` using ``rng``."""
        # females are happier
        # older people are happier
        # NOTE: the last term is drawn from a fresh income-like distribution; it does
        # not read df.income, so happiness does not depend on the income column.
        # The order of the rng calls determines the generated values - do not reorder.
        out = (
            rng.normal(40, 10, size=n)
            + np.where(df.gender == "Female", 1, 0) * rng.normal(20, 1, size=n)
            + np.where(df.age_group == "35-44", 1, 0) * rng.normal(5, 1, size=n)
            + np.where(df.age_group == "45+", 1, 0) * rng.normal(20, 1, size=n)
            + rng.normal((rng.normal(3, 2, size=n) ** 2) / 20, 1, size=n)
        )
        # Truncate for max to be 100
        return np.where(out < 100, out, 100)

    if version != "01":
        # Unknown versions are signalled with a pair of Nones, not an exception.
        return None, None

    genders = ["Male", "Female"]
    age_groups = ["18-24", "25-34", "35-44", "45+"]

    # The seed expressions are integer subtractions (2003 and 2004), not dates.
    # They are kept verbatim because changing them would change the generated data.
    # RandomState(seed) produces the same stream as np.random.seed(seed) followed by
    # the module-level functions, without reseeding the process-wide generator.
    target_rng = np.random.RandomState(2022 - 11 - 8)  # for reproducibility
    n_target = 10000
    target_df = pd.DataFrame(
        {
            "id": (np.arange(n_target) + 100000).astype(str),
            "gender": target_rng.choice(
                genders, size=n_target, replace=True, p=[0.5, 0.5]
            ),
            "age_group": target_rng.choice(
                age_groups,
                size=n_target,
                replace=True,
                p=[0.20, 0.30, 0.30, 0.20],
            ),
            "income": target_rng.normal(3, 2, size=n_target) ** 2,
        }
    )
    target_df["happiness"] = _create_outcome_happiness(target_df, n_target, target_rng)
    # We also have missing values in gender. They are introduced after happiness is
    # computed, so the outcome still reflects the originally drawn gender.
    target_df.loc[3:900, "gender"] = np.nan

    sample_rng = np.random.RandomState(2023 - 5 - 14)  # for reproducibility
    n_sample = 1000
    sample_df = pd.DataFrame(
        {
            "id": np.arange(n_sample).astype(str),
            "gender": sample_rng.choice(
                genders, size=n_sample, replace=True, p=[0.7, 0.3]
            ),
            "age_group": sample_rng.choice(
                age_groups,
                size=n_sample,
                replace=True,
                p=[0.50, 0.30, 0.15, 0.05],
            ),
            "income": sample_rng.normal(2, 1.5, size=n_sample) ** 2,
        }
    )
    sample_df["happiness"] = _create_outcome_happiness(sample_df, n_sample, sample_rng)

    # We also have missing values in gender
    sample_df.loc[3:90, "gender"] = np.nan

    return target_df, sample_df


def load_cbps_data() -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
    """Load simulated data for CBPS comparison with R.

    The CBPS implementation in balance attempts to mimic the R package CBPS
    (https://cran.r-project.org/web/packages/CBPS/).

    The help page of the CBPS function in R (i.e.: `?CBPS::CBPS`) showcases the function on a simulated dataset.
    The output of that simulation is stored in balance (as sim_data_cbps.csv, next to this module) so that
    `balance` (Python) can be compared with `CBPS` (R).

    Rows of the stored dataset are split by the `treat` column: rows with `treat` equal to 0 belong to the sample,
    and rows with `treat` equal to 1 belong to the target. The `treat` column itself is dropped from both outputs.

    Returns:
        Tuple[pd.DataFrame | None, pd.DataFrame | None]: ``(target_df, sample_df)``, i.e. the target rows first
            and the sample rows second.
    """
    # NOTE: __file__ is used instead of importlib.resources because the latter changed its API in Python 3.11.
    # Resolving the path relative to __file__ works on 3.7-3.10 as well as 3.11, so it is the safer option.
    data_file = pathlib.Path(__file__).parent / "sim_data_cbps.csv"
    everything = pd.read_csv(data_file)

    # treat == 1 marks target rows and treat == 0 marks sample rows; the marker
    # column is removed from each subset.
    target_df, sample_df = (
        everything[everything.treat == flag].drop(columns=["treat"]) for flag in (1, 0)
    )
    return target_df, sample_df


def load_data(
    source: Literal["sim_data_01", "sim_data_cbps"] = "sim_data_01",
) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
    """Return one of the bundled simulated datasets as a (target, sample) pair.

    To learn more about each dataset, please refer to their help pages:
    - sim_data_01: :func:`balance.datasets.load_sim_data`.
    - sim_data_cbps: :func:`balance.datasets.load_cbps_data`.

    Args:
        source (Literal["sim_data_01", "sim_data_cbps"]): The name of the data to return. Defaults to "sim_data_01".

    Returns:
        Tuple[pd.DataFrame | None, pd.DataFrame | None]: The first element holds the simulated "target" data and
            the second holds the simulated "sample" data. An unrecognized ``source`` yields ``(None, None)``.
    """
    # Start from the "unknown source" answer and overwrite it when the name matches.
    loaded: Tuple[pd.DataFrame | None, pd.DataFrame | None] = (None, None)
    if source == "sim_data_01":
        loaded = load_sim_data("01")
    elif source == "sim_data_cbps":
        loaded = load_cbps_data()
    return loaded