facebookresearch · neuralsorcerer · Mar 4, 2026 · Mar 5, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -28,6 +28,14 @@
     `comparative=False` to use grouped-bar histograms (`ascii_plot_hist`) for
     numeric variables instead, matching the style used for categorical variables.
 
+## Code Quality & Refactoring
+
+- **Moved dataset loading implementations out of `balance.datasets.__init__`**
+  - Refactored `load_sim_data`, `load_cbps_data`, and `load_data` into
+    `balance.datasets.loading_data` and re-exported them from
+    `balance.datasets` to preserve the public API while keeping module
+    responsibilities focused.
+
 ## Documentation
 
 - **ASCII plot docstring examples and `library="balance"` docs**

diff --git a/balance/datasets/__init__.py b/balance/datasets/__init__.py
@@ -5,194 +5,6 @@
 
 # pyre-strict
 
-from __future__ import annotations
+from balance.datasets.loading_data import load_cbps_data, load_data, load_sim_data
 
-import pathlib
-from typing import Literal, Tuple
-
-import numpy as np
-import numpy.typing as npt
-import pandas as pd
-
-# TODO: move the functions from datasets/__init__.py to some other file (e.g.: datasets/loading_data.py),
-#       and then import the functions from that file in the init file (so the behavior would remain the same)
-
-# This module provides data loading utilities for simulated datasets used in balance package.
-# It supports loading simulation data with reproducible random seeds for testing and examples.
-
-
-# TODO: add tests
-def load_sim_data(
-    version: str = "01",
-) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
-    """Load simulated data for target and sample of interest
-
-    This function generates reproducible simulated datasets using fixed random seeds
-    to ensure consistent results across multiple calls and different environments.
-
-    Version 01 returns two dataframes containing the columns gender ("Male", "Female" and nan),
-    age_group ("18-24", "25-34", "35-44", "45+"), income (some numbers from a normal distribution), and id.
-    The sample_df also has a column called happiness with a value from 0 to 100 that depends on the covariates.
-
-    The target_df DataFrame has 10000 rows, and sample_df has 1000 rows.
-
-    The sample_df is imbalanced when compared to target_df, as is demonstrated in the examples/tutorials.
-
-    If you want to see how this works, you can import balance and run this code:
-        import inspect
-        import balance
-        print(inspect.getsource(balance.datasets.load_sim_data))
-
-    Args:
-        version (str, optional): The version of simulated data. Currently available is only "01". Defaults to "01".
-
-    Returns:
-        Tuple[pd.DataFrame, pd.DataFrame]: Two DataFrames containing simulated data for the target and sample of interest.
-    """
-
-    def _create_outcome_happiness(df: pd.DataFrame, n: int) -> npt.NDArray[np.floating]:
-        # females are happier
-        # older people are happier
-        # people with higher income are happier
-        out = (
-            np.random.normal(40, 10, size=n)
-            + np.where(df.gender == "Female", 1, 0) * np.random.normal(20, 1, size=n)
-            + np.where(df.age_group == "35-44", 1, 0) * np.random.normal(5, 1, size=n)
-            + np.where(df.age_group == "45+", 1, 0) * np.random.normal(20, 1, size=n)
-            + np.random.normal((np.random.normal(3, 2, size=n) ** 2) / 20, 1, size=n)
-        )
-        # Truncate for max to be 100
-        out = np.where(out < 100, out, 100)
-        return out
-
-    if version == "01":
-        np.random.seed(2022 - 11 - 8)  # for reproducibility
-        n_target = 10000
-        target_df = pd.DataFrame(
-            {
-                "id": (np.array(range(n_target)) + 100000).astype(str),
-                "gender": np.random.choice(
-                    ["Male", "Female"], size=n_target, replace=True, p=[0.5, 0.5]
-                ),
-                "age_group": np.random.choice(
-                    ["18-24", "25-34", "35-44", "45+"],
-                    size=n_target,
-                    replace=True,
-                    p=[0.20, 0.30, 0.30, 0.20],
-                ),
-                "income": np.random.normal(3, 2, size=n_target) ** 2,
-                # "unrelated_variable": np.random.uniform(size = n_target),
-                # "weight": np.random.uniform(size = n_target) + 0.5,
-            }
-        )
-        target_df["happiness"] = _create_outcome_happiness(target_df, n_target)
-        # We also have missing values in gender
-        target_df.loc[3:900, "gender"] = np.nan
-
-        np.random.seed(2023 - 5 - 14)  # for reproducibility
-        n_sample = 1000
-        sample_df = pd.DataFrame(
-            {
-                "id": (np.array(range(n_sample))).astype(str),
-                "gender": np.random.choice(
-                    ["Male", "Female"], size=n_sample, replace=True, p=[0.7, 0.3]
-                ),
-                "age_group": np.random.choice(
-                    ["18-24", "25-34", "35-44", "45+"],
-                    size=n_sample,
-                    replace=True,
-                    p=[0.50, 0.30, 0.15, 0.05],
-                ),
-                "income": np.random.normal(2, 1.5, size=n_sample) ** 2,
-                # "unrelated_variable": np.random.uniform(size = n_sample),
-                # "weight": np.random.uniform(size = n_sample) + 0.5,
-            }
-        )
-        sample_df["happiness"] = _create_outcome_happiness(sample_df, n_sample)
-
-        # We also have missing values in gender
-        sample_df.loc[3:90, "gender"] = np.nan
-
-        return target_df, sample_df
-
-    return None, None
-
-
-# TODO: add tests
-def load_cbps_data() -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
-    """Load simulated data for CBPS comparison with R
-
-    The code in balance that implements CBPS attempts to mimic the code from the R package CBPS (https://cran.r-project.org/web/packages/CBPS/).
-
-    In the help page of the CBPS function in R (i.e.: `?CBPS::CBPS`) there is a simulated dataset that is used to showcase the CBPS function.
-    The output of that simulated dataset is saved in balance in order to allow for comparison of `balance` (Python) with `CBPS` (R).
-
-    You can view the structure of the simulated dataset by looking at the example below.
-
-    In the original simulation dataset (available in sim_data_cbps.csv), when the `treat` variable is 0, the row belongs to sample.
-    And when the `treat` variable is 1, the row belongs to target.
-
-    Returns:
-        Tuple[pd.DataFrame | None, pd.DataFrame | None]: Two DataFrames containing simulated data for the target and sample of interest.
-
-    Example:
-        ::
-            import balance
-            target_df, sample_df = balance.datasets.load_data("sim_data_cbps")
-            print(target_df.head())
-            #           X1         X2        X3          X4  cbps_weights           y  id
-            # 1   0.723769   9.911956  0.189488  383.759778      0.003937  199.817495   2
-            # 3   0.347071   9.907768  0.096706  399.366071      0.003937  174.685348   4
-            # 11  0.691174  10.725262  0.214618  398.313184      0.003937  189.578368  12
-            # 12  0.779949   9.562130  0.181408  370.178863      0.003937  208.178724  13
-            # 13  0.818348   9.801834  0.210592  434.453795      0.003937  214.277306  14
-
-            print(target_df.info())
-            # <class 'pandas.core.frame.DataFrame'>
-            # Int64Index: 254 entries, 1 to 498
-            # Data columns (total 7 columns):
-            #  #   Column        Non-Null Count  Dtype
-            # ---  ------        --------------  -----
-            #  0   X1            254 non-null    float64
-            #  1   X2            254 non-null    float64
-            #  2   X3            254 non-null    float64
-            #  3   X4            254 non-null    float64
-            #  4   cbps_weights  254 non-null    float64
-            #  5   y             254 non-null    float64
-            #  6   id            254 non-null    int64
-            # dtypes: float64(6), int64(1)
-            # memory usage: 15.9 KB
-    """
-    # NOTE: the reason we use __file__ and not importlib.resources is because the later one changed API in Python 3.11.
-    #       so in order to be compliant with 3.7-3.10 and also 3.11, using __file__ is the safer option.
-    df_all = pd.read_csv(pathlib.Path(__file__).parent.joinpath("sim_data_cbps.csv"))
-    target_df = df_all[df_all.treat == 1].drop(["treat"], axis=1)
-    sample_df = df_all[df_all.treat == 0].drop(["treat"], axis=1)
-
-    return (target_df, sample_df)
-
-
-def load_data(
-    source: Literal["sim_data_01", "sim_data_cbps"] = "sim_data_01",
-) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
-    """Returns a tuple of two DataFrames containing simulated data.
-
-    To learn more about each dataset, please refer to their help pages:
-    - sim_data_01: :func:`balance.datasets.load_sim_data`.
-    - sim_data_cbps: :func:`balance.datasets.load_cbps_data`.
-
-    Args:
-        source (Literal["sim_data_01", "sim_data_cbps"]): The name of the data to return. Defaults to "sim_data_01".
-
-    Returns:
-        Tuple[pd.DataFrame, pd.DataFrame]: The first dataframe contains simulated data of the "target" and the second dataframe contains simulated data of the "sample".
-    """
-
-    if source == "sim_data_01":
-        target_df, sample_df = load_sim_data("01")
-        return (target_df, sample_df)
-    elif source == "sim_data_cbps":
-        target_df, sample_df = load_cbps_data()
-        return (target_df, sample_df)
-
-    return None, None
+__all__ = ["load_sim_data", "load_cbps_data", "load_data"]
diff --git a/balance/datasets/loading_data.py b/balance/datasets/loading_data.py
@@ -0,0 +1,161 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from __future__ import annotations
+
+import pathlib
+from typing import Literal, Tuple
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+
+
+# This module provides data loading utilities for simulated datasets used in the
+# balance package. It supports loading simulation data with reproducible random
+# seeds for testing and examples.
+
+
+def load_sim_data(
+    version: str = "01",
+) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
+    """Load simulated data for target and sample of interest.
+
+    This function generates reproducible simulated datasets using fixed random seeds
+    to ensure consistent results across multiple calls and different environments.
+
+    Version 01 returns two dataframes containing the columns gender ("Male", "Female" and nan),
+    age_group ("18-24", "25-34", "35-44", "45+"), income (squared draws from a normal distribution), and id.
+    Both dataframes also have a column called happiness, a simulated outcome capped at 100 that depends on gender and age_group.
+
+    The target_df DataFrame has 10000 rows, and sample_df has 1000 rows.
+
+    The sample_df is imbalanced when compared to target_df, as is demonstrated in the examples/tutorials.
+
+    If you want to see how this works, you can import balance and run this code:
+        import inspect
+        import balance
+        print(inspect.getsource(balance.datasets.load_sim_data))
+
+    Args:
+        version (str, optional): The version of simulated data. Only "01" is currently available. Defaults to "01".
+
+    Returns:
+        Tuple[pd.DataFrame | None, pd.DataFrame | None]: Two DataFrames (target_df, sample_df) containing simulated data for the target and sample of interest, or (None, None) if the version is not recognized.
+    """
+
+    def _create_outcome_happiness(df: pd.DataFrame, n: int) -> npt.NDArray[np.floating]:
+        # females are happier
+        # older people are happier
+        # people with higher income are meant to be happier (NOTE: the term below is drawn independently, not read from df.income)
+        out = (
+            np.random.normal(40, 10, size=n)
+            + np.where(df.gender == "Female", 1, 0) * np.random.normal(20, 1, size=n)
+            + np.where(df.age_group == "35-44", 1, 0) * np.random.normal(5, 1, size=n)
+            + np.where(df.age_group == "45+", 1, 0) * np.random.normal(20, 1, size=n)
+            + np.random.normal((np.random.normal(3, 2, size=n) ** 2) / 20, 1, size=n)
+        )
+        # Truncate for max to be 100
+        out = np.where(out < 100, out, 100)
+        return out
+
+    if version == "01":
+        np.random.seed(2022 - 11 - 8)  # for reproducibility (integer arithmetic: the seed is 2003, not a date)
+        n_target = 10000
+        target_df = pd.DataFrame(
+            {
+                "id": (np.array(range(n_target)) + 100000).astype(str),
+                "gender": np.random.choice(
+                    ["Male", "Female"], size=n_target, replace=True, p=[0.5, 0.5]
+                ),
+                "age_group": np.random.choice(
+                    ["18-24", "25-34", "35-44", "45+"],
+                    size=n_target,
+                    replace=True,
+                    p=[0.20, 0.30, 0.30, 0.20],
+                ),
+                "income": np.random.normal(3, 2, size=n_target) ** 2,
+            }
+        )
+        target_df["happiness"] = _create_outcome_happiness(target_df, n_target)
+        # We also have missing values in gender
+        target_df.loc[3:900, "gender"] = np.nan
+
+        np.random.seed(2023 - 5 - 14)  # for reproducibility (integer arithmetic: the seed is 2004, not a date)
+        n_sample = 1000
+        sample_df = pd.DataFrame(
+            {
+                "id": (np.array(range(n_sample))).astype(str),
+                "gender": np.random.choice(
+                    ["Male", "Female"], size=n_sample, replace=True, p=[0.7, 0.3]
+                ),
+                "age_group": np.random.choice(
+                    ["18-24", "25-34", "35-44", "45+"],
+                    size=n_sample,
+                    replace=True,
+                    p=[0.50, 0.30, 0.15, 0.05],
+                ),
+                "income": np.random.normal(2, 1.5, size=n_sample) ** 2,
+            }
+        )
+        sample_df["happiness"] = _create_outcome_happiness(sample_df, n_sample)
+
+        # We also have missing values in gender
+        sample_df.loc[3:90, "gender"] = np.nan
+
+        return target_df, sample_df
+
+    return None, None
+
+
+def load_cbps_data() -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
+    """Load simulated data for CBPS comparison with R.
+
+    The code in balance that implements CBPS attempts to mimic the code from the R package CBPS (https://cran.r-project.org/web/packages/CBPS/).
+
+    In the help page of the CBPS function in R (i.e.: `?CBPS::CBPS`) there is a simulated dataset that is used to showcase the CBPS function.
+    The output of that simulated dataset is saved in balance in order to allow for comparison of `balance` (Python) with `CBPS` (R).
+
+    You can view the structure of the simulated dataset by loading it with `balance.datasets.load_data("sim_data_cbps")` and inspecting the result (e.g.: `target_df.head()`).
+
+    In the original simulation dataset (available in sim_data_cbps.csv), when the `treat` variable is 0, the row belongs to sample.
+    And when the `treat` variable is 1, the row belongs to target.
+
+    Returns:
+        Tuple[pd.DataFrame | None, pd.DataFrame | None]: Two DataFrames containing simulated data for the target and sample of interest.
+    """
+    # NOTE: the reason we use __file__ and not importlib.resources is because the latter changed API in Python 3.11.
+    #       so in order to be compliant with 3.7-3.10 and also 3.11, using __file__ is the safer option.
+    df_all = pd.read_csv(pathlib.Path(__file__).parent.joinpath("sim_data_cbps.csv"))
+    target_df = df_all[df_all.treat == 1].drop(["treat"], axis=1)
+    sample_df = df_all[df_all.treat == 0].drop(["treat"], axis=1)
+
+    return (target_df, sample_df)
+
+
+def load_data(
+    source: Literal["sim_data_01", "sim_data_cbps"] = "sim_data_01",
+) -> Tuple[pd.DataFrame | None, pd.DataFrame | None]:
+    """Return a tuple of two DataFrames containing simulated data.
+
+    To learn more about each dataset, please refer to their help pages:
+    - sim_data_01: :func:`balance.datasets.load_sim_data`.
+    - sim_data_cbps: :func:`balance.datasets.load_cbps_data`.
+
+    Args:
+        source (Literal["sim_data_01", "sim_data_cbps"]): The name of the data to return. Defaults to "sim_data_01".
+
+    Returns:
+        Tuple[pd.DataFrame | None, pd.DataFrame | None]: The first dataframe contains simulated data of the "target" and the second dataframe contains simulated data of the "sample"; (None, None) if the source is not recognized.
+    """
+
+    if source == "sim_data_01":
+        return load_sim_data("01")
+    if source == "sim_data_cbps":
+        return load_cbps_data()
+
+    return None, None