From 967acf40b97b093f86dbc43dcdcc03722d46f8e1 Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 19:03:26 +0530 Subject: [PATCH 1/9] Clarify model-matrix input docstrings --- CHANGELOG.md | 4 ++++ balance/utils/model_matrix.py | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bc39a0c6..61529bdfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,10 @@ appear in the final output CSV. Keep columns that are not id, weight, covariate, or outcome columns will be placed into ``ignore_columns`` during processing but are still retained and available in the output. +- **Clarified `_prepare_input_model_matrix` argument docs** + - Updated docstrings in `balance.utils.model_matrix` with + explicit descriptions for `sample`, `target`, `variables`, and `add_na` + behavior when preparing model-matrix inputs. ## Bug Fixes diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index 74e72bee6..e216e7ad6 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -285,11 +285,21 @@ def _prepare_input_model_matrix( - Add na indicator if required. Args: - sample (pd.DataFrame | Any): This can either be a DataFrame or a Sample object. TODO: add text. - target (pd.DataFrame | Any | None, optional): This can either be a DataFrame or a Sample object.. Defaults to None. - variables (List[str] | None, optional): Defaults to None. TODO: add text. - add_na (bool, optional): Defaults to True. TODO: add text. - fix_columns_names (bool, optional): Defaults to True. If to fix the column names of the DataFrame by changing special characters to '_'. + sample (pd.DataFrame | Any): Input sample data as either a DataFrame or + a ``Sample``-like object that stores the data in ``._df``. + target (pd.DataFrame | Any | None, optional): Optional target data as + either a DataFrame or a ``Sample``-like object. 
If provided, rows + are concatenated with sample rows for downstream matrix creation. + Defaults to None. + variables (List[str] | None, optional): Explicit variables to keep from + ``sample``/``target`` before concatenation. If None, variables are + inferred via ``choose_variables`` on the provided inputs. + add_na (bool, optional): If True, add missingness indicator columns to + the concatenated data. If False, drop rows with missing values and + preserve target-only-all-NA validation behavior. Defaults to True. + fix_columns_names (bool, optional): Defaults to True. If to fix the + column names of the DataFrame by changing special characters to + '_'. Raises: Exception: "Variable names cannot contain characters '[' or ']'" From 2cd6c4e394771d44e2c46f51b5b73b018ab7e5cf Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 22:16:43 +0530 Subject: [PATCH 2/9] Update docstring --- balance/utils/model_matrix.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index e216e7ad6..40f05266b 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -286,23 +286,29 @@ def _prepare_input_model_matrix( Args: sample (pd.DataFrame | Any): Input sample data as either a DataFrame or - a ``Sample``-like object that stores the data in ``._df``. + a ``Sample``-like object that stores the underlying frame in + ``._df``. target (pd.DataFrame | Any | None, optional): Optional target data as - either a DataFrame or a ``Sample``-like object. If provided, rows - are concatenated with sample rows for downstream matrix creation. - Defaults to None. - variables (List[str] | None, optional): Explicit variables to keep from - ``sample``/``target`` before concatenation. If None, variables are - inferred via ``choose_variables`` on the provided inputs. 
- add_na (bool, optional): If True, add missingness indicator columns to - the concatenated data. If False, drop rows with missing values and - preserve target-only-all-NA validation behavior. Defaults to True. - fix_columns_names (bool, optional): Defaults to True. If to fix the - column names of the DataFrame by changing special characters to - '_'. + either a DataFrame or a ``Sample``-like object. If provided, the + model-matrix inputs are prepared from a sample/target union of + variables and rows. Defaults to None. + variables (List[str] | None, optional): Variables to use from both + inputs. If provided, `choose_variables` validates that each + requested variable exists in both sample and target (when target is + supplied), otherwise it raises ``ValueError``. If None, variables + are inferred by `choose_variables`. + add_na (bool, optional): If True, add NA indicator columns before + model-matrix creation. If False, drop rows containing missing + values; this can raise ``ValueError`` if dropping rows empties the + sample or target. Defaults to True. + fix_columns_names (bool, optional): Whether to sanitize column names by + replacing non-word characters with ``_`` and making duplicate names + unique. Defaults to True. Raises: - Exception: "Variable names cannot contain characters '[' or ']'" + ValueError: If requested ``variables`` are not present in both inputs, + if variables contain ``[`` or ``]``, or if ``add_na=False`` drops + all rows from sample/target. Returns: Dict[str, Any]: returns a dictionary containing two keys: 'all_data' and 'sample_n'. 
From e14a7f873f97c431242668deca11bf3f4c93b802 Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 22:22:59 +0530 Subject: [PATCH 3/9] Update docstring --- balance/utils/model_matrix.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index 40f05266b..9811d485f 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -295,8 +295,10 @@ def _prepare_input_model_matrix( variables (List[str] | None, optional): Variables to use from both inputs. If provided, `choose_variables` validates that each requested variable exists in both sample and target (when target is - supplied), otherwise it raises ``ValueError``. If None, variables - are inferred by `choose_variables`. + supplied), otherwise it raises ``ValueError``. For ``Sample`` + inputs, this validation/inference is based on covariate names + (``sample.covars().names()``), not all raw ``._df`` columns. If + None, variables are inferred by `choose_variables`. add_na (bool, optional): If True, add NA indicator columns before model-matrix creation. If False, drop rows containing missing values; this can raise ``ValueError`` if dropping rows empties the From b9876f5d7db8ae87f10ed8fea7e199a7b2ba80b9 Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 22:35:19 +0530 Subject: [PATCH 4/9] Update docstring --- balance/utils/model_matrix.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index 9811d485f..82e55654b 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -308,9 +308,10 @@ def _prepare_input_model_matrix( unique. Defaults to True. Raises: - ValueError: If requested ``variables`` are not present in both inputs, - if variables contain ``[`` or ``]``, or if ``add_na=False`` drops - all rows from sample/target. 
+ ValueError: If requested ``variables`` are not present in the + provided input frame(s) (and in both sample and target when target + is supplied), if variables contain ``[`` or ``]``, or if + ``add_na=False`` drops all rows from sample/target. Returns: Dict[str, Any]: returns a dictionary containing two keys: 'all_data' and 'sample_n'. From 7ed3243b777bb9682b0c31392004fe49b4558898 Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 22:51:03 +0530 Subject: [PATCH 5/9] Update docstring --- balance/utils/model_matrix.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index 82e55654b..d679a4fa5 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -285,11 +285,11 @@ def _prepare_input_model_matrix( - Add na indicator if required. Args: - sample (pd.DataFrame | Any): Input sample data as either a DataFrame or - a ``Sample``-like object that stores the underlying frame in - ``._df``. + sample (pd.DataFrame | Any): Input sample data as either a + ``pandas.DataFrame`` or a ``Sample`` object from + ``balance.sample_class`` (recognized via ``_isinstance_sample``). target (pd.DataFrame | Any | None, optional): Optional target data as - either a DataFrame or a ``Sample``-like object. If provided, the + either a ``DataFrame`` or a ``Sample`` object. If provided, the model-matrix inputs are prepared from a sample/target union of variables and rows. Defaults to None. variables (List[str] | None, optional): Variables to use from both @@ -311,7 +311,8 @@ def _prepare_input_model_matrix( ValueError: If requested ``variables`` are not present in the provided input frame(s) (and in both sample and target when target is supplied), if variables contain ``[`` or ``]``, or if - ``add_na=False`` drops all rows from sample/target. + ``add_na=False`` drops all rows from sample/target, or if + sample has zero rows. 
Returns: Dict[str, Any]: returns a dictionary containing two keys: 'all_data' and 'sample_n'. From ce2007aefe9636c0260222a486ec23f47a4ba4bd Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 23:05:16 +0530 Subject: [PATCH 6/9] Implement suggestions --- balance/utils/model_matrix.py | 3 ++- tests/test_util_model_matrix.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index d679a4fa5..d4a32dc5a 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -332,7 +332,8 @@ def _prepare_input_model_matrix( sample_df = sample._df else: sample_df = sample - assert sample_df.shape[0] > 0, "sample must have more than zero rows" + if sample_df.shape[0] == 0: + raise ValueError("sample must have more than zero rows") # NOTE: .copy() not needed as it is copied anyway in _concat_frames sample_n = sample_df.shape[0] sample_df = sample_df.loc[:, variables] diff --git a/tests/test_util_model_matrix.py b/tests/test_util_model_matrix.py index 05b7a05f4..05e4642ac 100644 --- a/tests/test_util_model_matrix.py +++ b/tests/test_util_model_matrix.py @@ -292,7 +292,7 @@ def test_model_matrix(self) -> None: # Test zero rows warning: self.assertRaisesRegex( - AssertionError, + ValueError, "sample must have more than zero rows", model_matrix, pd.DataFrame(), From 9478be518d11bb8d8c05b440346513617afaa72a Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 23:13:14 +0530 Subject: [PATCH 7/9] Update changelog --- CHANGELOG.md | 5 +++++ tests/test_util_model_matrix.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 61529bdfe..e59f4ede0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -61,6 +61,11 @@ like `a`, `a_1`, and repeated `a` names appear together. - Duplicate columns are now renamed deterministically to guaranteed-unique names, preventing downstream clashes after formula sanitization. 
+- **`model_matrix` empty-sample errors now raise `ValueError`** + - `_prepare_input_model_matrix()` now raises a deterministic `ValueError` + when the input sample has zero rows, instead of relying on an assertion. + - This aligns runtime behavior with documented exceptions and avoids + the check being skipped under `python -O`, which strips `assert` statements. ## Tests diff --git a/tests/test_util_model_matrix.py b/tests/test_util_model_matrix.py index 05e4642ac..03b859f09 100644 --- a/tests/test_util_model_matrix.py +++ b/tests/test_util_model_matrix.py @@ -290,7 +290,7 @@ def test_model_matrix(self) -> None: t, ) - # Test zero rows warning: + # Test zero rows error: self.assertRaisesRegex( ValueError, "sample must have more than zero rows", From 78a6d37bea3dc993b0c0ca38b23bec908a0b28cf Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 23:18:48 +0530 Subject: [PATCH 8/9] Use DataFrame.empty for empty-sample check --- balance/utils/model_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index d4a32dc5a..6f4a8d1ba 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -332,7 +332,7 @@ def _prepare_input_model_matrix( sample_df = sample._df else: sample_df = sample - if sample_df.shape[0] == 0: + if sample_df.empty: raise ValueError("sample must have more than zero rows") # NOTE: .copy() not needed as it is copied anyway in _concat_frames sample_n = sample_df.shape[0] From f5d40254c9ca9c5fb72b55fc5d7b1668d7bac9d2 Mon Sep 17 00:00:00 2001 From: Soumyadip Sarkar Date: Sun, 1 Mar 2026 23:23:28 +0530 Subject: [PATCH 9/9] Revert sample check --- balance/utils/model_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py index 6f4a8d1ba..d4a32dc5a 100644 --- a/balance/utils/model_matrix.py +++ b/balance/utils/model_matrix.py @@ -332,7 +332,7 @@ def _prepare_input_model_matrix( sample_df = 
sample._df else: sample_df = sample - if sample_df.empty: + if sample_df.shape[0] == 0: raise ValueError("sample must have more than zero rows") # NOTE: .copy() not needed as it is copied anyway in _concat_frames sample_n = sample_df.shape[0]