From 967acf40b97b093f86dbc43dcdcc03722d46f8e1 Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 19:03:26 +0530
Subject: [PATCH 1/9] Clarify model-matrix input docstrings

---
 CHANGELOG.md                  |  4 ++++
 balance/utils/model_matrix.py | 20 +++++++++++++++-----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2bc39a0c6..61529bdfe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -44,6 +44,10 @@
     appear in the final output CSV. Keep columns that are not id, weight,
     covariate, or outcome columns will be placed into ``ignore_columns`` during
     processing but are still retained and available in the output.
+- **Clarified `_prepare_input_model_matrix` argument docs**
+  - Updated docstrings in `balance.utils.model_matrix` with
+    explicit descriptions for `sample`, `target`, `variables`, and `add_na`
+    behavior when preparing model-matrix inputs.
 
 ## Bug Fixes
 
diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index 74e72bee6..e216e7ad6 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -285,11 +285,21 @@ def _prepare_input_model_matrix(
         - Add na indicator if required.
 
     Args:
-        sample (pd.DataFrame | Any): This can either be a DataFrame or a Sample object. TODO: add text.
-        target (pd.DataFrame | Any | None, optional): This can either be a DataFrame or a Sample object.. Defaults to None.
-        variables (List[str] | None, optional): Defaults to None. TODO: add text.
-        add_na (bool, optional): Defaults to True. TODO: add text.
-        fix_columns_names (bool, optional): Defaults to True. If to fix the column names of the DataFrame by changing special characters to '_'.
+        sample (pd.DataFrame | Any): Input sample data as either a DataFrame or
+            a ``Sample``-like object that stores the data in ``._df``.
+        target (pd.DataFrame | Any | None, optional): Optional target data as
+            either a DataFrame or a ``Sample``-like object. If provided, rows
+            are concatenated with sample rows for downstream matrix creation.
+            Defaults to None.
+        variables (List[str] | None, optional): Explicit variables to keep from
+            ``sample``/``target`` before concatenation. If None, variables are
+            inferred via ``choose_variables`` on the provided inputs.
+        add_na (bool, optional): If True, add missingness indicator columns to
+            the concatenated data. If False, drop rows with missing values and
+            preserve target-only-all-NA validation behavior. Defaults to True.
+        fix_columns_names (bool, optional): Defaults to True. If to fix the
+            column names of the DataFrame by changing special characters to
+            '_'.
 
     Raises:
         Exception: "Variable names cannot contain characters '[' or ']'"

From 2cd6c4e394771d44e2c46f51b5b73b018ab7e5cf Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 22:16:43 +0530
Subject: [PATCH 2/9] Document validation and ValueError cases in model-matrix docstring

---
 balance/utils/model_matrix.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index e216e7ad6..40f05266b 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -286,23 +286,29 @@ def _prepare_input_model_matrix(
 
     Args:
         sample (pd.DataFrame | Any): Input sample data as either a DataFrame or
-            a ``Sample``-like object that stores the data in ``._df``.
+            a ``Sample``-like object that stores the underlying frame in
+            ``._df``.
         target (pd.DataFrame | Any | None, optional): Optional target data as
-            either a DataFrame or a ``Sample``-like object. If provided, rows
-            are concatenated with sample rows for downstream matrix creation.
-            Defaults to None.
-        variables (List[str] | None, optional): Explicit variables to keep from
-            ``sample``/``target`` before concatenation. If None, variables are
-            inferred via ``choose_variables`` on the provided inputs.
-        add_na (bool, optional): If True, add missingness indicator columns to
-            the concatenated data. If False, drop rows with missing values and
-            preserve target-only-all-NA validation behavior. Defaults to True.
-        fix_columns_names (bool, optional): Defaults to True. If to fix the
-            column names of the DataFrame by changing special characters to
-            '_'.
+            either a DataFrame or a ``Sample``-like object. If provided, the
+            model-matrix inputs are prepared from a sample/target union of
+            variables and rows. Defaults to None.
+        variables (List[str] | None, optional): Variables to use from both
+            inputs. If provided, `choose_variables` validates that each
+            requested variable exists in both sample and target (when target is
+            supplied), otherwise it raises ``ValueError``. If None, variables
+            are inferred by `choose_variables`.
+        add_na (bool, optional): If True, add NA indicator columns before
+            model-matrix creation. If False, drop rows containing missing
+            values; this can raise ``ValueError`` if dropping rows empties the
+            sample or target. Defaults to True.
+        fix_columns_names (bool, optional): Whether to sanitize column names by
+            replacing non-word characters with ``_`` and making duplicate names
+            unique. Defaults to True.
 
     Raises:
-        Exception: "Variable names cannot contain characters '[' or ']'"
+        ValueError: If requested ``variables`` are not present in both inputs,
+            if variables contain ``[`` or ``]``, or if ``add_na=False`` drops
+            all rows from sample/target.
 
     Returns:
         Dict[str, Any]: returns a dictionary containing two keys: 'all_data' and 'sample_n'.

From e14a7f873f97c431242668deca11bf3f4c93b802 Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 22:22:59 +0530
Subject: [PATCH 3/9] Note that Sample inputs are validated against covariate names

---
 balance/utils/model_matrix.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index 40f05266b..9811d485f 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -295,8 +295,10 @@ def _prepare_input_model_matrix(
         variables (List[str] | None, optional): Variables to use from both
             inputs. If provided, `choose_variables` validates that each
             requested variable exists in both sample and target (when target is
-            supplied), otherwise it raises ``ValueError``. If None, variables
-            are inferred by `choose_variables`.
+            supplied), otherwise it raises ``ValueError``. For ``Sample``
+            inputs, this validation/inference is based on covariate names
+            (``sample.covars().names()``), not all raw ``._df`` columns. If
+            None, variables are inferred by `choose_variables`.
         add_na (bool, optional): If True, add NA indicator columns before
             model-matrix creation. If False, drop rows containing missing
             values; this can raise ``ValueError`` if dropping rows empties the

From b9876f5d7db8ae87f10ed8fea7e199a7b2ba80b9 Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 22:35:19 +0530
Subject: [PATCH 4/9] Clarify Raises wording for sample-only vs sample/target inputs

---
 balance/utils/model_matrix.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index 9811d485f..82e55654b 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -308,9 +308,10 @@ def _prepare_input_model_matrix(
             unique. Defaults to True.
 
     Raises:
-        ValueError: If requested ``variables`` are not present in both inputs,
-            if variables contain ``[`` or ``]``, or if ``add_na=False`` drops
-            all rows from sample/target.
+        ValueError: If requested ``variables`` are not present in the
+            provided input frame(s) (and in both sample and target when target
+            is supplied), if variables contain ``[`` or ``]``, or if
+            ``add_na=False`` drops all rows from sample/target.
 
     Returns:
         Dict[str, Any]: returns a dictionary containing two keys: 'all_data' and 'sample_n'.

From 7ed3243b777bb9682b0c31392004fe49b4558898 Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 22:51:03 +0530
Subject: [PATCH 5/9] Describe Sample input type and document zero-row ValueError

---
 balance/utils/model_matrix.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index 82e55654b..d679a4fa5 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -285,11 +285,11 @@ def _prepare_input_model_matrix(
         - Add na indicator if required.
 
     Args:
-        sample (pd.DataFrame | Any): Input sample data as either a DataFrame or
-            a ``Sample``-like object that stores the underlying frame in
-            ``._df``.
+        sample (pd.DataFrame | Any): Input sample data as either a
+            ``pandas.DataFrame`` or a ``Sample`` object from
+            ``balance.sample_class`` (recognized via ``_isinstance_sample``).
         target (pd.DataFrame | Any | None, optional): Optional target data as
-            either a DataFrame or a ``Sample``-like object. If provided, the
+            either a ``DataFrame`` or a ``Sample`` object. If provided, the
             model-matrix inputs are prepared from a sample/target union of
             variables and rows. Defaults to None.
         variables (List[str] | None, optional): Variables to use from both
@@ -311,7 +311,8 @@ def _prepare_input_model_matrix(
         ValueError: If requested ``variables`` are not present in the
             provided input frame(s) (and in both sample and target when target
             is supplied), if variables contain ``[`` or ``]``, or if
-            ``add_na=False`` drops all rows from sample/target.
+            ``add_na=False`` drops all rows from sample/target. Also raised
+            if ``sample`` has zero rows.
 
     Returns:
         Dict[str, Any]: returns a dictionary containing two keys: 'all_data' and 'sample_n'.

From ce2007aefe9636c0260222a486ec23f47a4ba4bd Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 23:05:16 +0530
Subject: [PATCH 6/9] Raise ValueError instead of assert for empty sample

---
 balance/utils/model_matrix.py   | 3 ++-
 tests/test_util_model_matrix.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index d679a4fa5..d4a32dc5a 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -332,7 +332,8 @@ def _prepare_input_model_matrix(
         sample_df = sample._df
     else:
         sample_df = sample
-    assert sample_df.shape[0] > 0, "sample must have more than zero rows"
+    if sample_df.shape[0] == 0:
+        raise ValueError("sample must have more than zero rows")
     # NOTE: .copy() not needed as it is copied anyway in _concat_frames
     sample_n = sample_df.shape[0]
     sample_df = sample_df.loc[:, variables]
diff --git a/tests/test_util_model_matrix.py b/tests/test_util_model_matrix.py
index 05b7a05f4..05e4642ac 100644
--- a/tests/test_util_model_matrix.py
+++ b/tests/test_util_model_matrix.py
@@ -292,7 +292,7 @@ def test_model_matrix(self) -> None:
 
         # Test zero rows warning:
         self.assertRaisesRegex(
-            AssertionError,
+            ValueError,
             "sample must have more than zero rows",
             model_matrix,
             pd.DataFrame(),

From 9478be518d11bb8d8c05b440346513617afaa72a Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 23:13:14 +0530
Subject: [PATCH 7/9] Add changelog entry for empty-sample ValueError; fix test comment

---
 CHANGELOG.md                    | 5 +++++
 tests/test_util_model_matrix.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61529bdfe..e59f4ede0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -61,6 +61,11 @@
     like `a`, `a_1`, and repeated `a` names appear together.
   - Duplicate columns are now renamed deterministically to guaranteed-unique
     names, preventing downstream clashes after formula sanitization.
+- **`model_matrix` empty-sample errors now raise `ValueError`**
+  - `_prepare_input_model_matrix()` now raises a deterministic `ValueError`
+    when the input sample has zero rows, instead of relying on an assertion.
+  - This aligns runtime behavior with the documented exceptions; the old
+    `assert` was skipped entirely when Python runs with `-O`.
 
 ## Tests
 
diff --git a/tests/test_util_model_matrix.py b/tests/test_util_model_matrix.py
index 05e4642ac..03b859f09 100644
--- a/tests/test_util_model_matrix.py
+++ b/tests/test_util_model_matrix.py
@@ -290,7 +290,7 @@ def test_model_matrix(self) -> None:
             t,
         )
 
-        # Test zero rows warning:
+        # Test zero rows error:
         self.assertRaisesRegex(
             ValueError,
             "sample must have more than zero rows",

From 78a6d37bea3dc993b0c0ca38b23bec908a0b28cf Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 23:18:48 +0530
Subject: [PATCH 8/9] Use DataFrame.empty for empty-sample check

---
 balance/utils/model_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index d4a32dc5a..6f4a8d1ba 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -332,7 +332,7 @@ def _prepare_input_model_matrix(
         sample_df = sample._df
     else:
         sample_df = sample
-    if sample_df.shape[0] == 0:
+    if sample_df.empty:
         raise ValueError("sample must have more than zero rows")
     # NOTE: .copy() not needed as it is copied anyway in _concat_frames
     sample_n = sample_df.shape[0]

From f5d40254c9ca9c5fb72b55fc5d7b1668d7bac9d2 Mon Sep 17 00:00:00 2001
From: Soumyadip Sarkar <soumya.papanvk18@gmail.com>
Date: Sun, 1 Mar 2026 23:23:28 +0530
Subject: [PATCH 9/9] Revert empty-sample check to shape[0] == 0

---
 balance/utils/model_matrix.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/balance/utils/model_matrix.py b/balance/utils/model_matrix.py
index 6f4a8d1ba..d4a32dc5a 100644
--- a/balance/utils/model_matrix.py
+++ b/balance/utils/model_matrix.py
@@ -332,7 +332,7 @@ def _prepare_input_model_matrix(
         sample_df = sample._df
     else:
         sample_df = sample
-    if sample_df.empty:
+    if sample_df.shape[0] == 0:
         raise ValueError("sample must have more than zero rows")
     # NOTE: .copy() not needed as it is copied anyway in _concat_frames
     sample_n = sample_df.shape[0]