
Commit a6cba80

jgallowa07 and claude authored
Implement add_phenotypes_to_df for predictions on new variants (#181)
* Implement add_phenotypes_to_df for predictions on new variants (#173)

  This PR implements Model.add_phenotypes_to_df() to enable predictions on new
  variant data not seen during training, addressing issue #173.

  **Core Implementation:**
  - Implemented add_phenotypes_to_df() method in Model class
  - Converts input DataFrames to jaxmodels.Data format for predictions
  - Handles substitution conversion to reference frame
  - Validates mutations and raises informative errors for unseen mutations
  - Preserves all input DataFrame columns in output

  **Verbosity Control (Bonus Feature):**
  - Added verbose parameter to jaxmodels.fit() and Model.fit()
  - Enables silent fitting for doctests and automated workflows
  - Wrapped all progress print statements with verbose checks

  **Testing:**
  - Added 13 comprehensive unit tests covering all functionality
  - Includes explicit parameter validation test
  - All 36 model tests pass (13 new)
  - Includes working doctest example

  **Code Quality:**
  - Ruff linting: ✓ All checks passed
  - Black formatting: ✓ All checks passed
  - Full test coverage with edge cases

  Files changed:
  - multidms/model.py: Core implementation (+167 lines)
  - multidms/jaxmodels.py: Verbose parameter (+77 lines)
  - tests/test_model.py: Comprehensive tests (+272 lines)

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude <[email protected]>

* Fix Black action failing on Python 3.9

  Pin psf/black action to v24.10.0 instead of @stable to fix TypeError with
  union type syntax (str | None) on Python 3.9.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude Opus 4.5 <[email protected]>

---------

Co-authored-by: Claude <[email protected]>
1 parent 912f406 commit a6cba80
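
For orientation, here is a minimal usage sketch of the API this commit adds, adapted from the doctest introduced in multidms/model.py (shown in the diff below). The toy conditions, substitutions, and scores are illustrative only; the call signatures are taken directly from this commit.

import pandas as pd
from multidms import Data, Model

# Train on a small toy dataset (values are illustrative).
df_train = pd.DataFrame({
    "condition": ["a", "a", "b", "b"],
    "aa_substitutions": ["", "M1A", "", "M1A"],
    "func_score": [0.0, 1.2, 0.1, 1.5],
})
data = Data(df_train, reference="a")
model = Model(data, ge_type="Identity", l2reg=0.01)
_ = model.fit(maxiter=5, warmstart=False, verbose=False)  # new verbose flag silences progress output

# Predict functional scores for variants not present as rows during training.
df_new = pd.DataFrame({
    "condition": ["a", "b"],
    "aa_substitutions": ["M1A", "M1A"],
})
result = model.add_phenotypes_to_df(df_new)  # adds a 'predicted_func_score' column
print(result[["condition", "aa_substitutions", "predicted_func_score"]])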

File tree

4 files changed: +477 −41 lines changed


.github/workflows/build_test_package.yml

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ jobs:
           pip install -e ".[dev]"

       - name: Black Format Check
-        uses: psf/black@stable
+        uses: psf/black@24.10.0
         with:
           options: "--check"
           src: "."

multidms/jaxmodels.py

Lines changed: 45 additions & 32 deletions
@@ -371,6 +371,7 @@ def fit(
     beta_init: dict[str, Float[Array, " n_mutations"]] | None = None,
     alpha_init: dict[str, Float] | None = None,
     beta_clip_range: tuple[Float, Float] | None = None,
+    verbose: bool = True,
 ) -> tuple[Model, list[float]]:
     r"""
     Fit a model to data.
@@ -406,6 +407,8 @@ def fit(
             If None, no clipping is applied. Example: (-10.0, 10.0).
             This constrains mutation effect parameters during optimization
             to prevent extreme values.
+        verbose: Whether to print progress information during fitting (default: True).
+            If False, suppresses all print output.

     Returns:
         Tuple of (fitted model, loss trajectory).
@@ -571,7 +574,8 @@ def prox_block(β_block, hyperparameters, scaling=1.0):

     try:
         for k in range(block_iters):
-            print(f"iter {k + 1}:")
+            if verbose:
+                print(f"iter {k + 1}:")
             obj_old = objective_total(
                 model,
                 data_sets,
@@ -589,13 +593,16 @@ def prox_block(β_block, hyperparameters, scaling=1.0):
                 model_calibration, model_rest, data_sets, scale=scale
             )
             model = eqx.combine(model_calibration, model_rest)
-            print(
-                f" calibration block: error={state_calibration.error:.2e}, "
-                f"stepsize={state_calibration.stepsize:.1e}, "
-                f"iter={state_calibration.iter_num}"
-            )
-            for d in model.φ:
-                print(f" {d}: α={model.α[d]:.2f}, θ={jnp.exp(model.logθ[d]):.2f}")
+            if verbose:
+                print(
+                    f" calibration block: error={state_calibration.error:.2e}, "
+                    f"stepsize={state_calibration.stepsize:.1e}, "
+                    f"iter={state_calibration.iter_num}"
+                )
+                for d in model.φ:
+                    print(
+                        f" {d}: α={model.α[d]:.2f}, θ={jnp.exp(model.logθ[d]):.2f}"
+                    )

             # β0 block
             model_β0, model_rest = eqx.partition(model, filter_spec=filter_spec_β0)
@@ -607,12 +614,13 @@ def prox_block(β_block, hyperparameters, scaling=1.0):
                 beta0_ridge=beta0_ridge,
             )
             model = eqx.combine(model_β0, model_rest)
-            print(
-                f" β0 block: error={state_β0.error:.2e}, "
-                f"stepsize={state_β0.stepsize:.1e}, iter={state_β0.iter_num}"
-            )
-            for d in model.φ:
-                print(f" {d}: β0={model.φ[d].β0:.2f}")
+            if verbose:
+                print(
+                    f" β0 block: error={state_β0.error:.2e}, "
+                    f"stepsize={state_β0.stepsize:.1e}, iter={state_β0.iter_num}"
+                )
+                for d in model.φ:
+                    print(f" {d}: β0={model.φ[d].β0:.2f}")

             # determine bundle idxs (mutations that are non-wt in any condition)
             bundle_idxs = jax.lax.associative_scan(
@@ -644,11 +652,12 @@ def prox_block(β_block, hyperparameters, scaling=1.0):
                     model,
                     model.φ[d].β.at[idxs].set(β_block[d]),
                 )
-            print(
-                f" β_nonbundle: error={state_nonbundle.error:.2e}, "
-                f"stepsize={state_nonbundle.stepsize:.1e}, "
-                f"iter={state_nonbundle.iter_num}"
-            )
+            if verbose:
+                print(
+                    f" β_nonbundle: error={state_nonbundle.error:.2e}, "
+                    f"stepsize={state_nonbundle.stepsize:.1e}, "
+                    f"iter={state_nonbundle.iter_num}"
+                )

             # β bundle block
             idxs = jnp.where(bundle_idxs)[0]
@@ -674,19 +683,21 @@ def prox_block(β_block, hyperparameters, scaling=1.0):
                     model,
                     model.φ[d].β.at[idxs].set(β_block[d]),
                 )
-            print(
-                f" β_bundle: error={state_bundle.error:.2e}, "
-                f"stepsize={state_bundle.stepsize:.1e}, "
-                f"iter={state_bundle.iter_num}"
-            )
+            if verbose:
+                print(
+                    f" β_bundle: error={state_bundle.error:.2e}, "
+                    f"stepsize={state_bundle.stepsize:.1e}, "
+                    f"iter={state_bundle.iter_num}"
+                )

             # diagnostics
-            for d in model.φ:
-                if d != model.reference_condition:
-                    sparsity = (
-                        model.φ[d].β - model.φ[model.reference_condition].β == 0
-                    ).mean()
-                    print(f" {d} sparsity={sparsity:.1%}")
+            if verbose:
+                for d in model.φ:
+                    if d != model.reference_condition:
+                        sparsity = (
+                            model.φ[d].β - model.φ[model.reference_condition].β == 0
+                        ).mean()
+                        print(f" {d} sparsity={sparsity:.1%}")

             obj = objective_total(
                 model,
@@ -696,9 +707,11 @@ def prox_block(β_block, hyperparameters, scaling=1.0):
                 scale=scale,
                 beta0_ridge=beta0_ridge,
             )
-            print(f" {obj=:.2e}")
+            if verbose:
+                print(f" {obj=:.2e}")
             objective_error = abs(obj_old - obj) / max(abs(obj_old), abs(obj), 1)
-            print(f" {objective_error=:.2e}")
+            if verbose:
+                print(f" {objective_error=:.2e}")

             # store loss for trajectory
             loss_trajectory.append(float(obj))

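The hunks above gate every per-iteration progress print in the block-coordinate fit loop behind the new verbose flag. For comparison, a sketch of the stdout-redirection workaround this flag makes unnecessary; the fit arguments mirror the doctest added in multidms/model.py, and everything else here is illustrative rather than part of the commit.

import contextlib
import io

# Before this commit: the only way to silence fitting output was to swallow stdout.
with contextlib.redirect_stdout(io.StringIO()):
    _ = model.fit(maxiter=5, warmstart=False)

# With this commit: pass verbose=False and the per-iteration block diagnostics
# (calibration, β0, β_nonbundle, β_bundle, sparsity, objective error) are skipped.
_ = model.fit(maxiter=5, warmstart=False, verbose=False)
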
multidms/model.py

Lines changed: 159 additions & 8 deletions
@@ -137,6 +137,7 @@ def fit(
         ge_kwargs: dict = None,
         cal_kwargs: dict = None,
         loss_kwargs: dict = None,
+        verbose: bool = True,
     ):
         """
         Fit the model to data.
@@ -164,6 +165,8 @@ def fit(
             Keyword arguments for calibration (α) optimizer (e.g., tol, maxiter, maxls).
         loss_kwargs : dict, optional
             Keyword arguments for the loss function (e.g., δ for Huber loss).
+        verbose : bool
+            Whether to print progress information during fitting (default: True).

         Returns
         -------
@@ -219,6 +222,7 @@ def fit(
             ge_kwargs=ge_kwargs,
             cal_kwargs=cal_kwargs,
             loss_kwargs=loss_kwargs,
+            verbose=verbose,
         )

         return self
@@ -314,29 +318,176 @@ def get_variants_df(self, phenotype_as_effect: bool = True) -> pd.DataFrame:
     def add_phenotypes_to_df(
         self,
         df: pd.DataFrame,
-        phenotype_as_effect: bool = True,
+        substitutions_col: str = "aa_substitutions",
+        condition_col: str = "condition",
+        predicted_phenotype_col: str = "predicted_func_score",
+        overwrite_cols: bool = False,
     ) -> pd.DataFrame:
         """
         Add model predictions to a DataFrame of variants.

         Parameters
         ----------
         df : pd.DataFrame
-            DataFrame with 'condition' and 'aa_substitutions' columns.
-        phenotype_as_effect : bool
-            If True, report effects. If False, report raw latent phenotypes.
+            DataFrame with columns specified by `condition_col` and
+            `substitutions_col`. Additional columns will be preserved in output.
+        substitutions_col : str
+            Column in `df` giving variants as substitution strings.
+            Default is 'aa_substitutions'.
+        condition_col : str
+            Column in `df` giving the condition for each variant.
+            Values must exist in the model's conditions. Default is 'condition'.
+        predicted_phenotype_col : str
+            Name of column to add containing predicted functional scores.
+            Default is 'predicted_func_score'.
+        overwrite_cols : bool
+            If the specified predicted phenotype column already exists in `df`,
+            overwrite it? If False, raise an error.

         Returns
         -------
         pd.DataFrame
-            Input DataFrame with added prediction columns.
+            A copy of `df` with predictions added.
+
+        Raises
+        ------
+        ValueError
+            If model is not fitted, required columns are missing, indices are
+            not unique, conditions are invalid, or substitutions contain
+            mutations not seen during training.
+
+        Example
+        -------
+        >>> import pandas as pd
+        >>> from multidms import Data, Model
+        >>> df_train = pd.DataFrame({
+        ...     'condition': ['a', 'a', 'b', 'b'],
+        ...     'aa_substitutions': ['', 'M1A', '', 'M1A'],
+        ...     'func_score': [0.0, 1.2, 0.1, 1.5]
+        ... })
+        >>> data = Data(df_train, reference='a')  # doctest: +ELLIPSIS
+        >>> model = Model(data, ge_type='Identity', l2reg=0.01)
+        >>> _ = model.fit(maxiter=5, warmstart=False, verbose=False)
+        >>> df_new = pd.DataFrame({
+        ...     'condition': ['a', 'b'],
+        ...     'aa_substitutions': ['M1A', 'M1A']
+        ... })
+        >>> result = model.add_phenotypes_to_df(df_new)
+        >>> 'predicted_func_score' in result.columns
+        True
+        >>> len(result)
+        2
         """
         if self._jax_model is None:
             raise ValueError("Model has not been fitted. Call fit() first.")

-        # See issue #173 for implementing prediction on new variants
-        # and calling predict_score()
-        raise NotImplementedError("add_phenotypes_to_df is not yet implemented in v2.0")
+        # Validate input
+        if substitutions_col not in df.columns:
+            raise ValueError(f"`df` lacks column '{substitutions_col}'")
+        if condition_col not in df.columns:
+            raise ValueError(f"`df` lacks column '{condition_col}'")
+        if not df.index.is_unique:
+            raise ValueError("`df` must have unique indices")
+
+        # Check for invalid conditions
+        invalid_conditions = set(df[condition_col]) - set(self._data.conditions)
+        if invalid_conditions:
+            raise ValueError(
+                f"Invalid conditions in df: {invalid_conditions}. "
+                f"Valid conditions: {self._data.conditions}"
+            )
+
+        # Return copy
+        ret = df.copy()
+
+        # Check if column exists and handle overwrite
+        if predicted_phenotype_col in ret.columns and not overwrite_cols:
+            raise ValueError(
+                f"`df` already contains column '{predicted_phenotype_col}'. "
+                "Set overwrite_cols=True to overwrite."
+            )
+
+        # Initialize prediction column
+        ret[predicted_phenotype_col] = np.nan
+
+        # Get reference binarymap for encoding
+        ref_bmap = self._data.binarymaps[self._data.reference]
+
+        # Process each condition separately
+        for condition, condition_df in df.groupby(condition_col):
+            # Convert substitutions to reference frame if needed
+            variant_subs = condition_df[substitutions_col]
+            if condition not in self._data.reference_sequence_conditions:
+                variant_subs = condition_df.apply(
+                    lambda x: self._data.convert_subs_wrt_ref_seq(
+                        condition, x[substitutions_col]
+                    ),
+                    axis=1,
+                )
+
+            # Build binary variant matrix
+            row_ind = []  # row indices of elements that are one
+            col_ind = []  # column indices of elements that are one
+            unseen_mutations = set()
+
+            for ivariant, subs in enumerate(variant_subs):
+                try:
+                    for isub in ref_bmap.sub_str_to_indices(subs):
+                        row_ind.append(ivariant)
+                        col_ind.append(isub)
+                except ValueError:
+                    # Extract the individual mutations that are unseen
+                    if subs:  # non-empty string
+                        for mut in subs.split():
+                            if mut not in self._data.mutations:
+                                unseen_mutations.add(mut)
+
+            # If there are unseen mutations, raise an error
+            if unseen_mutations:
+                raise ValueError(
+                    f"Variants contain mutations not seen during training: "
+                    f"{sorted(unseen_mutations)}"
+                )
+
+            # Create sparse matrix
+            import scipy.sparse
+            from jax.experimental import sparse as jsparse
+
+            X = jsparse.BCOO.from_scipy_sparse(
+                scipy.sparse.csr_matrix(
+                    (np.ones(len(row_ind), dtype="int8"), (row_ind, col_ind)),
+                    shape=(len(condition_df), ref_bmap.binarylength),
+                    dtype="int8",
+                )
+            )
+
+            # Create jaxmodels.Data object for this condition
+            # We need x_wt from the training data
+            x_wt = self._jax_data_sets[condition].x_wt
+
+            # Create a temporary Data object with dummy functional scores
+            import multidms.jaxmodels as jaxmodels
+
+            temp_data = jaxmodels.Data(
+                x_wt=x_wt,
+                X=X,
+                functional_scores=np.zeros(len(condition_df)),  # dummy values
+            )
+
+            # Make predictions using jaxmodels
+            temp_data_sets = {condition: temp_data}
+            predictions = self._jax_model.predict_score(temp_data_sets)
+
+            # Extract predictions for this condition
+            phenotype_predictions = np.array(predictions[condition])
+            assert len(phenotype_predictions) == len(condition_df)
+
+            # Add predictions to result dataframe
+            ret.loc[
+                condition_df.index.values, predicted_phenotype_col
+            ] = phenotype_predictions
+
+        return ret

     def __repr__(self):
         """String representation."""

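Since add_phenotypes_to_df raises ValueError when a variant contains mutations absent from the training data, a caller may want to trap that case explicitly. A hedged sketch follows; the mutation 'M1W' and the surrounding variable names are hypothetical, while the keyword names come from the signature added above.

import pandas as pd

df_new = pd.DataFrame({
    "condition": ["a"],
    "aa_substitutions": ["M1W"],  # hypothetical mutation not present in training data
})
try:
    result = model.add_phenotypes_to_df(
        df_new,
        substitutions_col="aa_substitutions",
        condition_col="condition",
        predicted_phenotype_col="predicted_func_score",
        overwrite_cols=False,
    )
except ValueError as err:
    # e.g. "Variants contain mutations not seen during training: ['M1W']"
    print(f"Skipping prediction: {err}")
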