make new localization file

mffrank · mffrank · commit 8db75e603794 · 2026-02-02T06:42:24.000-08:00
diff --git a/grassp/tests/test_plotting_integration.py b/grassp/tests/test_plotting_integration.py
@@ -21,7 +21,7 @@
 
 from grassp.plotting import clustering, heatmaps, integration, qc, ternary  # noqa: E402
 from grassp.preprocessing import enrichment, simple  # noqa: E402
-from grassp.tools import clustering as tl_clustering  # noqa: E402
+from grassp.tools import localization as tl_localization  # noqa: E402
 from grassp.tools import scoring  # noqa: E402
 
 # ==============================================================================
@@ -211,7 +211,7 @@ def test_knn_violin_smoke(self):
         )
 
         # Run KNN annotation to get predictions
-        tl_clustering.knn_annotation(
+        tl_localization.knn_annotation(
             adata, gt_col="markers", key_added="knn_pred", min_probability=0
         )
 
diff --git a/grassp/tests/test_tools_integration.py b/grassp/tests/test_tools_integration.py
@@ -24,6 +24,7 @@
     clustering,
     enrichment,
     integration,
+    localization,
     scoring,
     tagm,
 )
@@ -273,7 +274,7 @@ def test_knn_annotation_basic(self):
             n_proteins=100, marker_fraction=0.3, add_neighbors=True
         )
 
-        clustering.knn_annotation(
+        localization.knn_annotation(
             adata,
             gt_col="markers",
             key_added="knn_annotation",
@@ -294,7 +295,7 @@ def test_knn_annotation_fix_markers(self):
             n_proteins=100, marker_fraction=0.3, add_neighbors=True
         )
 
-        clustering.knn_annotation(
+        localization.knn_annotation(
             adata,
             gt_col="markers",
             key_added="knn_fixed",
@@ -632,7 +633,7 @@ def test_knn_f1_score_with_prediction(self):
         )
 
         # First create predictions (use min_probability=0 to get all predictions)
-        clustering.knn_annotation(
+        localization.knn_annotation(
             adata, gt_col="markers", key_added="predictions", min_probability=0
         )
 
@@ -1220,7 +1221,7 @@ def test_clustering_annotation_workflow(self):
         assert "mc_cluster" in adata.obs.columns
 
         # Step 2: KNN annotation
-        clustering.knn_annotation(
+        localization.knn_annotation(
             adata,
             gt_col="markers",
             key_added="knn_annotation",
@@ -1305,7 +1306,7 @@ def test_knn_annotation_missing_column(self):
         adata = make_enriched_data_with_structure(n_proteins=50, add_neighbors=True)
 
         with pytest.raises(KeyError):
-            clustering.knn_annotation(adata, gt_col="nonexistent_column")
+            localization.knn_annotation(adata, gt_col="nonexistent_column")
 
     def test_silhouette_score_missing_embedding(self):
         """Test error when embedding not found."""
diff --git a/grassp/tools/__init__.py b/grassp/tools/__init__.py
@@ -1,13 +1,13 @@
 from .clustering import (
     calculate_interfacialness_score,
     get_n_nearest_neighbors,
-    knn_annotation,
     leiden_mito_sweep,
     markov_clustering,
     to_knn_graph,
 )
 from .enrichment import calculate_cluster_enrichment, rank_proteins_groups
 from .integration import align_adatas, aligned_umap, mr_score, remodeling_score
+from .localization import knn_annotation, knn_annotation_old
 from .scoring import (
     calinski_habarasz_score,
     class_balance,
diff --git a/grassp/tools/clustering.py b/grassp/tools/clustering.py
@@ -11,6 +11,8 @@
 import pandas as pd
 import scanpy as sc
 
+from .localization import _get_knn_annotation_df
+
 
 def _get_clusters(matrix):
     # get the attractors - non-zero elements of the matrix diagonal
@@ -149,157 +151,6 @@ def leiden_mito_sweep(
         data.uns["leiden"]["mito_majority_fraction"] = mito_majority_fraction
 
 
-def _get_knn_annotation_df(
-    data: AnnData, obs_ann_col: str, exclude_category: str | List[str] | None = None
-) -> pd.DataFrame:
-    """
-    Get a dataframe with a column of .obs repeated for each protein.
-    """
-    nrow = data.obs.shape[0]
-    obs_ann = data.obs[obs_ann_col]
-    if isinstance(exclude_category, str):
-        exclude_category = [exclude_category]
-    if exclude_category is not None:
-        obs_ann.replace(exclude_category, np.nan, inplace=True)
-
-    df = pd.DataFrame(np.tile(obs_ann, (nrow, 1)))
-    return df
-
-
-def knn_annotation(
-    data,
-    gt_col,
-    fix_markers=False,
-    class_balance=True,
-    min_probability=0.5,
-    inplace=True,
-    obsp_key="connectivities",
-    key_added="knn_annotation",
-):
-    """Propagate categorical annotations along the *k*-NN graph.
-
-    For each observation the function inspects its neighbourhood in
-    ``adata.obsp[obsp_key]`` (generated by :func:`scanpy.pp.neighbors`) and
-    calculates the a weighted probability for each label category.
-
-    Parameters
-    ----------
-    data
-        :class:`anndata.AnnData` with a populated neighbour graph (*distances*
-        or *connectivities*).
-    gt_col
-        Observation column containing the *source* annotations to be
-        propagated.
-    fix_markers
-        If ``True`` marker probabilities do not get overwritten by the propagated labels.
-    class_balance
-        If ``True`` ground truth compartments with a lot of proteins are downweighted proportional to their size to prevent them from dominating the propagated labels.
-    min_probability
-        If the probability of the most probable label is below this threshold, the label is set to ``np.nan``.
-    obsp_key
-        Name of the neighbour connectivity graph to use (default ``"connectivities"``).
-    key_added
-        Name of the new column that will hold the propagated annotation
-        (default ``"knn_annotation"``).
-
-    Returns
-    -------
-    Modified anndata object with the following new entries:
-    - .obsm[f"{key_added}_probabilities"] containing the propagated probabilities
-    - .obs[f"{key_added}"] containing the propagated labels (most probable label)
-    - .uns[f"{key_added}_colors"] to make sure plotting uses the same colors as the ground truth labels
-    - .obs[f"{key_added}_probability"] containing the probability of the most probable label
-    """
-    labels = data.obs[gt_col].astype("category")
-    labels_one_hot = pd.get_dummies(labels).values
-    T = data.obsp[obsp_key]
-    # Propagate the labels with transition matrix T
-    Y = T @ labels_one_hot
-    Y[Y.sum(axis=1) == 0] = 1 / Y.shape[1]
-
-    # Class balance
-    if class_balance:
-        # gt_compartments with a lot of proteins are more likely to be in the neighborhood of a protein
-        # Adjust probability based on the number of proteins in the compartment
-        Y = Y / np.nansum(Y, axis=0) * labels_one_hot.sum(axis=0)
-        #
-    # Normalize the propagated labels to get probabilities
-    if any(Y.sum(axis=1) == 0):
-        print(Y[Y.sum(axis=1) == 0])
-    Y = Y / np.nansum(Y, axis=1)[:, None]
-
-    if fix_markers:
-        # Set markers to 1
-        marker_mask = labels_one_hot.sum(axis=1) == 1
-        Y[marker_mask] = labels_one_hot[marker_mask].astype(float)
-
-    if inplace:
-        data.obsm[f"{key_added}_probabilities"] = Y
-        data.obsm[f"{key_added}_one_hot_labels"] = labels_one_hot
-        data.obs[f"{key_added}"] = labels.cat.categories[Y.argmax(axis=1)]
-        data.obs[f"{key_added}_probability"] = np.max(Y, axis=1)
-        data.obs.loc[
-            data.obs[f"{key_added}_probability"] < min_probability, f"{key_added}"
-        ] = np.nan
-        if f"{gt_col}_colors" in data.uns:
-            data.uns[f"{key_added}_colors"] = data.uns[f"{gt_col}_colors"]
-    else:
-        return {
-            "probabilities": Y,
-            "labels": labels.cat.categories,
-            "one_hot_labels": labels_one_hot,
-        }
-
-
-def knn_annotation_old(
-    data: AnnData,
-    obs_ann_col: str,
-    key_added: str = "consensus_graph_annotation",
-    exclude_category: str | List[str] | None = None,
-    inplace: bool = True,
-) -> AnnData | None:
-    """Propagate categorical annotations along the *k*-NN graph.
-
-    For each observation the function inspects its neighbourhood in
-    ``adata.obsp['distances']`` (generated by :func:`scanpy.pp.neighbors`) and
-    assigns the majority category found in ``obs_ann_col``.  Ties are broken
-    arbitrarily using :func:`pandas.DataFrame.mode`.
-
-    Parameters
-    ----------
-    data
-        :class:`anndata.AnnData` with a populated neighbour graph (*distances*
-        or *connectivities*).
-    obs_ann_col
-        Observation column containing the *source* annotations to be
-        propagated.
-    key_added
-        Name of the new column that will hold the *consensus* annotation
-        (default ``"consensus_graph_annotation"``).
-    exclude_category
-        One or multiple category labels that should be ignored when computing
-        the neighbourhood majority (useful for *unknown* / *NA* categories).
-    inplace
-        If ``True`` (default) modify *data* in place.  Otherwise return a
-        copy with the additional column.
-
-    Returns
-    -------
-    Modified object when ``inplace`` is ``False`` with a new column in .obs[key_added].
-    """
-    df = _get_knn_annotation_df(data, obs_ann_col, exclude_category)
-
-    conn = data.obsp["distances"]
-    mask = ~(conn != 0).todense()  # This avoids expensive conn == 0 for sparse matrices
-    df[mask] = np.nan
-
-    majority_cluster = df.mode(axis=1, dropna=True).loc[
-        :, 0
-    ]  # take the first if there are ties
-    data.obs[key_added] = majority_cluster.values
-    return data if not inplace else None
-
-
 def to_knn_graph(
     data: AnnData,
     node_label_column: str | None = None,
diff --git a/grassp/tools/localization.py b/grassp/tools/localization.py
diff --git a/grassp/tools/scoring.py b/grassp/tools/scoring.py

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@`
`21`	`21`
`22`	`22`	`from grassp.plotting import clustering, heatmaps, integration, qc, ternary # noqa: E402`
`23`	`23`	`from grassp.preprocessing import enrichment, simple # noqa: E402`
`24`		`-from grassp.tools import clustering as tl_clustering # noqa: E402`
	`24`	`+from grassp.tools import localization as tl_localization # noqa: E402`
`25`	`25`	`from grassp.tools import scoring # noqa: E402`
`26`	`26`
`27`	`27`	`# ==============================================================================`
`@@ -211,7 +211,7 @@ def test_knn_violin_smoke(self):`
`211`	`211`	`)`
`212`	`212`
`213`	`213`	`# Run KNN annotation to get predictions`
`214`		`- tl_clustering.knn_annotation(`
	`214`	`+ tl_localization.knn_annotation(`
`215`	`215`	`adata, gt_col="markers", key_added="knn_pred", min_probability=0`
`216`	`216`	`)`
`217`	`217`