
Commit 59fdec4

Merge pull request #1 from czbiohub-sf/dev

Markov clustering, ternary plots, update license metadata

2 parents: 3f5e566 + 4210723

File tree

8 files changed (+151, -13 lines)


grassp/plotting/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 # from .heatmaps import grouped_heatmap
 from .heatmaps import protein_clustermap, sample_heatmap, qsep_heatmap, qsep_boxplot
-from .integration import aligned_umap, remodeling_sankey, remodeling_score
+from .integration import aligned_umap, remodeling_sankey, remodeling_score, mr_plot
 from .qc import bait_volcano_plots, highly_variable_proteins
 from .clustering import tagm_map_contours, tagm_map_pca_ellipses
+from .ternary import ternary
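
Both new plotting entry points are re-exported at the package level, so after this change they can be imported directly from grassp.plotting. A minimal import sketch (mr_plot's signature is not part of this diff, so it is only imported here):

from grassp.plotting import ternary, mr_plot  # new exports in this commit
from grassp.plotting import qsep_heatmap      # existing export, gains vmin/vmax below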

grassp/plotting/heatmaps.py

Lines changed: 11 additions & 4 deletions
@@ -210,6 +210,8 @@ def qsep_heatmap(
     normalize: bool = True,
     ax: plt.Axes = None,
     cmap: str = "RdBu_r",
+    vmin: float = None,
+    vmax: float = None,
     **kwargs,
 ) -> plt.Axes:
     """Plot QSep cluster distance heatmap.
@@ -248,12 +250,17 @@ def qsep_heatmap(
         # Normalize by diagonal values
         norm_distances = distances / np.diag(distances)[:, np.newaxis]
         plot_data = norm_distances[::-1]
-        vmin = 1.0
-        vmax = np.max(norm_distances)
+        tvmin = 1.0
+        tvmax = np.max(norm_distances)
     else:
         plot_data = distances[::-1]
-        vmin = None
-        vmax = None
+        tvmin = None
+        tvmax = None
+
+    if vmin is None:
+        vmin = tvmin
+    if vmax is None:
+        vmax = tvmax
 
     # Create heatmap
     sns.heatmap(
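
The new vmin/vmax parameters let callers pin the color limits; when left at None they fall back to the previously hard-coded values (1.0 and the normalized maximum when normalize=True, matplotlib defaults otherwise). A hedged usage sketch, assuming qsep_heatmap takes the AnnData object holding the QSep distances as its first argument (the leading parameters are outside this hunk):

import matplotlib.pyplot as plt
from grassp.plotting import qsep_heatmap

fig, ax = plt.subplots(figsize=(6, 5))
# adata is a placeholder for an AnnData on which the QSep distances were already computed.
# Fixing vmin/vmax puts two datasets on the same color scale.
qsep_heatmap(adata, normalize=True, ax=ax, cmap="RdBu_r", vmin=1.0, vmax=3.0)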

grassp/plotting/ternary.py

Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+from __future__ import annotations
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from typing import Optional, List
+    from anndata import AnnData
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+from scanpy.plotting._tools.scatterplots import (
+    _color_vector,
+    _get_color_source_vector,
+    _add_categorical_legend,
+    _get_palette,
+)
+
+
+def ternary(
+    adata: AnnData,
+    color: Optional[str] = None,
+    ax=None,
+    labels: Optional[List[str]] = None,
+    show: bool = True,
+    colorbar_loc: Optional[str] = None,
+    legend_loc: Optional[str] = None,
+    legend_fontweight: Optional[str] = None,
+    legend_fontsize: Optional[int] = None,
+    legend_fontoutline: Optional[str] = None,
+    na_in_legend: Optional[bool] = None,
+    **kwargs,
+):
+    try:
+        import mpltern
+    except ImportError:
+        raise ImportError(
+            "mpltern is not installed. Please install it with `pip install mpltern`"
+        )
+    if adata.X.shape[1] != 3:
+        raise ValueError("Ternary plots require an AnnData object with 3 samples (columns)")
+    if ax is None:
+        ax = plt.subplot(projection="ternary")
+    if labels is None:
+        labels = adata.var_names
+
+    csv = _get_color_source_vector(adata, color)
+
+    cv, color_type = _color_vector(adata, values_key=color, values=csv, palette=None)
+
+    # Make sure that nan values are plotted below the other points
+    nan_mask = np.isnan(csv) if isinstance(csv, np.ndarray) else csv.isna()
+    if nan_mask.any():
+        nan_points = adata[nan_mask].X
+        ax.scatter(
+            nan_points[:, 0],
+            nan_points[:, 1],
+            nan_points[:, 2],
+            c=cv[nan_mask],
+            **kwargs,
+            zorder=0,
+        )
+    cax = ax.scatter(
+        adata.X[~nan_mask, 0],
+        adata.X[~nan_mask, 1],
+        adata.X[~nan_mask, 2],
+        zorder=1,
+        c=cv[~nan_mask],
+        **kwargs,
+    )
+    ax.taxis.set_label_position("tick1")
+    ax.raxis.set_label_position("tick1")
+    ax.laxis.set_label_position("tick1")
+    ax.set_tlabel(labels[0])
+    ax.set_llabel(labels[1])
+    ax.set_rlabel(labels[2])
+
+    if color_type == "cat":
+        _add_categorical_legend(
+            ax,
+            csv,
+            palette=_get_palette(adata, color),
+            scatter_array=None,
+            legend_loc=legend_loc,
+            legend_fontweight=legend_fontweight,
+            legend_fontsize=legend_fontsize,
+            legend_fontoutline=legend_fontoutline,
+            na_color="grey",
+            na_in_legend=na_in_legend,
+            multi_panel=False,
+        )
+    elif colorbar_loc is not None:
+        plt.colorbar(
+            cax, ax=ax, pad=0.01, fraction=0.08, aspect=30, location=colorbar_loc
+        )
+    if show:
+        plt.show()
+    return ax
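
ternary() projects a three-column AnnData onto an mpltern ternary axis and reuses scanpy's internal color handling, so color accepts the same .obs/.var keys as scanpy scatter plots. A self-contained sketch with synthetic data (fraction names and categories are invented for illustration; exact rendering depends on the installed mpltern and scanpy versions):

import numpy as np
import pandas as pd
import anndata as ad
from grassp.plotting import ternary

# 200 proteins measured across exactly three fractions; ternary() raises otherwise.
rng = np.random.default_rng(0)
adata = ad.AnnData(rng.dirichlet([2, 2, 2], size=200))
adata.var_names = ["Cyt", "Mem", "Nuc"]  # hypothetical fraction labels
adata.obs["compartment"] = pd.Categorical(
    rng.choice(["ER", "Golgi", "cytosol"], size=200)  # hypothetical annotation
)

ax = ternary(adata, color="compartment", show=False, s=10)
ax.figure.savefig("ternary_example.png", dpi=150)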

grassp/preprocessing/enrichment.py

Lines changed: 29 additions & 4 deletions
@@ -11,6 +11,19 @@
 import warnings
 
 
+def _check_covariates(data: AnnData, covariates: Optional[list[str]] = None):
+    if covariates is None:
+        covariates = data.var.columns[data.var.columns.str.startswith("covariate_")]
+    # Check that all covariates are in the data
+    for c in covariates:
+        if c not in data.var.columns:
+            raise ValueError(f"Covariate {c} not found in data.var.columns")
+
+    if not isinstance(covariates, list):
+        covariates = [covariates]
+    return covariates
+
+
 def calculate_enrichment_vs_untagged(
     data: AnnData,
     covariates: Optional[list[str]] = [],
@@ -120,6 +133,17 @@ def calculate_enrichment_vs_untagged(
     return data_aggr
 
 
+def calculate_noc_proportions(
+    adata: AnnData,
+    covariates: Optional[list[str]] = None,
+    subcellular_enrichment_column: str = "subcellular_enrichment",
+    use_layer: Optional[str] = None,
+    original_intensities_key: str | None = None,
+    keep_raw: bool = True,
+) -> AnnData:
+    pass
+
+
 def calculate_enrichment_vs_all(
     adata: AnnData,
     covariates: Optional[list[str]] = None,
@@ -153,9 +177,10 @@ def calculate_enrichment_vs_all(
 
     data = adata.copy()
 
-    # if covariates is None:
-    # covariates = data.var.columns[data.var.columns.str.startswith("covariate_")]
-    # else:
+    if covariates is None:
+        covariates = data.var.columns[
+            data.var.columns.str.startswith("covariate_")
+        ].tolist()
     # Check that all covariates are in the data
     for c in covariates:
         if c not in data.var.columns:
@@ -198,7 +223,7 @@ def calculate_enrichment_vs_all(
     lfc = np.median(intensities_ip, axis=1) - np.median(intensities_control, axis=1)
     aggr_mask = data_aggr.var["_experimental_condition"] == experimental_condition
     data_aggr.layers["pvals"][:, aggr_mask] = pv[:, None]
-    data_aggr[:, aggr_mask].X = lfc[:, None]
+    data_aggr.X[:, aggr_mask] = lfc[:, None]
     data_aggr.var.loc[aggr_mask, "enriched_vs"] = ",".join(
         data_aggr.var_names[control_mask]
     )
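
Both the new _check_covariates helper and the un-commented block in calculate_enrichment_vs_all encode the same convention: with covariates=None, every data.var column whose name starts with "covariate_" is used. A short sketch of that convention (column names and values are invented; adata stands for an AnnData already prepared for grassp's enrichment workflow):

from grassp.preprocessing.enrichment import calculate_enrichment_vs_all

# Sample-level annotation lives in .var because samples are columns in grassp.
adata.var["covariate_batch"] = ["b1", "b1", "b2", "b2"]

# Under the new default handling these two calls should be equivalent:
res_auto = calculate_enrichment_vs_all(adata)  # picks up covariate_batch automatically
res_expl = calculate_enrichment_vs_all(adata, covariates=["covariate_batch"])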

grassp/preprocessing/simple.py

Lines changed: 1 addition & 1 deletion
@@ -532,7 +532,7 @@ def normalize_total(
 
 def drop_excess_MQ_metadata(
     data: AnnData,
-    colname_regex: str = "Peptide|peptide|MS/MS|Evidence IDs|Taxonomy|Oxidation|Intensity|Identification type|Sequence coverage|MS/MS count",
+    colname_regex: str = "Peptide|peptide|MS/MS|Evidence IDs|Taxonomy|Oxidation|Intensity|Total Spectral Count|Unique Spectral Count|Spectral Count|Identification type|Sequence coverage|MS/MS count",
     inplace: bool = True,
 ) -> AnnData | None:
     """Drop excess metadata columns from MaxQuant output.

grassp/tools/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@
     tagm_map_predict,
 )
 from .enrichment import calculate_cluster_enrichment, rank_proteins_groups
-from .integration import align_adatas, aligned_umap, remodeling_score
+from .integration import align_adatas, aligned_umap, remodeling_score, mr_score
 from .scoring import (
     calinski_habarasz_score,
     class_balance,
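
mr_score is now exposed from grassp.tools, mirroring the mr_plot export added to grassp.plotting above; its signature is not part of this diff, so only the imports are shown:

from grassp.tools import mr_score      # new export
from grassp.plotting import mr_plot    # companion plotting function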

grassp/tools/scoring.py

Lines changed: 4 additions & 1 deletion
@@ -205,13 +205,16 @@ def qsep_score(
     }
 
     for i, cluster1 in enumerate(valid_clusters):
-        for j, cluster2 in enumerate(valid_clusters):
+        for j in range(i, len(valid_clusters)):
+            # for j, cluster2 in enumerate(valid_clusters[i + 1 :]):
+            cluster2 = valid_clusters[j]
             idx1 = cluster_indices[cluster1]
             idx2 = cluster_indices[cluster2]
 
             # Get submatrix of distances between clusters
             submatrix = full_distances[np.ix_(idx1, idx2)]
             cluster_distances[i, j] = np.mean(submatrix)
+            cluster_distances[j, i] = np.mean(submatrix)
 
     if inplace:
         # Store full distances
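
The rewritten loop computes each between-cluster mean distance once for j >= i and mirrors it into the lower triangle, which keeps the matrix symmetric while roughly halving the number of submatrix means. A standalone sketch of the same pattern (generic names, not grassp's internals):

import numpy as np

def mean_cluster_distances(full_distances, cluster_indices, clusters):
    """Symmetric matrix of mean pairwise distances between clusters."""
    n = len(clusters)
    out = np.zeros((n, n))
    for i in range(n):
        for j in range(i, n):  # upper triangle only
            idx1 = cluster_indices[clusters[i]]
            idx2 = cluster_indices[clusters[j]]
            d = np.mean(full_distances[np.ix_(idx1, idx2)])
            out[i, j] = d
            out[j, i] = d  # mirror
    return out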

pyproject.toml

Lines changed: 5 additions & 1 deletion
@@ -9,7 +9,7 @@ requires = [
 name = "grassp"
 description = "A python package to facilitate Organellar profiling"
 readme = "README.md"
-license = {file = "LICENSE"}
+license = {text = "BSD 3-Clause License"}
 requires-python = ">=3.7, <4.0"
 
 # the dynamically determined project metadata attributes
@@ -38,6 +38,7 @@ dependencies = [
     "umap-learn",
     "pysankeybeta",
     "gseapy",
+    "markov_clustering",
 ]
 
 
@@ -84,6 +85,9 @@ packages = ["grassp"]
 # allow use of __file__ to load data files included in the package
 zip-safe = false
 
+# Don't include LICENSE as a license-file in the metadata
+license-files = []
+
 
 [tool.black]
 line-length = 95
