[pre-commit.ci] auto fixes from pre-commit.com hooks

pre-commit-ci[bot] · pre-commit-ci[bot] · commit 075dc9c8d7f2 · 2025-05-26T20:54:58.000Z
for more information, see https://pre-commit.ci
diff --git a/docs/notebooks/paul15_mouse_hematopoiesis.ipynb b/docs/notebooks/paul15_mouse_hematopoiesis.ipynb
@@ -1158,4 +1158,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 0
-}
+}
diff --git a/src/eschr/tl/_clustering.py b/src/eschr/tl/_clustering.py
@@ -1,15 +1,13 @@
 import math
 import random
-import time
 import traceback
 import warnings
 
 import igraph as ig
 import leidenalg as la
-import zarr
 import numpy as np
-import pandas as pd
-from scipy.sparse import coo_matrix, csr_matrix, lil_matrix, diags
+import zarr
+from scipy.sparse import coo_matrix, lil_matrix
 from sklearn_ann.kneighbors.annoy import AnnoyTransformer
 
 from ._prune_features import run_pca_dim_reduction
@@ -20,6 +18,7 @@
 # Hyperparameter Utils
 ########################################################################################################################################################
 
+
 def get_subsamp_size(n):  # n==data.shape[0]
     """
     Generate subsample size.
@@ -91,27 +90,30 @@ def get_hyperparameters(k_range, la_res_range, metric=None):
         metric = ["euclidean", "cosine"][random.sample(range(2), 1)[0]]
     return k, la_res, metric
 
+
 ########################################################################################################################################################
 # Clustering Utils
 ########################################################################################################################################################
 
+
 def sparse_put_clusters(n_orig, subsample_ids, cluster_values):
     """Create a sparse cluster matrix without using put_along_axis"""
-    
+
     # Get number of clusters (accounting for zero as non-cluster)
     n_clusters = len(np.unique(cluster_values))
-    
+
     # Create COO matrix directly from indices and values
     # For each data point in subsample_ids, create a 1 in its cluster column
     rows = subsample_ids
     cols = cluster_values
     data = np.ones_like(subsample_ids, dtype=np.uint8)
-    
+
     # Create the sparse matrix
     c = coo_matrix((data, (rows, cols)), shape=(n_orig, n_clusters))
-    
+
     return c
 
+
 # Util adapted from scanpy:
 def get_igraph_from_adjacency(adjacency, directed=None):
     """Get igraph graph from adjacency matrix."""
@@ -180,6 +182,7 @@ def run_la_clustering(X, k, la_res, metric="euclidean", method="sw-graph"):
     # print ("time to run leiden clustering: " + str(time_leiden))
     return np.array([leiden_out.membership])
 
+
 def get_hard_soft_clusters(n, clustering, bg):
     """
     Generate hard and soft clusters for a single bipartite clustering.
@@ -199,68 +202,70 @@ def get_hard_soft_clusters(n, clustering, bg):
         Hard cluster assignments for every sample.
     soft_membership_matrix : :class:`scipy.sparse.csr_matrix`
         Contains membership values for each sample in each consensus cluster.
-    """    
+    """
     # Identify cluster vertices
     clusters_vertex_ids = np.array(bg.vs.indices)[[x >= n for x in bg.vs.indices]]
     # Get unique cluster assignments
     cells_clusts = np.unique(clustering)
     # Create mapping from cluster ID to column index
     clust_id_to_idx = {clust_id: idx for idx, clust_id in enumerate(cells_clusts)}
-    
+
     # Initialize sparse matrix in LIL format (efficient for incremental construction)
     clust_occ_mat = lil_matrix((n, len(cells_clusts)), dtype=int)
-    
+
     # Process each cluster
     for cluster_id in cells_clusts:
         # Get the vertices corresponding to this cluster
         cluster_memb = [
             clusters_vertex_ids[i] for i, j in enumerate(clustering) if j == cluster_id
         ]
-        
+
         # Get the edges from cells to this cluster
         edges = bg.es.select(_source_in=cluster_memb)
-        
+
         if edges:
             # Get the source nodes and their counts
             sources = [e.source for e in edges]
             source_nodes, counts = np.unique(sources, return_counts=True)
-            
+
             # Update the sparse matrix for this cluster
             col_idx = clust_id_to_idx[cluster_id]
             clust_occ_mat[source_nodes, col_idx] = counts
-    
+
     # Convert to CSR format for efficient row operations
     clust_occ_csr = clust_occ_mat.tocsr()
-    
+
     # Find the max value index for each row (for hard assignments)
-    row_maxes = []
     hard_clusters = np.zeros(n, dtype=int)
-    
+
     # Process each row to find max value index
     for i in range(n):
         row = clust_occ_csr[i].toarray().flatten()
         if np.any(row > 0):  # Check if row has any non-zero values
             max_indices = np.where(row == row.max())[0]
             hard_clusters[i] = np.random.choice(max_indices)
-    
+
     # Create the soft membership matrix (normalize rows)
     row_sums = clust_occ_csr.sum(axis=1).A.flatten()
     # Avoid division by zero
     row_sums[row_sums == 0] = 1
-    
+
     # Create a diagonal matrix with 1/row_sum
     from scipy.sparse import diags
+
     row_sum_diag_inv = diags(1.0 / row_sums, 0)
-    
+
     # Multiply to normalize rows
     soft_membership_matrix = row_sum_diag_inv @ clust_occ_csr
-    
+
     return hard_clusters, soft_membership_matrix
 
+
 ########################################################################################################################################################
 # Main clustering
 ########################################################################################################################################################
 
+
 def run_base_clustering(args_in):
     """
     Run a single iteration of leiden clustering.
@@ -269,7 +274,7 @@ def run_base_clustering(args_in):
     ----------
     args_in : zip
         List containing each hyperparameter required for one round of
-        clustering (k, la_res, metric, subsample_size) as well as the 
+        clustering (k, la_res, metric, subsample_size) as well as the
         sparse boolean and the path to the zarr data store.
 
     Returns
@@ -284,7 +289,7 @@ def run_base_clustering(args_in):
         zarr_loc = args_in[0]
         hyperparams_ls = args_in[1]
         sparse = args_in[2]
-            
+
         z1 = zarr.open(zarr_loc, mode="r")
 
         if sparse:
diff --git a/src/eschr/tl/_prune_features.py b/src/eschr/tl/_prune_features.py
@@ -1,4 +1,5 @@
 """Feature selection and dimensionality reduction functions."""
+
 import time
 import warnings
 from typing import Optional
diff --git a/src/eschr/tl/_zarr_utils.py b/src/eschr/tl/_zarr_utils.py
@@ -1,9 +1,9 @@
 import os
-import traceback
-import warnings
+
 import zarr
 from scipy.sparse import coo_matrix
 
+
 def make_zarr_sparse(adata, zarr_loc):
     """
     Make zarr data store.
diff --git a/src/eschr/tl/main.py b/src/eschr/tl/main.py
@@ -1,24 +1,20 @@
 ## Import packages=============================================================
-import math
 import multiprocessing
 import os
-import random
 import time
-import traceback
 import warnings
 from itertools import repeat
 
 import numpy as np
 import pandas as pd
-import zarr
 from scipy.sparse import coo_matrix, csr_matrix, hstack
 from scipy.spatial.distance import pdist, squareform
 from sklearn import metrics
 
-from ._zarr_utils import (make_zarr_sparse, make_zarr_dense)
-from ._clustering import (run_base_clustering, consensus_cluster_leiden)
+from ._clustering import consensus_cluster_leiden, run_base_clustering
 from ._prune_features import (  # ADD BACK PRECEDING DOTS
     calc_highly_variable_genes, calc_pca)
+from ._zarr_utils import make_zarr_dense, make_zarr_sparse
 
 ## End Import packages section=================================================
 
@@ -182,7 +178,7 @@ def ensemble(
     out = parmap(run_base_clustering, args, nprocs=nprocs)
 
     try:
-        clust_out = hstack(out)  
+        clust_out = hstack(out)
     except Exception:
         print(
             "consensus_cluster.py, line 599, in ensemble: clust_out = hstack(out[:,0])"
@@ -250,7 +246,11 @@ def consensus(n, bg, nprocs):
     finish_time = time.perf_counter()
     print(f"Consensus clustering finished in {finish_time-start_time} seconds")
 
-    return hard_clusters, soft_membership_matrix, all_clusterings_df.to_numpy(dtype=np.uint16) 
+    return (
+        hard_clusters,
+        soft_membership_matrix,
+        all_clusterings_df.to_numpy(dtype=np.uint16),
+    )
 
 
 def consensus_cluster(
@@ -346,7 +346,7 @@ def consensus_cluster(
     la_res_range = (
         int(la_res_range[0]),
         int(la_res_range[1]),
-    )  
+    )
     bipartite = ensemble(
         zarr_loc=zarr_loc,
         reduction=reduction,
diff --git a/tests/test_eschr.py b/tests/test_eschr.py
@@ -72,6 +72,7 @@ def test_make_zarr_custom_path(adata, zarr_loc):
     assert os.path.exists(zarr_loc)
     shutil.rmtree(zarr_loc)
 
+
 @pytest.mark.skip(reason="Update to be testing zarr dense data structure")
 def test_make_zarr_content(adata, zarr_loc):
     es.tl._zarr_utils.make_zarr_dense(adata, zarr_loc)
@@ -242,7 +243,7 @@ def test_get_hard_soft_clusters_single_cluster(setup_data):
     )
 
     soft_membership_matrix = soft_membership_matrix.toarray()
-    
+
     # Test hard cluster assignments
     assert np.all(hard_clusters == 0)
 
@@ -261,14 +262,15 @@ def test_consensus_cluster_leiden(bipartite_graph_array):
         resolution,
     ) = es.tl._clustering.consensus_cluster_leiden(in_args)
 
-    #assert isinstance(hard_clusters, pd.Categorical)
+    # assert isinstance(hard_clusters, pd.Categorical)
     assert len(hard_clusters) == n
     assert isinstance(soft_membership_matrix, csr_matrix)
     assert soft_membership_matrix.shape[0] == n
     assert soft_membership_matrix.shape[1] >= np.unique(hard_clusters).shape[0]
     assert np.allclose(soft_membership_matrix.sum(axis=1), 1.0)
     assert resolution == 1.0
 
+
 # Test ensemble function
 @pytest.fixture
 def ensemble_args(zarr_loc_static):
@@ -280,83 +282,83 @@ def ensemble_args(zarr_loc_static):
         "k_range": (15, 150),
         "la_res_range": (25, 175),
         "nprocs": 1,
-        "sparse": False
+        "sparse": False,
     }
 
+
 def test_ensemble(ensemble_args):
     result = es.tl.main.ensemble(**ensemble_args)
     assert isinstance(result, coo_matrix)
-    
+
     # The shape should be (n_cells, n_clusters_total)
     z1 = zarr.open(ensemble_args["zarr_loc"], mode="r")
     n_cells = z1["X"].shape[0]
     assert result.shape[0] == n_cells
-    
+
     # There should be at least one cluster for each member in the ensemble
     assert result.shape[1] >= 3
-    
+
+
 # Test consensus function
 @pytest.fixture
 def consensus_args(bipartite_graph_array):
     n = np.max(bipartite_graph_array.row) + 1
-    return {
-        "n": n,
-        "bg": bipartite_graph_array,
-        "nprocs": 1
-    }
+    return {"n": n, "bg": bipartite_graph_array, "nprocs": 1}
+
 
 def test_consensus(consensus_args):
-    hard_clusters, soft_membership_matrix, all_clusterings = es.tl.main.consensus(**consensus_args)
-    
+    hard_clusters, soft_membership_matrix, all_clusterings = es.tl.main.consensus(
+        **consensus_args
+    )
+
     # Check hard clusters
     assert len(hard_clusters) == consensus_args["n"]
     assert isinstance(hard_clusters, np.ndarray)
-    
+
     # Check soft membership matrix
     assert soft_membership_matrix.shape[0] == consensus_args["n"]
     assert np.allclose(soft_membership_matrix.sum(axis=1), 1.0)
-    
+
     # Check all_clusterings
     assert isinstance(all_clusterings, np.ndarray)
     assert all_clusterings.shape[0] == consensus_args["n"]
     # Should have multiple resolutions tested
     assert all_clusterings.shape[1] > 1
-    
+
+
 # Test main consensus_cluster function
 def test_consensus_cluster_basic(adata, zarr_loc):
-    
+
     # Run the full pipeline with minimal parameters
     result_adata = es.tl.consensus_cluster(
-        adata, 
-        zarr_loc=zarr_loc,
-        ensemble_size=3,  # Small for testing
-        nprocs=1
+        adata, zarr_loc=zarr_loc, ensemble_size=3, nprocs=1  # Small for testing
     )
-    
+
     # Check that results are added to adata object
     assert "hard_clusters" in result_adata.obs
     assert "soft_membership_matrix" in result_adata.obsm
     assert "uncertainty_score" in result_adata.obs
     assert "bipartite" in result_adata.obsm
-    
+
     # Check shapes
     assert len(result_adata.obs["hard_clusters"]) == adata.shape[0]
     assert result_adata.obsm["soft_membership_matrix"].shape[0] == adata.shape[0]
-    
+
     # Check that multiresolution results are not included by default
     assert "multiresolution_clusters" not in result_adata.obsm
 
+
 def test_consensus_cluster_with_multires(adata, zarr_loc):
-    
+
     # Run with return_multires=True
     result_adata = es.tl.consensus_cluster(
-        adata, 
+        adata,
         zarr_loc=zarr_loc,
         ensemble_size=3,  # Small for testing
         nprocs=1,
-        return_multires=True
+        return_multires=True,
     )
-    
+
     # Check that multiresolution results are included
     assert "multiresolution_clusters" in result_adata.obsm
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -1,4 +1,5 @@` |
| `1` | `1` | `"""Feature selection and dimensionality reduction functions."""` |
|  | `2` | `+` |
| `2` | `3` | `import time` |
| `3` | `4` | `import warnings` |
| `4` | `5` | `from typing import Optional` |