diff --git a/hotspot/__init__.py b/hotspot/__init__.py index 78588d3..4b05d90 100644 --- a/hotspot/__init__.py +++ b/hotspot/__init__.py @@ -1,6 +1,7 @@ name = "hotspot" from .hotspot import Hotspot +from .gpu import is_gpu_available # https://github.com/python-poetry/poetry/pull/2366#issuecomment-652418094 # https://github.com/python-poetry/poetry/issues/144#issuecomment-623927302 diff --git a/hotspot/gpu.py b/hotspot/gpu.py new file mode 100644 index 0000000..7735312 --- /dev/null +++ b/hotspot/gpu.py @@ -0,0 +1,66 @@ +""" +GPU utilities for Hotspot. Provides CuPy availability checks and +sparse matrix construction helpers used by the GPU paths in +local_stats.py and local_stats_pairs.py. +""" + +import numpy as np + +try: + import cupy as cp + import cupyx.scipy.sparse as cp_sparse + + HAS_CUPY = True +except ImportError: + HAS_CUPY = False + + +def is_gpu_available(): + """Check whether GPU acceleration is available (CuPy installed + CUDA device present).""" + if not HAS_CUPY: + return False + try: + cp.cuda.Device(0).compute_capability + return True + except Exception: + return False + + +def _require_gpu(): + """Raise an informative error if GPU is not available.""" + if not HAS_CUPY: + raise ImportError( + "CuPy is required for GPU acceleration. " + "Install with: pip install hotspotsc[gpu] " + "(or install cupy separately for your CUDA version, " + "e.g. pip install cupy-cuda12x)" + ) + try: + cp.cuda.Device(0).compute_capability + except Exception as e: + raise RuntimeError( + "No CUDA-capable GPU device found. " + "GPU acceleration requires an NVIDIA GPU with CUDA support." + ) from e + + +def _build_sparse_weight_matrix(neighbors, weights, shape, square=False): + """Build a CuPy sparse CSR matrix from neighbor/weight arrays. + + W[i, neighbors[i,k]] = weights[i,k] for all i, k where weights[i,k] != 0. + If square=True, uses weights^2 instead (for moment computations). 
+ """ + N, K = neighbors.shape + rows = np.repeat(np.arange(N, dtype=np.int32), K) + cols = neighbors.ravel().astype(np.int32) + vals = weights.ravel().astype(np.float64) + if square: + vals = vals ** 2 + + mask = vals != 0 + rows, cols, vals = rows[mask], cols[mask], vals[mask] + + return cp_sparse.csr_matrix( + (cp.asarray(vals), (cp.asarray(rows), cp.asarray(cols))), + shape=shape, + ) diff --git a/hotspot/hotspot.py b/hotspot/hotspot.py index 01dc45b..0f80467 100644 --- a/hotspot/hotspot.py +++ b/hotspot/hotspot.py @@ -16,6 +16,7 @@ from . import modules from .plots import local_correlation_plot +from .gpu import is_gpu_available from tqdm import tqdm @@ -29,6 +30,7 @@ def __init__( distances_obsp_key=None, tree=None, umi_counts_obs_key=None, + use_gpu=False, ): """Initialize a Hotspot object for analysis @@ -60,6 +62,10 @@ def __init__( umi_counts_obs_key : str Total umi count per cell. Used as a size factor. If omitted, the sum over genes in the counts matrix is used + use_gpu : bool, optional + Whether to use GPU acceleration via CuPy for embarrassingly + parallel operations. Requires CuPy and a CUDA-capable GPU. + Default is False. """ counts = self._counts_from_anndata(adata, layer_key) distances = ( @@ -166,6 +172,14 @@ def __init__( self.linkage = None self.module_scores = None + if use_gpu: + if not is_gpu_available(): + raise RuntimeError( + "use_gpu=True but GPU is not available. " + "Ensure CuPy is installed and a CUDA GPU is present." 
+ ) + self.use_gpu = use_gpu + @classmethod def legacy_init( cls, @@ -416,6 +430,7 @@ def _compute_hotspot(self, jobs=1): genes=self.adata.var_names, centered=True, jobs=jobs, + use_gpu=self.use_gpu, ) self.results = results @@ -486,6 +501,7 @@ def compute_local_correlations(self, genes, jobs=1): self.umi_counts, self.model, jobs=jobs, + use_gpu=self.use_gpu, ) self.local_correlation_c = lc diff --git a/hotspot/local_stats.py b/hotspot/local_stats.py index 8d58d0a..a8813b8 100644 --- a/hotspot/local_stats.py +++ b/hotspot/local_stats.py @@ -182,9 +182,16 @@ def initializer(neighbors, weights, num_umi, model, centered, Wtot2, D): g_D = D def compute_hs( - counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1 + counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1, + use_gpu=False ): + if use_gpu: + results = _compute_hs_gpu( + counts, neighbors, weights, num_umi, model, genes, centered + ) + return _postprocess_results(results) + neighbors = neighbors.values weights = weights.values num_umi = num_umi.values @@ -202,8 +209,8 @@ def data_iter(): if jobs > 1: with multiprocessing.Pool( - processes=jobs, - initializer=initializer, + processes=jobs, + initializer=initializer, initargs=[neighbors, weights, num_umi, model, centered, Wtot2, D] ) as pool: results = list( @@ -211,7 +218,7 @@ def data_iter(): pool.imap( _map_fun_parallel, data_iter() - ), + ), total=counts.shape[0] ) ) @@ -226,26 +233,23 @@ def _map_fun(vals): results = pd.DataFrame(results, index=genes, columns=["G", "EG", "stdG", "Z", "C"]) + return _postprocess_results(results) + + +def _postprocess_results(results): results["Pval"] = norm.sf(results["Z"].values) results["FDR"] = multipletests(results["Pval"], method="fdr_bh")[1] - results = results.sort_values("Z", ascending=False) results.index.name = "Gene" - - results = results[["C", "Z", "Pval", "FDR"]] # Remove other columns - + results = results[["C", "Z", "Pval", "FDR"]] return results -def 
_compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D): - """ - Note, since this is an inner function, for parallelization to work well - none of the contents of the function can use MKL or OPENBLAS threads. - Or else we open too many. Because of this, some simple numpy operations - are re-implemented using numba instead as it's difficult to control - the number of threads in numpy after it's imported - """ +def _fit_gene(vals, model, num_umi): + """Fit a gene model and return (vals, mu, var, x2). + For the bernoulli model, vals is binarized before fitting. + """ if model == "bernoulli": vals = (vals > 0).astype("double") mu, var, x2 = bernoulli_model.fit_gene_model(vals, num_umi) @@ -256,7 +260,20 @@ def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, elif model == "none": mu, var, x2 = none_model.fit_gene_model(vals, num_umi) else: - raise Exception("Invalid Model: {}".format(model)) + raise ValueError("Invalid Model: {}".format(model)) + return vals, mu, var, x2 + + +def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D): + """ + Note, since this is an inner function, for parallelization to work well + none of the contents of the function can use MKL or OPENBLAS threads. + Or else we open too many. Because of this, some simple numpy operations + are re-implemented using numba instead as it's difficult to control + the number of threads in numpy after it's imported + """ + + vals, mu, var, x2 = _fit_gene(vals, model, num_umi) if centered: vals = center_values(vals, mu, var) @@ -289,3 +306,112 @@ def _map_fun_parallel(vals): return _compute_hs_inner( vals, g_neighbors, g_weights, g_num_umi, g_model, g_centered, g_Wtot2, g_D ) + + +def _local_cov_weights_gpu(vals_gpu, W): + """GPU batch of local_cov_weights: G[g] = vals[g] . 
(W @ vals[g]) for all genes.""" + smoothed_T = W @ vals_gpu.T + return (vals_gpu * smoothed_T.T).sum(axis=1) + + +def _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq): + """GPU batch of compute_moments_weights for all genes at once.""" + # EG[g] = mu[g] . (W @ mu[g]) + EG = (mu_gpu * (W @ mu_gpu.T).T).sum(axis=1) + + # t1[g] = (W + W.T) @ mu[g], t2[g] = (W_sq + W_sq.T) @ mu[g]^2 + W_sym = W + W.T + W_sq_sym = W_sq + W_sq.T + mu2_gpu = mu_gpu ** 2 + + t1_T = W_sym @ mu_gpu.T + t2_T = W_sq_sym @ mu2_gpu.T + + # Contribution 1: sum_i (x2[i] - mu[i]^2) * (t1[i]^2 - t2[i]) + diff_var = (x2_gpu - mu2_gpu).T + eg2_c1 = (diff_var * (t1_T ** 2 - t2_T)).sum(axis=0) + + # Contribution 2: sum_{edges} w^2 * (x2[i]*x2[j] - mu[i]^2*mu[j]^2) + eg2_c2 = (x2_gpu.T * (W_sq @ x2_gpu.T)).sum(axis=0) + eg2_c2 -= (mu2_gpu.T * (W_sq @ mu2_gpu.T)).sum(axis=0) + + EG2 = eg2_c1 + eg2_c2 + EG ** 2 + return EG, EG2 + + +def _compute_local_cov_max_gpu(D_gpu, vals_gpu): + """GPU batch of compute_local_cov_max: G_max[g] = sum_i D[i]*vals[g,i]^2 / 2.""" + return (D_gpu * vals_gpu ** 2).sum(axis=1) / 2 + + +def _compute_hs_gpu(counts, neighbors, weights, num_umi, model, genes, centered): + """ + GPU-accelerated version of _compute_hs_inner, batched over all genes. + All genes are processed in parallel via sparse matrix multiplication. 
+ """ + import cupy as cp + from .gpu import _require_gpu, _build_sparse_weight_matrix + + _require_gpu() + + neighbors_np = neighbors.values + weights_np = weights.values + num_umi_np = num_umi.values + + N_genes = counts.shape[0] + N_cells = counts.shape[1] + + D = compute_node_degree(neighbors_np, weights_np) + Wtot2 = (weights_np ** 2).sum() + + if issparse(counts): + counts_dense = counts.toarray() + else: + counts_dense = np.asarray(counts) + + all_vals = np.zeros((N_genes, N_cells), dtype="double") + if not centered: + all_mu = np.zeros((N_genes, N_cells), dtype="double") + all_x2 = np.zeros((N_genes, N_cells), dtype="double") + + for i in range(N_genes): + raw = counts_dense[i].astype("double") + + vals, mu, var, x2 = _fit_gene(raw, model, num_umi_np) + if centered: + vals = center_values(vals, mu, var) + else: + all_mu[i] = mu + all_x2[i] = x2 + all_vals[i] = vals + + vals_gpu = cp.asarray(all_vals) + D_gpu = cp.asarray(D) + W = _build_sparse_weight_matrix(neighbors_np, weights_np, shape=(N_cells, N_cells)) + + G_stats = _local_cov_weights_gpu(vals_gpu, W) + + if centered: + EG = cp.zeros(N_genes, dtype="double") + EG2 = cp.full(N_genes, Wtot2, dtype="double") + else: + mu_gpu = cp.asarray(all_mu) + x2_gpu = cp.asarray(all_x2) + W_sq = _build_sparse_weight_matrix( + neighbors_np, weights_np, shape=(N_cells, N_cells), square=True + ) + EG, EG2 = _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq) + + stdG = (EG2 - EG * EG) ** 0.5 + Z = (G_stats - EG) / stdG + + G_max = _compute_local_cov_max_gpu(D_gpu, vals_gpu) + C = (G_stats - EG) / G_max + + return pd.DataFrame( + { + "G": cp.asnumpy(G_stats), "EG": cp.asnumpy(EG), + "stdG": cp.asnumpy(stdG), "Z": cp.asnumpy(Z), "C": cp.asnumpy(C), + }, + index=genes, + ) diff --git a/hotspot/local_stats_pairs.py b/hotspot/local_stats_pairs.py index 035dfa0..4d1cde5 100644 --- a/hotspot/local_stats_pairs.py +++ b/hotspot/local_stats_pairs.py @@ -9,7 +9,7 @@ from . import bernoulli_model from . 
import normal_model from . import none_model -from .local_stats import compute_local_cov_max +from .local_stats import compute_local_cov_max, _fit_gene from .knn import compute_node_degree from .utils import center_values @@ -449,25 +449,7 @@ def _compute_hs_pairs_inner(row_i, counts, neighbors, weights, num_umi, lc_out = np.zeros(counts.shape[0]) lc_z_out = np.zeros(counts.shape[0]) - if model == 'bernoulli': - vals_x = (vals_x > 0).astype('double') - mu_x, var_x, x2_x = bernoulli_model.fit_gene_model( - vals_x, num_umi) - - elif model == 'danb': - mu_x, var_x, x2_x = danb_model.fit_gene_model( - vals_x, num_umi) - - elif model == 'normal': - mu_x, var_x, x2_x = normal_model.fit_gene_model( - vals_x, num_umi) - - elif model == 'none': - mu_x, var_x, x2_x = none_model.fit_gene_model( - vals_x, num_umi) - - else: - raise Exception("Invalid Model: {}".format(model)) + vals_x, mu_x, var_x, x2_x = _fit_gene(vals_x, model, num_umi) if centered: vals_x = center_values(vals_x, mu_x, var_x) @@ -479,25 +461,7 @@ def _compute_hs_pairs_inner(row_i, counts, neighbors, weights, num_umi, vals_y = counts[row_j] - if model == 'bernoulli': - vals_y = (vals_y > 0).astype('double') - mu_y, var_y, x2_y = bernoulli_model.fit_gene_model( - vals_y, num_umi) - - elif model == 'danb': - mu_y, var_y, x2_y = danb_model.fit_gene_model( - vals_y, num_umi) - - elif model == 'normal': - mu_y, var_y, x2_y = normal_model.fit_gene_model( - vals_y, num_umi) - - elif model == 'none': - mu_x, var_x, x2_x = none_model.fit_gene_model( - vals_x, num_umi) - - else: - raise Exception("Invalid Model: {}".format(model)) + vals_y, mu_y, var_y, x2_y = _fit_gene(vals_y, model, num_umi) if centered: vals_y = center_values(vals_y, mu_y, var_y) @@ -744,7 +708,12 @@ def initializer3(counts, neighbors, weights, eg2s): def compute_hs_pairs_centered_cond(counts, neighbors, weights, - num_umi, model, jobs=1): + num_umi, model, jobs=1, use_gpu=False): + + if use_gpu: + return _compute_hs_pairs_centered_cond_gpu( + 
counts, neighbors, weights, num_umi, model + ) genes = counts.index @@ -876,3 +845,68 @@ def compute_local_cov_pairs_max(node_degrees, counts): result = gene_maxs.reshape((-1, 1)) + gene_maxs.reshape((1, -1)) result = result / 2 return result + + +def _conditional_eg2_gpu(X, W_sym): + """GPU batch of conditional_eg2: eg2[g] = ||(W + W.T) @ x[g]||^2.""" + t1x_T = W_sym @ X.T + return (t1x_T ** 2).sum(axis=0) + + +def _local_cov_pair_all_gpu(X, W): + """GPU batch of local_cov_pair for ALL gene pairs via dense matmul. + + Returns the full G x G matrix of lc values (= local_cov_pair * 2). + Diagonal is zeroed (no self-pairs). + """ + import cupy as cp + smoothed_T = W @ X.T # (N, G) + M = X @ smoothed_T # (G, G): M[a,b] = x_a . (W @ x_b) + lc_matrix = M + M.T # symmetrize: lc[a,b] = x_a.(Wx_b) + x_b.(Wx_a) + cp.fill_diagonal(lc_matrix, 0) + return lc_matrix + + +def _compute_hs_pairs_centered_cond_gpu(counts, neighbors, weights, num_umi, model): + """ + GPU-accelerated version of compute_hs_pairs_centered_cond, batched + over all gene pairs via dense matmul: M = X @ (W @ X.T), lc = M + M.T + """ + import cupy as cp + from .gpu import _require_gpu, _build_sparse_weight_matrix + + _require_gpu() + + genes = counts.index + counts_np = counts.values + neighbors_np = neighbors.values + weights_np = weights.values + num_umi_np = num_umi.values + N_cells = counts_np.shape[1] + + D = compute_node_degree(neighbors_np, weights_np) + + centered_counts = create_centered_counts(counts_np, model, num_umi_np) + + X = cp.asarray(centered_counts) + D_gpu = cp.asarray(D) + W = _build_sparse_weight_matrix(neighbors_np, weights_np, shape=(N_cells, N_cells)) + W_sym = W + W.T + + eg2s = _conditional_eg2_gpu(X, W_sym) + + lc_matrix = _local_cov_pair_all_gpu(X, W) + + std_genes = eg2s ** 0.5 + Z_xy = lc_matrix / std_genes[:, None] + Z_yx = lc_matrix / std_genes[None, :] + lc_zs = cp.where(cp.abs(Z_xy) < cp.abs(Z_yx), Z_xy, Z_yx) + + gene_maxs = (D_gpu * X ** 2).sum(axis=1) / 2 + lc_maxs = 
(gene_maxs[:, None] + gene_maxs[None, :]) / 2 + lcs = lc_matrix / lc_maxs + + lcs_df = pd.DataFrame(cp.asnumpy(lcs), index=genes, columns=genes) + lc_zs_df = pd.DataFrame(cp.asnumpy(lc_zs), index=genes, columns=genes) + + return lcs_df, lc_zs_df diff --git a/pyproject.toml b/pyproject.toml index 173289e..588267d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,15 +51,23 @@ myst-nb = {version = "*", optional = true} sphinx = {version = ">=4.1", optional = true} ipython = {version = "*", optional = true} scanpy = {version = "*", optional = true} +cupy-cuda12x = {version = ">=12.0", optional = true} [tool.poetry.extras] test = ["pytest", "scanpy"] +gpu = ["cupy-cuda12x"] docs=["sphinx-book-theme", "myst-nb", "sphinx", "ipython"] [tool.poetry.dev-dependencies] +[tool.pytest.ini_options] +markers = [ + "gpu: tests requiring a CUDA GPU and CuPy (deselected by default, run with: pytest -m gpu)", +] +addopts = "-m 'not gpu'" + [build-system] build-backend = "poetry.masonry.api" requires = [ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..a80d3fe --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,7 @@ +import pytest + + +def pytest_configure(config): + config.addinivalue_line( + "markers", "gpu: tests requiring a CUDA GPU and CuPy" + ) diff --git a/tests/test_gpu.py b/tests/test_gpu.py new file mode 100644 index 0000000..95a2885 --- /dev/null +++ b/tests/test_gpu.py @@ -0,0 +1,507 @@ +""" +GPU equivalence and performance tests for Hotspot. + +These tests verify that GPU-accelerated computations produce results +numerically equivalent to their CPU counterparts, and benchmark speedup. + +All tests are marked with @pytest.mark.gpu and are excluded from CI by +default (no GPU resources). 
Run explicitly with: + + pytest -m gpu -v + +or to include GPU tests alongside regular tests: + + pytest -m '' -v + +Tolerance rationale: + GPU sparse matmul and CPU sequential Numba loops accumulate + floating-point sums in different order, yielding ~1e-7 relative + differences at float64 precision. We use rtol=1e-6 (10x margin). + +Note: + All tests share a single KNN graph between CPU and GPU runs to + isolate the GPU computation from pynndescent's stochastic KNN + construction. +""" + +import time + +import numpy as np +import pandas as pd +import pytest +import anndata +from scipy.sparse import csc_matrix + +from hotspot import Hotspot +from hotspot.gpu import is_gpu_available + +# Skip the entire module if no GPU is available +pytestmark = pytest.mark.gpu + +GPU_AVAILABLE = is_gpu_available() +skip_no_gpu = pytest.mark.skipif(not GPU_AVAILABLE, reason="No GPU available") + +# Tolerance for GPU vs CPU comparison (see module docstring) +RTOL = 1e-6 +ATOL = 1e-10 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_test_data(n_cells, n_genes, n_dim=10, seed=42): + """Create reproducible test data for GPU equivalence tests.""" + rng = np.random.RandomState(seed) + + latent = rng.normal(size=(n_cells, n_dim)) + latent_df = pd.DataFrame( + latent, + index=["Cell{}".format(i + 1) for i in range(n_cells)], + ) + + umi_counts = np.floor( + np.exp(rng.normal(loc=np.log(2000), scale=0.3, size=n_cells)) + ) + + gene_exp = rng.rand(n_genes, n_cells) * 10 + 0.1 + gene_names = ["Gene{}".format(i + 1) for i in range(n_genes)] + gene_exp_df = pd.DataFrame(gene_exp, index=gene_names, columns=latent_df.index) + + adata = anndata.AnnData(gene_exp_df.T) + adata.layers["sparse"] = csc_matrix(adata.X) + adata.obsm["latent"] = latent + adata.obs["umi_counts"] = umi_counts + + return adata, gene_names + + +def _make_cpu_gpu_pair(adata, model, 
layer_key=None, n_neighbors=30): + """Create CPU and GPU Hotspot objects sharing the same KNN graph.""" + hs_cpu = Hotspot( + adata, + model=model, + latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", + layer_key=layer_key, + use_gpu=False, + ) + hs_cpu.create_knn_graph(False, n_neighbors=n_neighbors) + + hs_gpu = Hotspot( + adata, + model=model, + latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", + layer_key=layer_key, + use_gpu=True, + ) + # Share the same KNN graph so we compare GPU vs CPU computation only + hs_gpu.neighbors = hs_cpu.neighbors + hs_gpu.weights = hs_cpu.weights + + return hs_cpu, hs_gpu + + +# --------------------------------------------------------------------------- +# compute_autocorrelations: GPU vs CPU equivalence (per model) +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "bernoulli", "normal", "none"]) +def test_compute_autocorrelations_equivalence(model): + """GPU and CPU compute_autocorrelations produce equivalent results.""" + adata, genes = _make_test_data(n_cells=200, n_genes=20) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, model) + + results_cpu = hs_cpu.compute_autocorrelations() + results_gpu = hs_gpu.compute_autocorrelations() + + # Align index ordering + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + + assert results_cpu.shape == results_gpu.shape + + np.testing.assert_allclose( + results_gpu["Z"].values, + results_cpu["Z"].values, + rtol=RTOL, + atol=ATOL, + err_msg=f"Z-score mismatch for model={model}", + ) + + np.testing.assert_allclose( + results_gpu["C"].values, + results_cpu["C"].values, + rtol=RTOL, + atol=ATOL, + err_msg=f"C value mismatch for model={model}", + ) + + np.testing.assert_allclose( + results_gpu["Pval"].values, + results_cpu["Pval"].values, + rtol=1e-4, + atol=1e-15, + err_msg=f"P-value mismatch for model={model}", + ) + + 
np.testing.assert_allclose( + results_gpu["FDR"].values, + results_cpu["FDR"].values, + rtol=1e-4, + atol=1e-15, + err_msg=f"FDR mismatch for model={model}", + ) + + +# --------------------------------------------------------------------------- +# compute_local_correlations: GPU vs CPU equivalence (per model) +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "bernoulli", "normal", "none"]) +def test_compute_local_correlations_equivalence(model): + """GPU and CPU compute_local_correlations produce equivalent results.""" + adata, genes = _make_test_data(n_cells=200, n_genes=15) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, model) + + hs_cpu.compute_autocorrelations() + lcz_cpu = hs_cpu.compute_local_correlations(genes) + + hs_gpu.compute_autocorrelations() + lcz_gpu = hs_gpu.compute_local_correlations(genes) + + assert lcz_cpu.shape == lcz_gpu.shape + + np.testing.assert_allclose( + lcz_gpu.values, + lcz_cpu.values, + rtol=RTOL, + atol=ATOL, + err_msg=f"Local correlation Z mismatch for model={model}", + ) + + lc_cpu = hs_cpu.local_correlation_c + lc_gpu = hs_gpu.local_correlation_c + np.testing.assert_allclose( + lc_gpu.values, + lc_cpu.values, + rtol=RTOL, + atol=ATOL, + err_msg=f"Local correlation C mismatch for model={model}", + ) + + # Verify symmetry of GPU output + np.testing.assert_allclose( + lcz_gpu.values, lcz_gpu.values.T, rtol=1e-12, atol=1e-12, + err_msg="GPU Z-score matrix should be symmetric", + ) + np.testing.assert_allclose( + lc_gpu.values, lc_gpu.values.T, rtol=1e-12, atol=1e-12, + err_msg="GPU correlation matrix should be symmetric", + ) + + +# --------------------------------------------------------------------------- +# Sparse input equivalence +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "normal"]) +def test_sparse_input_equivalence(model): + """GPU produces 
identical results with sparse vs dense input.""" + adata, genes = _make_test_data(n_cells=150, n_genes=12) + + # Dense + hs_dense = Hotspot( + adata, model=model, latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", layer_key=None, use_gpu=True, + ) + hs_dense.create_knn_graph(False, n_neighbors=30) + results_dense = hs_dense.compute_autocorrelations() + + # Sparse, sharing the same KNN graph + hs_sparse = Hotspot( + adata, model=model, latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", layer_key="sparse", use_gpu=True, + ) + hs_sparse.neighbors = hs_dense.neighbors + hs_sparse.weights = hs_dense.weights + results_sparse = hs_sparse.compute_autocorrelations() + + results_dense = results_dense.loc[sorted(results_dense.index)] + results_sparse = results_sparse.loc[sorted(results_sparse.index)] + + np.testing.assert_allclose( + results_sparse["Z"].values, results_dense["Z"].values, + rtol=RTOL, atol=ATOL, err_msg="Sparse vs dense GPU results differ", + ) + + +# --------------------------------------------------------------------------- +# Different data sizes +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize( + "n_cells,n_genes,n_neighbors", + [(50, 5, 10), (100, 10, 20), (300, 25, 30)], +) +def test_varying_sizes(n_cells, n_genes, n_neighbors): + """GPU matches CPU across different dataset sizes.""" + adata, genes = _make_test_data(n_cells=n_cells, n_genes=n_genes) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "normal", n_neighbors=n_neighbors) + + results_cpu = hs_cpu.compute_autocorrelations() + results_gpu = hs_gpu.compute_autocorrelations() + + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + + np.testing.assert_allclose( + results_gpu["Z"].values, results_cpu["Z"].values, + rtol=RTOL, atol=ATOL, + err_msg=f"Size {n_cells}x{n_genes} k={n_neighbors}: Z mismatch", + ) + + +# 
--------------------------------------------------------------------------- +# GPU determinism +# --------------------------------------------------------------------------- + + +@skip_no_gpu +def test_gpu_determinism(): + """Running GPU computation twice on the same graph produces identical results.""" + adata, genes = _make_test_data(n_cells=150, n_genes=15) + + hs = Hotspot( + adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True, + ) + hs.create_knn_graph(False, n_neighbors=30) + + # First run + r1 = hs.compute_autocorrelations() + lcz1 = hs.compute_local_correlations(genes) + + # Second run (same object, same graph) + r2 = hs.compute_autocorrelations() + lcz2 = hs.compute_local_correlations(genes) + + r1 = r1.loc[sorted(r1.index)] + r2 = r2.loc[sorted(r2.index)] + + # Allow tiny float64 slack (atol=1e-15, a few ULPs at unit scale) for GPU parallel reduction non-determinism + np.testing.assert_allclose(r1.values, r2.values, rtol=0, atol=1e-15, + err_msg="GPU autocorrelation results are not deterministic") + np.testing.assert_allclose(lcz1.values, lcz2.values, rtol=0, atol=1e-15, + err_msg="GPU local correlation results are not deterministic") + + +# --------------------------------------------------------------------------- +# Full pipeline equivalence (end-to-end including modules) +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "bernoulli", "normal", "none"]) +def test_full_pipeline_equivalence(model): + """Full Hotspot pipeline produces equivalent results with GPU and CPU.""" + adata, genes = _make_test_data(n_cells=200, n_genes=20) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, model) + + # CPU pipeline + hs_cpu.compute_autocorrelations() + hs_cpu.compute_local_correlations(genes) + hs_cpu.create_modules(min_gene_threshold=2, fdr_threshold=1) + hs_cpu.calculate_module_scores() + + # GPU pipeline + hs_gpu.compute_autocorrelations() + 
hs_gpu.compute_local_correlations(genes) + hs_gpu.create_modules(min_gene_threshold=2, fdr_threshold=1) + hs_gpu.calculate_module_scores() + + r_cpu = hs_cpu.results.loc[sorted(hs_cpu.results.index)] + r_gpu = hs_gpu.results.loc[sorted(hs_gpu.results.index)] + np.testing.assert_allclose(r_gpu["Z"].values, r_cpu["Z"].values, rtol=RTOL, atol=ATOL) + + np.testing.assert_allclose( + hs_gpu.local_correlation_z.values, hs_cpu.local_correlation_z.values, + rtol=RTOL, atol=ATOL, + ) + + pd.testing.assert_series_equal( + hs_gpu.modules.sort_index(), hs_cpu.modules.sort_index(), check_names=True, + ) + + np.testing.assert_allclose( + hs_gpu.module_scores.values, hs_cpu.module_scores.values, + rtol=1e-6, atol=1e-6, + ) + + +# --------------------------------------------------------------------------- +# Diagonal of local correlation matrix should be zero +# --------------------------------------------------------------------------- + + +@skip_no_gpu +def test_local_correlation_diagonal_zero(): + """GPU local correlation matrix diagonal should be zero.""" + adata, genes = _make_test_data(n_cells=100, n_genes=10) + hs = Hotspot( + adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True, + ) + hs.create_knn_graph(False, n_neighbors=30) + hs.compute_autocorrelations() + lcz = hs.compute_local_correlations(genes) + + np.testing.assert_array_equal(np.diag(lcz.values), np.zeros(len(genes))) + np.testing.assert_array_equal(np.diag(hs.local_correlation_c.values), np.zeros(len(genes))) + + +# --------------------------------------------------------------------------- +# Error handling +# --------------------------------------------------------------------------- + + +def test_use_gpu_without_gpu(): + """Setting use_gpu=True without GPU raises RuntimeError.""" + if GPU_AVAILABLE: + pytest.skip("GPU is available, cannot test missing-GPU error") + + adata, genes = _make_test_data(n_cells=50, n_genes=5) + with pytest.raises(RuntimeError, match="GPU 
is not available"): + Hotspot( + adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True, + ) + + +# --------------------------------------------------------------------------- +# Benchmarks: quantify GPU speedup +# --------------------------------------------------------------------------- + + +@skip_no_gpu +def test_benchmark_compute_autocorrelations(): + """Benchmark GPU vs CPU speed for compute_autocorrelations.""" + adata, genes = _make_test_data(n_cells=500, n_genes=100, seed=123) + + # Warmup GPU + hs_w = Hotspot(adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True) + hs_w.create_knn_graph(False, n_neighbors=30) + hs_w.compute_autocorrelations() + + # Shared graph for fair comparison + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "normal") + + t0 = time.perf_counter() + results_cpu = hs_cpu.compute_autocorrelations() + cpu_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + results_gpu = hs_gpu.compute_autocorrelations() + gpu_time = time.perf_counter() - t0 + + speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf") + print(f"\n--- compute_autocorrelations benchmark (500 cells, 100 genes) ---") + print(f" CPU time: {cpu_time:.3f}s") + print(f" GPU time: {gpu_time:.3f}s") + print(f" Speedup: {speedup:.1f}x") + + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + np.testing.assert_allclose( + results_gpu["Z"].values, results_cpu["Z"].values, rtol=RTOL, atol=ATOL, + ) + + +@skip_no_gpu +def test_benchmark_compute_local_correlations(): + """Benchmark GPU vs CPU speed for compute_local_correlations.""" + adata, genes = _make_test_data(n_cells=500, n_genes=50, seed=456) + + # Warmup + hs_w = Hotspot(adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True) + hs_w.create_knn_graph(False, n_neighbors=30) + hs_w.compute_autocorrelations() + 
hs_w.compute_local_correlations(genes) + + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "normal") + hs_cpu.compute_autocorrelations() + hs_gpu.compute_autocorrelations() + + t0 = time.perf_counter() + lcz_cpu = hs_cpu.compute_local_correlations(genes) + cpu_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + lcz_gpu = hs_gpu.compute_local_correlations(genes) + gpu_time = time.perf_counter() - t0 + + speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf") + print(f"\n--- compute_local_correlations benchmark (500 cells, 50 genes) ---") + print(f" CPU time: {cpu_time:.3f}s") + print(f" GPU time: {gpu_time:.3f}s") + print(f" Speedup: {speedup:.1f}x") + + np.testing.assert_allclose(lcz_gpu.values, lcz_cpu.values, rtol=RTOL, atol=ATOL) + + +@skip_no_gpu +def test_benchmark_large_scale(): + """Benchmark at realistic scale: 1000 cells, 200 genes (danb model).""" + adata, genes = _make_test_data(n_cells=1000, n_genes=200, seed=789) + + # Warmup + hs_w = Hotspot(adata, model="danb", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True) + hs_w.create_knn_graph(False, n_neighbors=30) + hs_w.compute_autocorrelations() + + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "danb") + + t0 = time.perf_counter() + results_cpu = hs_cpu.compute_autocorrelations() + cpu_auto_time = time.perf_counter() - t0 + + subset_genes = genes[:80] + t0 = time.perf_counter() + lcz_cpu = hs_cpu.compute_local_correlations(subset_genes) + cpu_lc_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + results_gpu = hs_gpu.compute_autocorrelations() + gpu_auto_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + lcz_gpu = hs_gpu.compute_local_correlations(subset_genes) + gpu_lc_time = time.perf_counter() - t0 + + auto_speedup = cpu_auto_time / gpu_auto_time if gpu_auto_time > 0 else float("inf") + lc_speedup = cpu_lc_time / gpu_lc_time if gpu_lc_time > 0 else float("inf") + + print(f"\n--- Large-scale benchmark (1000 cells, 200/80 genes, danb model) ---") 
+ print(f" compute_autocorrelations:") + print(f" CPU: {cpu_auto_time:.3f}s | GPU: {gpu_auto_time:.3f}s | Speedup: {auto_speedup:.1f}x") + print(f" compute_local_correlations (80 genes, {80*79//2} pairs):") + print(f" CPU: {cpu_lc_time:.3f}s | GPU: {gpu_lc_time:.3f}s | Speedup: {lc_speedup:.1f}x") + + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + np.testing.assert_allclose( + results_gpu["Z"].values, results_cpu["Z"].values, rtol=RTOL, atol=ATOL, + ) + np.testing.assert_allclose(lcz_gpu.values, lcz_cpu.values, rtol=RTOL, atol=ATOL)