Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hotspot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name = "hotspot"

from .hotspot import Hotspot
from .gpu import is_gpu_available

# https://github.com/python-poetry/poetry/pull/2366#issuecomment-652418094
# https://github.com/python-poetry/poetry/issues/144#issuecomment-623927302
Expand Down
66 changes: 66 additions & 0 deletions hotspot/gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
GPU utilities for Hotspot. Provides CuPy availability checks and
sparse matrix construction helpers used by the GPU paths in
local_stats.py and local_stats_pairs.py.
"""

import numpy as np

try:
import cupy as cp
import cupyx.scipy.sparse as cp_sparse

HAS_CUPY = True
except ImportError:
HAS_CUPY = False


def is_gpu_available():
    """Return True when GPU acceleration can be used.

    Requires CuPy to be importable (``HAS_CUPY``) and a CUDA device
    that responds to a capability query.
    """
    if not HAS_CUPY:
        return False
    try:
        # Touching the device raises if no usable CUDA GPU is present.
        cp.cuda.Device(0).compute_capability
    except Exception:
        return False
    return True


def _require_gpu():
    """Fail loudly when GPU acceleration cannot be used.

    Raises
    ------
    ImportError
        If CuPy is not installed.
    RuntimeError
        If CuPy is installed but no CUDA-capable device is found.
    """
    if not HAS_CUPY:
        message = (
            "CuPy is required for GPU acceleration. "
            "Install with: pip install hotspotsc[gpu] "
            "(or install cupy separately for your CUDA version, "
            "e.g. pip install cupy-cuda12x)"
        )
        raise ImportError(message)
    try:
        # Probe the first CUDA device; any failure means no usable GPU.
        cp.cuda.Device(0).compute_capability
    except Exception as exc:
        raise RuntimeError(
            "No CUDA-capable GPU device found. "
            "GPU acceleration requires an NVIDIA GPU with CUDA support."
        ) from exc


def _build_sparse_weight_matrix(neighbors, weights, shape, square=False):
    """Assemble a CuPy CSR matrix from KNN neighbor/weight arrays.

    Produces W with ``W[i, neighbors[i, k]] = weights[i, k]`` for every
    nonzero weight. With ``square=True`` the squared weights are stored
    instead (used by the moment computations).
    """
    n_rows, n_neigh = neighbors.shape

    # COO-style triplets: each row i contributes one entry per neighbor.
    row_idx = np.repeat(np.arange(n_rows, dtype=np.int32), n_neigh)
    col_idx = neighbors.astype(np.int32).ravel()
    data = weights.astype(np.float64).ravel()
    if square:
        data = data * data

    # Drop explicit zeros so the sparse structure stays tight.
    keep = data != 0

    return cp_sparse.csr_matrix(
        (cp.asarray(data[keep]), (cp.asarray(row_idx[keep]), cp.asarray(col_idx[keep]))),
        shape=shape,
    )
16 changes: 16 additions & 0 deletions hotspot/hotspot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from . import modules
from .plots import local_correlation_plot
from .gpu import is_gpu_available
from tqdm import tqdm


Expand All @@ -29,6 +30,7 @@ def __init__(
distances_obsp_key=None,
tree=None,
umi_counts_obs_key=None,
use_gpu=False,
):
"""Initialize a Hotspot object for analysis

Expand Down Expand Up @@ -60,6 +62,10 @@ def __init__(
umi_counts_obs_key : str
Total umi count per cell. Used as a size factor.
If omitted, the sum over genes in the counts matrix is used
use_gpu : bool, optional
Whether to use GPU acceleration via CuPy for embarrassingly
parallel operations. Requires CuPy and a CUDA-capable GPU.
Default is False.
"""
counts = self._counts_from_anndata(adata, layer_key)
distances = (
Expand Down Expand Up @@ -166,6 +172,14 @@ def __init__(
self.linkage = None
self.module_scores = None

if use_gpu:
if not is_gpu_available():
raise RuntimeError(
"use_gpu=True but GPU is not available. "
"Ensure CuPy is installed and a CUDA GPU is present."
)
self.use_gpu = use_gpu

@classmethod
def legacy_init(
cls,
Expand Down Expand Up @@ -416,6 +430,7 @@ def _compute_hotspot(self, jobs=1):
genes=self.adata.var_names,
centered=True,
jobs=jobs,
use_gpu=self.use_gpu,
)

self.results = results
Expand Down Expand Up @@ -486,6 +501,7 @@ def compute_local_correlations(self, genes, jobs=1):
self.umi_counts,
self.model,
jobs=jobs,
use_gpu=self.use_gpu,
)

self.local_correlation_c = lc
Expand Down
160 changes: 143 additions & 17 deletions hotspot/local_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,16 @@ def initializer(neighbors, weights, num_umi, model, centered, Wtot2, D):
g_D = D

def compute_hs(
counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1
counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1,
use_gpu=False
):

if use_gpu:
results = _compute_hs_gpu(
counts, neighbors, weights, num_umi, model, genes, centered
)
return _postprocess_results(results)

neighbors = neighbors.values
weights = weights.values
num_umi = num_umi.values
Expand All @@ -202,16 +209,16 @@ def data_iter():

if jobs > 1:
with multiprocessing.Pool(
processes=jobs,
initializer=initializer,
processes=jobs,
initializer=initializer,
initargs=[neighbors, weights, num_umi, model, centered, Wtot2, D]
) as pool:
results = list(
tqdm(
pool.imap(
_map_fun_parallel,
data_iter()
),
),
total=counts.shape[0]
)
)
Expand All @@ -226,26 +233,23 @@ def _map_fun(vals):

results = pd.DataFrame(results, index=genes, columns=["G", "EG", "stdG", "Z", "C"])

return _postprocess_results(results)


def _postprocess_results(results):
    """Add p-values/FDR to per-gene Z-scores and return the trimmed table.

    Mutates ``results`` by adding "Pval" and "FDR" columns, then returns a
    copy sorted by Z (descending) containing only the public columns.
    """
    z_scores = results["Z"].values
    results["Pval"] = norm.sf(z_scores)
    results["FDR"] = multipletests(results["Pval"], method="fdr_bh")[1]

    ordered = results.sort_values("Z", ascending=False)
    ordered.index.name = "Gene"

    # Expose only the columns callers rely on.
    return ordered[["C", "Z", "Pval", "FDR"]]


def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D):
"""
Note, since this is an inner function, for parallelization to work well
none of the contents of the function can use MKL or OPENBLAS threads.
Or else we open too many. Because of this, some simple numpy operations
are re-implemented using numba instead as it's difficult to control
the number of threads in numpy after it's imported
"""
def _fit_gene(vals, model, num_umi):
"""Fit a gene model and return (vals, mu, var, x2).

For the bernoulli model, vals is binarized before fitting.
"""
if model == "bernoulli":
vals = (vals > 0).astype("double")
mu, var, x2 = bernoulli_model.fit_gene_model(vals, num_umi)
Expand All @@ -256,7 +260,20 @@ def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2,
elif model == "none":
mu, var, x2 = none_model.fit_gene_model(vals, num_umi)
else:
raise Exception("Invalid Model: {}".format(model))
raise ValueError("Invalid Model: {}".format(model))
return vals, mu, var, x2


def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D):
"""
Note, since this is an inner function, for parallelization to work well
none of the contents of the function can use MKL or OPENBLAS threads.
Or else we open too many. Because of this, some simple numpy operations
are re-implemented using numba instead as it's difficult to control
the number of threads in numpy after it's imported
"""

vals, mu, var, x2 = _fit_gene(vals, model, num_umi)

if centered:
vals = center_values(vals, mu, var)
Expand Down Expand Up @@ -289,3 +306,112 @@ def _map_fun_parallel(vals):
return _compute_hs_inner(
vals, g_neighbors, g_weights, g_num_umi, g_model, g_centered, g_Wtot2, g_D
)


def _local_cov_weights_gpu(vals_gpu, W):
    """Batched local_cov_weights on the GPU.

    Computes G[g] = vals[g] . (W @ vals[g]) for every gene g in one
    sparse-dense product.
    """
    # (N_cells, N_genes) -> transpose back so rows index genes again.
    smoothed = (W @ vals_gpu.T).T
    return (vals_gpu * smoothed).sum(axis=1)


def _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq):
    """Batched compute_moments_weights on the GPU, for all genes at once.

    Parameters are gene-by-cell matrices of first moments (mu) and second
    moments (x2), plus the weight matrix W and its elementwise square W_sq.
    Returns (EG, EG2): per-gene expectation and second moment of G.
    """
    mu_sq = mu_gpu ** 2

    # EG[g] = mu[g] . (W @ mu[g])
    EG = (mu_gpu * (W @ mu_gpu.T).T).sum(axis=1)

    # Per-cell aggregates: t1 = (W + W.T) @ mu, t2 = (W_sq + W_sq.T) @ mu^2
    t1 = ((W + W.T) @ mu_gpu.T).T
    t2 = ((W_sq + W_sq.T) @ mu_sq.T).T

    # First contribution: sum_i (x2[i] - mu[i]^2) * (t1[i]^2 - t2[i])
    part_one = ((x2_gpu - mu_sq) * (t1 ** 2 - t2)).sum(axis=1)

    # Second contribution: sum over edges of w^2 * (x2_i*x2_j - mu_i^2*mu_j^2)
    part_two = (x2_gpu * (W_sq @ x2_gpu.T).T).sum(axis=1)
    part_two = part_two - (mu_sq * (W_sq @ mu_sq.T).T).sum(axis=1)

    EG2 = part_one + part_two + EG ** 2
    return EG, EG2


def _compute_local_cov_max_gpu(D_gpu, vals_gpu):
    """Batched compute_local_cov_max on the GPU.

    G_max[g] = 0.5 * sum_i D[i] * vals[g, i]^2, broadcasting the node
    degrees D across all genes at once.
    """
    weighted_sq = vals_gpu * vals_gpu * D_gpu
    return weighted_sq.sum(axis=1) / 2


def _compute_hs_gpu(counts, neighbors, weights, num_umi, model, genes, centered):
    """
    GPU-accelerated version of _compute_hs_inner, batched over all genes.
    All genes are processed in parallel via sparse matrix multiplication.

    Parameters
    ----------
    counts : array or sparse matrix, genes x cells
        Expression counts; densified before processing.
    neighbors, weights : pandas.DataFrame, cells x K
        KNN graph: neighbor indices and edge weights per cell.
    num_umi : pandas.Series
        Per-cell size factors used by the gene models.
    model : str
        Null model name passed through to _fit_gene.
    genes : index-like
        Gene labels used as the result DataFrame index.
    centered : bool
        If True, values are centered/standardized and the analytic
        null moments (EG=0, EG2=Wtot2) are used instead of the
        model-based moment computation.

    Returns
    -------
    pandas.DataFrame indexed by gene with columns G, EG, stdG, Z, C.
    """
    # Imported lazily so the module stays importable without CuPy.
    import cupy as cp
    from .gpu import _require_gpu, _build_sparse_weight_matrix

    _require_gpu()

    neighbors_np = neighbors.values
    weights_np = weights.values
    num_umi_np = num_umi.values

    N_genes = counts.shape[0]
    N_cells = counts.shape[1]

    D = compute_node_degree(neighbors_np, weights_np)
    Wtot2 = (weights_np ** 2).sum()

    if issparse(counts):
        counts_dense = counts.toarray()
    else:
        counts_dense = np.asarray(counts)

    # Staging buffers on the host; mu/x2 are only needed for the
    # uncentered moment computation.
    all_vals = np.zeros((N_genes, N_cells), dtype="double")
    if not centered:
        all_mu = np.zeros((N_genes, N_cells), dtype="double")
        all_x2 = np.zeros((N_genes, N_cells), dtype="double")

    # Model fitting runs per gene on the CPU; only the batched linear
    # algebra below is offloaded to the GPU.
    for i in range(N_genes):
        raw = counts_dense[i].astype("double")

        vals, mu, var, x2 = _fit_gene(raw, model, num_umi_np)
        if centered:
            vals = center_values(vals, mu, var)
        else:
            all_mu[i] = mu
            all_x2[i] = x2
        all_vals[i] = vals

    # Move everything to the device and build the sparse weight matrix once.
    vals_gpu = cp.asarray(all_vals)
    D_gpu = cp.asarray(D)
    W = _build_sparse_weight_matrix(neighbors_np, weights_np, shape=(N_cells, N_cells))

    G_stats = _local_cov_weights_gpu(vals_gpu, W)

    if centered:
        # Centered values have analytic null moments: E[G]=0, E[G^2]=Wtot2.
        EG = cp.zeros(N_genes, dtype="double")
        EG2 = cp.full(N_genes, Wtot2, dtype="double")
    else:
        mu_gpu = cp.asarray(all_mu)
        x2_gpu = cp.asarray(all_x2)
        W_sq = _build_sparse_weight_matrix(
            neighbors_np, weights_np, shape=(N_cells, N_cells), square=True
        )
        EG, EG2 = _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq)

    stdG = (EG2 - EG * EG) ** 0.5
    Z = (G_stats - EG) / stdG

    # C: autocorrelation scaled by its per-gene maximum achievable value.
    G_max = _compute_local_cov_max_gpu(D_gpu, vals_gpu)
    C = (G_stats - EG) / G_max

    return pd.DataFrame(
        {
            "G": cp.asnumpy(G_stats), "EG": cp.asnumpy(EG),
            "stdG": cp.asnumpy(stdG), "Z": cp.asnumpy(Z), "C": cp.asnumpy(C),
        },
        index=genes,
    )
Loading
Loading