Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions hotspot/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name = "hotspot"

from .hotspot import Hotspot
from .gpu import is_gpu_available

# https://github.com/python-poetry/poetry/pull/2366#issuecomment-652418094
# https://github.com/python-poetry/poetry/issues/144#issuecomment-623927302
Expand Down
66 changes: 66 additions & 0 deletions hotspot/gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""
GPU utilities for Hotspot. Provides CuPy availability checks and
sparse matrix construction helpers used by the GPU paths in
local_stats.py and local_stats_pairs.py.
"""

import numpy as np

try:
import cupy as cp
import cupyx.scipy.sparse as cp_sparse

HAS_CUPY = True
except ImportError:
HAS_CUPY = False


def is_gpu_available():
    """Return True when GPU acceleration can be used.

    Requires CuPy to be importable (``HAS_CUPY``) and a CUDA device
    that responds to a capability query.
    """
    if not HAS_CUPY:
        return False
    try:
        # Touching the device raises if no usable CUDA GPU is present.
        cp.cuda.Device(0).compute_capability
    except Exception:
        return False
    return True


def _require_gpu():
    """Fail loudly when GPU acceleration cannot be used.

    Raises
    ------
    ImportError
        If CuPy is not installed.
    RuntimeError
        If CuPy is installed but no CUDA-capable device is found.
    """
    if not HAS_CUPY:
        message = (
            "CuPy is required for GPU acceleration. "
            "Install with: pip install hotspotsc[gpu] "
            "(or install cupy separately for your CUDA version, "
            "e.g. pip install cupy-cuda12x)"
        )
        raise ImportError(message)
    try:
        # Probe the first CUDA device; any failure means no usable GPU.
        cp.cuda.Device(0).compute_capability
    except Exception as exc:
        raise RuntimeError(
            "No CUDA-capable GPU device found. "
            "GPU acceleration requires an NVIDIA GPU with CUDA support."
        ) from exc


def _build_sparse_weight_matrix(neighbors, weights, shape, square=False):
    """Assemble a CuPy CSR matrix from KNN neighbor/weight arrays.

    Produces W with ``W[i, neighbors[i, k]] = weights[i, k]`` for every
    nonzero weight. With ``square=True`` the squared weights are stored
    instead (used by the moment computations).
    """
    n_rows, n_neigh = neighbors.shape

    # COO-style triplets: each row i contributes one entry per neighbor.
    row_idx = np.repeat(np.arange(n_rows, dtype=np.int32), n_neigh)
    col_idx = neighbors.astype(np.int32).ravel()
    data = weights.astype(np.float64).ravel()
    if square:
        data = data * data

    # Drop explicit zeros so the sparse structure stays tight.
    keep = data != 0

    return cp_sparse.csr_matrix(
        (cp.asarray(data[keep]), (cp.asarray(row_idx[keep]), cp.asarray(col_idx[keep]))),
        shape=shape,
    )
16 changes: 16 additions & 0 deletions hotspot/hotspot.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from . import modules
from .plots import local_correlation_plot
from .gpu import is_gpu_available
from tqdm import tqdm


Expand All @@ -29,6 +30,7 @@ def __init__(
distances_obsp_key=None,
tree=None,
umi_counts_obs_key=None,
use_gpu=False,
):
"""Initialize a Hotspot object for analysis

Expand Down Expand Up @@ -60,6 +62,10 @@ def __init__(
umi_counts_obs_key : str
Total umi count per cell. Used as a size factor.
If omitted, the sum over genes in the counts matrix is used
use_gpu : bool, optional
Whether to use GPU acceleration via CuPy for embarrassingly
parallel operations. Requires CuPy and a CUDA-capable GPU.
Default is False.
"""
counts = self._counts_from_anndata(adata, layer_key)
distances = (
Expand Down Expand Up @@ -166,6 +172,14 @@ def __init__(
self.linkage = None
self.module_scores = None

if use_gpu:
if not is_gpu_available():
raise RuntimeError(
"use_gpu=True but GPU is not available. "
"Ensure CuPy is installed and a CUDA GPU is present."
)
self.use_gpu = use_gpu

@classmethod
def legacy_init(
cls,
Expand Down Expand Up @@ -416,6 +430,7 @@ def _compute_hotspot(self, jobs=1):
genes=self.adata.var_names,
centered=True,
jobs=jobs,
use_gpu=self.use_gpu,
)

self.results = results
Expand Down Expand Up @@ -486,6 +501,7 @@ def compute_local_correlations(self, genes, jobs=1):
self.umi_counts,
self.model,
jobs=jobs,
use_gpu=self.use_gpu,
)

self.local_correlation_c = lc
Expand Down
160 changes: 143 additions & 17 deletions hotspot/local_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,16 @@ def initializer(neighbors, weights, num_umi, model, centered, Wtot2, D):
g_D = D

def compute_hs(
counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1
counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1,
use_gpu=False
):

if use_gpu:
results = _compute_hs_gpu(
counts, neighbors, weights, num_umi, model, genes, centered
)
return _postprocess_results(results)

neighbors = neighbors.values
weights = weights.values
num_umi = num_umi.values
Expand All @@ -202,16 +209,16 @@ def data_iter():

if jobs > 1:
with multiprocessing.Pool(
processes=jobs,
initializer=initializer,
processes=jobs,
initializer=initializer,
initargs=[neighbors, weights, num_umi, model, centered, Wtot2, D]
) as pool:
results = list(
tqdm(
pool.imap(
_map_fun_parallel,
data_iter()
),
),
total=counts.shape[0]
)
)
Expand All @@ -226,26 +233,23 @@ def _map_fun(vals):

results = pd.DataFrame(results, index=genes, columns=["G", "EG", "stdG", "Z", "C"])

return _postprocess_results(results)


def _postprocess_results(results):
    """Add p-values/FDR to per-gene Z-scores and return the trimmed table.

    Mutates ``results`` by adding "Pval" and "FDR" columns, then returns a
    copy sorted by Z (descending) containing only the public columns.
    """
    z_scores = results["Z"].values
    results["Pval"] = norm.sf(z_scores)
    results["FDR"] = multipletests(results["Pval"], method="fdr_bh")[1]

    ordered = results.sort_values("Z", ascending=False)
    ordered.index.name = "Gene"

    # Expose only the columns callers rely on.
    return ordered[["C", "Z", "Pval", "FDR"]]


def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D):
"""
Note, since this is an inner function, for parallelization to work well
none of the contents of the function can use MKL or OPENBLAS threads.
Or else we open too many. Because of this, some simple numpy operations
are re-implemented using numba instead as it's difficult to control
the number of threads in numpy after it's imported
"""
def _fit_gene(vals, model, num_umi):
"""Fit a gene model and return (vals, mu, var, x2).

For the bernoulli model, vals is binarized before fitting.
"""
if model == "bernoulli":
vals = (vals > 0).astype("double")
mu, var, x2 = bernoulli_model.fit_gene_model(vals, num_umi)
Expand All @@ -256,7 +260,20 @@ def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2,
elif model == "none":
mu, var, x2 = none_model.fit_gene_model(vals, num_umi)
else:
raise Exception("Invalid Model: {}".format(model))
raise ValueError("Invalid Model: {}".format(model))
return vals, mu, var, x2


def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D):
"""
Note, since this is an inner function, for parallelization to work well
none of the contents of the function can use MKL or OPENBLAS threads.
Or else we open too many. Because of this, some simple numpy operations
are re-implemented using numba instead as it's difficult to control
the number of threads in numpy after it's imported
"""

vals, mu, var, x2 = _fit_gene(vals, model, num_umi)

if centered:
vals = center_values(vals, mu, var)
Expand Down Expand Up @@ -289,3 +306,112 @@ def _map_fun_parallel(vals):
return _compute_hs_inner(
vals, g_neighbors, g_weights, g_num_umi, g_model, g_centered, g_Wtot2, g_D
)


def _local_cov_weights_gpu(vals_gpu, W):
    """Batched local_cov_weights on the GPU.

    Computes G[g] = vals[g] . (W @ vals[g]) for every gene g in one
    sparse-dense product.
    """
    # (N_cells, N_genes) -> transpose back so rows index genes again.
    smoothed = (W @ vals_gpu.T).T
    return (vals_gpu * smoothed).sum(axis=1)


def _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq):
    """Batched compute_moments_weights on the GPU, for all genes at once.

    Parameters are gene-by-cell matrices of first moments (mu) and second
    moments (x2), plus the weight matrix W and its elementwise square W_sq.
    Returns (EG, EG2): per-gene expectation and second moment of G.
    """
    mu_sq = mu_gpu ** 2

    # EG[g] = mu[g] . (W @ mu[g])
    EG = (mu_gpu * (W @ mu_gpu.T).T).sum(axis=1)

    # Per-cell aggregates: t1 = (W + W.T) @ mu, t2 = (W_sq + W_sq.T) @ mu^2
    t1 = ((W + W.T) @ mu_gpu.T).T
    t2 = ((W_sq + W_sq.T) @ mu_sq.T).T

    # First contribution: sum_i (x2[i] - mu[i]^2) * (t1[i]^2 - t2[i])
    part_one = ((x2_gpu - mu_sq) * (t1 ** 2 - t2)).sum(axis=1)

    # Second contribution: sum over edges of w^2 * (x2_i*x2_j - mu_i^2*mu_j^2)
    part_two = (x2_gpu * (W_sq @ x2_gpu.T).T).sum(axis=1)
    part_two = part_two - (mu_sq * (W_sq @ mu_sq.T).T).sum(axis=1)

    EG2 = part_one + part_two + EG ** 2
    return EG, EG2


def _compute_local_cov_max_gpu(D_gpu, vals_gpu):
    """Batched compute_local_cov_max on the GPU.

    G_max[g] = 0.5 * sum_i D[i] * vals[g, i]^2, broadcasting the node
    degrees D across all genes at once.
    """
    weighted_sq = vals_gpu * vals_gpu * D_gpu
    return weighted_sq.sum(axis=1) / 2


def _compute_hs_gpu(counts, neighbors, weights, num_umi, model, genes, centered):
    """
    GPU-accelerated version of _compute_hs_inner, batched over all genes.
    All genes are processed in parallel via sparse matrix multiplication.

    Parameters
    ----------
    counts : array or sparse matrix, genes x cells
        Expression counts; densified before processing.
    neighbors, weights : pandas.DataFrame, cells x K
        KNN graph: neighbor indices and edge weights per cell.
    num_umi : pandas.Series
        Per-cell size factors used by the gene models.
    model : str
        Null model name passed through to _fit_gene.
    genes : index-like
        Gene labels used as the result DataFrame index.
    centered : bool
        If True, values are centered/standardized and the analytic
        null moments (EG=0, EG2=Wtot2) are used instead of the
        model-based moment computation.

    Returns
    -------
    pandas.DataFrame indexed by gene with columns G, EG, stdG, Z, C.
    """
    # Imported lazily so the module stays importable without CuPy.
    import cupy as cp
    from .gpu import _require_gpu, _build_sparse_weight_matrix

    _require_gpu()

    neighbors_np = neighbors.values
    weights_np = weights.values
    num_umi_np = num_umi.values

    N_genes = counts.shape[0]
    N_cells = counts.shape[1]

    D = compute_node_degree(neighbors_np, weights_np)
    Wtot2 = (weights_np ** 2).sum()

    if issparse(counts):
        counts_dense = counts.toarray()
    else:
        counts_dense = np.asarray(counts)

    # Staging buffers on the host; mu/x2 are only needed for the
    # uncentered moment computation.
    all_vals = np.zeros((N_genes, N_cells), dtype="double")
    if not centered:
        all_mu = np.zeros((N_genes, N_cells), dtype="double")
        all_x2 = np.zeros((N_genes, N_cells), dtype="double")

    # Model fitting runs per gene on the CPU; only the batched linear
    # algebra below is offloaded to the GPU.
    for i in range(N_genes):
        raw = counts_dense[i].astype("double")

        vals, mu, var, x2 = _fit_gene(raw, model, num_umi_np)
        if centered:
            vals = center_values(vals, mu, var)
        else:
            all_mu[i] = mu
            all_x2[i] = x2
        all_vals[i] = vals

    # Move everything to the device and build the sparse weight matrix once.
    vals_gpu = cp.asarray(all_vals)
    D_gpu = cp.asarray(D)
    W = _build_sparse_weight_matrix(neighbors_np, weights_np, shape=(N_cells, N_cells))

    G_stats = _local_cov_weights_gpu(vals_gpu, W)

    if centered:
        # Centered values have analytic null moments: E[G]=0, E[G^2]=Wtot2.
        EG = cp.zeros(N_genes, dtype="double")
        EG2 = cp.full(N_genes, Wtot2, dtype="double")
    else:
        mu_gpu = cp.asarray(all_mu)
        x2_gpu = cp.asarray(all_x2)
        W_sq = _build_sparse_weight_matrix(
            neighbors_np, weights_np, shape=(N_cells, N_cells), square=True
        )
        EG, EG2 = _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq)

    stdG = (EG2 - EG * EG) ** 0.5
    Z = (G_stats - EG) / stdG

    # C: autocorrelation scaled by its per-gene maximum achievable value.
    G_max = _compute_local_cov_max_gpu(D_gpu, vals_gpu)
    C = (G_stats - EG) / G_max

    return pd.DataFrame(
        {
            "G": cp.asnumpy(G_stats), "EG": cp.asnumpy(EG),
            "stdG": cp.asnumpy(stdG), "Z": cp.asnumpy(Z), "C": cp.asnumpy(C),
        },
        index=genes,
    )
Loading
Loading