diff --git a/hotspot/__init__.py b/hotspot/__init__.py index 78588d3..4b05d90 100644 --- a/hotspot/__init__.py +++ b/hotspot/__init__.py @@ -1,6 +1,7 @@ name = "hotspot" from .hotspot import Hotspot +from .gpu import is_gpu_available # https://github.com/python-poetry/poetry/pull/2366#issuecomment-652418094 # https://github.com/python-poetry/poetry/issues/144#issuecomment-623927302 diff --git a/hotspot/gpu.py b/hotspot/gpu.py new file mode 100644 index 0000000..7735312 --- /dev/null +++ b/hotspot/gpu.py @@ -0,0 +1,66 @@ +""" +GPU utilities for Hotspot. Provides CuPy availability checks and +sparse matrix construction helpers used by the GPU paths in +local_stats.py and local_stats_pairs.py. +""" + +import numpy as np + +try: + import cupy as cp + import cupyx.scipy.sparse as cp_sparse + + HAS_CUPY = True +except ImportError: + HAS_CUPY = False + + +def is_gpu_available(): + """Check whether GPU acceleration is available (CuPy installed + CUDA device present).""" + if not HAS_CUPY: + return False + try: + cp.cuda.Device(0).compute_capability + return True + except Exception: + return False + + +def _require_gpu(): + """Raise an informative error if GPU is not available.""" + if not HAS_CUPY: + raise ImportError( + "CuPy is required for GPU acceleration. " + "Install with: pip install hotspotsc[gpu] " + "(or install cupy separately for your CUDA version, " + "e.g. pip install cupy-cuda12x)" + ) + try: + cp.cuda.Device(0).compute_capability + except Exception as e: + raise RuntimeError( + "No CUDA-capable GPU device found. " + "GPU acceleration requires an NVIDIA GPU with CUDA support." + ) from e + + +def _build_sparse_weight_matrix(neighbors, weights, shape, square=False): + """Build a CuPy sparse CSR matrix from neighbor/weight arrays. + + W[i, neighbors[i,k]] = weights[i,k] for all i, k where weights[i,k] != 0. + If square=True, uses weights^2 instead (for moment computations). 
+ """ + N, K = neighbors.shape + rows = np.repeat(np.arange(N, dtype=np.int32), K) + cols = neighbors.ravel().astype(np.int32) + vals = weights.ravel().astype(np.float64) + if square: + vals = vals ** 2 + + mask = vals != 0 + rows, cols, vals = rows[mask], cols[mask], vals[mask] + + return cp_sparse.csr_matrix( + (cp.asarray(vals), (cp.asarray(rows), cp.asarray(cols))), + shape=shape, + ) diff --git a/hotspot/hotspot.py b/hotspot/hotspot.py index 01dc45b..0f80467 100644 --- a/hotspot/hotspot.py +++ b/hotspot/hotspot.py @@ -16,6 +16,7 @@ from . import modules from .plots import local_correlation_plot +from .gpu import is_gpu_available from tqdm import tqdm @@ -29,6 +30,7 @@ def __init__( distances_obsp_key=None, tree=None, umi_counts_obs_key=None, + use_gpu=False, ): """Initialize a Hotspot object for analysis @@ -60,6 +62,10 @@ def __init__( umi_counts_obs_key : str Total umi count per cell. Used as a size factor. If omitted, the sum over genes in the counts matrix is used + use_gpu : bool, optional + Whether to use GPU acceleration via CuPy for embarrassingly + parallel operations. Requires CuPy and a CUDA-capable GPU. + Default is False. """ counts = self._counts_from_anndata(adata, layer_key) distances = ( @@ -166,6 +172,14 @@ def __init__( self.linkage = None self.module_scores = None + if use_gpu: + if not is_gpu_available(): + raise RuntimeError( + "use_gpu=True but GPU is not available. " + "Ensure CuPy is installed and a CUDA GPU is present." 
+ ) + self.use_gpu = use_gpu + @classmethod def legacy_init( cls, @@ -416,6 +430,7 @@ def _compute_hotspot(self, jobs=1): genes=self.adata.var_names, centered=True, jobs=jobs, + use_gpu=self.use_gpu, ) self.results = results @@ -486,6 +501,7 @@ def compute_local_correlations(self, genes, jobs=1): self.umi_counts, self.model, jobs=jobs, + use_gpu=self.use_gpu, ) self.local_correlation_c = lc diff --git a/hotspot/local_stats.py b/hotspot/local_stats.py index 8d58d0a..a8813b8 100644 --- a/hotspot/local_stats.py +++ b/hotspot/local_stats.py @@ -182,9 +182,16 @@ def initializer(neighbors, weights, num_umi, model, centered, Wtot2, D): g_D = D def compute_hs( - counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1 + counts, neighbors, weights, num_umi, model, genes, centered=False, jobs=1, + use_gpu=False ): + if use_gpu: + results = _compute_hs_gpu( + counts, neighbors, weights, num_umi, model, genes, centered + ) + return _postprocess_results(results) + neighbors = neighbors.values weights = weights.values num_umi = num_umi.values @@ -202,8 +209,8 @@ def data_iter(): if jobs > 1: with multiprocessing.Pool( - processes=jobs, - initializer=initializer, + processes=jobs, + initializer=initializer, initargs=[neighbors, weights, num_umi, model, centered, Wtot2, D] ) as pool: results = list( @@ -211,7 +218,7 @@ def data_iter(): pool.imap( _map_fun_parallel, data_iter() - ), + ), total=counts.shape[0] ) ) @@ -226,26 +233,23 @@ def _map_fun(vals): results = pd.DataFrame(results, index=genes, columns=["G", "EG", "stdG", "Z", "C"]) + return _postprocess_results(results) + + +def _postprocess_results(results): results["Pval"] = norm.sf(results["Z"].values) results["FDR"] = multipletests(results["Pval"], method="fdr_bh")[1] - results = results.sort_values("Z", ascending=False) results.index.name = "Gene" - - results = results[["C", "Z", "Pval", "FDR"]] # Remove other columns - + results = results[["C", "Z", "Pval", "FDR"]] return results -def 
_compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D): - """ - Note, since this is an inner function, for parallelization to work well - none of the contents of the function can use MKL or OPENBLAS threads. - Or else we open too many. Because of this, some simple numpy operations - are re-implemented using numba instead as it's difficult to control - the number of threads in numpy after it's imported - """ +def _fit_gene(vals, model, num_umi): + """Fit a gene model and return (vals, mu, var, x2). + For the bernoulli model, vals is binarized before fitting. + """ if model == "bernoulli": vals = (vals > 0).astype("double") mu, var, x2 = bernoulli_model.fit_gene_model(vals, num_umi) @@ -256,7 +260,20 @@ def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, elif model == "none": mu, var, x2 = none_model.fit_gene_model(vals, num_umi) else: - raise Exception("Invalid Model: {}".format(model)) + raise ValueError("Invalid Model: {}".format(model)) + return vals, mu, var, x2 + + +def _compute_hs_inner(vals, neighbors, weights, num_umi, model, centered, Wtot2, D): + """ + Note, since this is an inner function, for parallelization to work well + none of the contents of the function can use MKL or OPENBLAS threads. + Or else we open too many. Because of this, some simple numpy operations + are re-implemented using numba instead as it's difficult to control + the number of threads in numpy after it's imported + """ + + vals, mu, var, x2 = _fit_gene(vals, model, num_umi) if centered: vals = center_values(vals, mu, var) @@ -289,3 +306,112 @@ def _map_fun_parallel(vals): return _compute_hs_inner( vals, g_neighbors, g_weights, g_num_umi, g_model, g_centered, g_Wtot2, g_D ) + + +def _local_cov_weights_gpu(vals_gpu, W): + """GPU batch of local_cov_weights: G[g] = vals[g] . 
(W @ vals[g]) for all genes.""" + smoothed_T = W @ vals_gpu.T + return (vals_gpu * smoothed_T.T).sum(axis=1) + + +def _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq): + """GPU batch of compute_moments_weights for all genes at once.""" + # EG[g] = mu[g] . (W @ mu[g]) + EG = (mu_gpu * (W @ mu_gpu.T).T).sum(axis=1) + + # t1[g] = (W + W.T) @ mu[g], t2[g] = (W_sq + W_sq.T) @ mu[g]^2 + W_sym = W + W.T + W_sq_sym = W_sq + W_sq.T + mu2_gpu = mu_gpu ** 2 + + t1_T = W_sym @ mu_gpu.T + t2_T = W_sq_sym @ mu2_gpu.T + + # Contribution 1: sum_i (x2[i] - mu[i]^2) * (t1[i]^2 - t2[i]) + diff_var = (x2_gpu - mu2_gpu).T + eg2_c1 = (diff_var * (t1_T ** 2 - t2_T)).sum(axis=0) + + # Contribution 2: sum_{edges} w^2 * (x2[i]*x2[j] - mu[i]^2*mu[j]^2) + eg2_c2 = (x2_gpu.T * (W_sq @ x2_gpu.T)).sum(axis=0) + eg2_c2 -= (mu2_gpu.T * (W_sq @ mu2_gpu.T)).sum(axis=0) + + EG2 = eg2_c1 + eg2_c2 + EG ** 2 + return EG, EG2 + + +def _compute_local_cov_max_gpu(D_gpu, vals_gpu): + """GPU batch of compute_local_cov_max: G_max[g] = sum_i D[i]*vals[g,i]^2 / 2.""" + return (D_gpu * vals_gpu ** 2).sum(axis=1) / 2 + + +def _compute_hs_gpu(counts, neighbors, weights, num_umi, model, genes, centered): + """ + GPU-accelerated version of _compute_hs_inner, batched over all genes. + All genes are processed in parallel via sparse matrix multiplication. 
+ """ + import cupy as cp + from .gpu import _require_gpu, _build_sparse_weight_matrix + + _require_gpu() + + neighbors_np = neighbors.values + weights_np = weights.values + num_umi_np = num_umi.values + + N_genes = counts.shape[0] + N_cells = counts.shape[1] + + D = compute_node_degree(neighbors_np, weights_np) + Wtot2 = (weights_np ** 2).sum() + + if issparse(counts): + counts_dense = counts.toarray() + else: + counts_dense = np.asarray(counts) + + all_vals = np.zeros((N_genes, N_cells), dtype="double") + if not centered: + all_mu = np.zeros((N_genes, N_cells), dtype="double") + all_x2 = np.zeros((N_genes, N_cells), dtype="double") + + for i in range(N_genes): + raw = counts_dense[i].astype("double") + + vals, mu, var, x2 = _fit_gene(raw, model, num_umi_np) + if centered: + vals = center_values(vals, mu, var) + else: + all_mu[i] = mu + all_x2[i] = x2 + all_vals[i] = vals + + vals_gpu = cp.asarray(all_vals) + D_gpu = cp.asarray(D) + W = _build_sparse_weight_matrix(neighbors_np, weights_np, shape=(N_cells, N_cells)) + + G_stats = _local_cov_weights_gpu(vals_gpu, W) + + if centered: + EG = cp.zeros(N_genes, dtype="double") + EG2 = cp.full(N_genes, Wtot2, dtype="double") + else: + mu_gpu = cp.asarray(all_mu) + x2_gpu = cp.asarray(all_x2) + W_sq = _build_sparse_weight_matrix( + neighbors_np, weights_np, shape=(N_cells, N_cells), square=True + ) + EG, EG2 = _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq) + + stdG = (EG2 - EG * EG) ** 0.5 + Z = (G_stats - EG) / stdG + + G_max = _compute_local_cov_max_gpu(D_gpu, vals_gpu) + C = (G_stats - EG) / G_max + + return pd.DataFrame( + { + "G": cp.asnumpy(G_stats), "EG": cp.asnumpy(EG), + "stdG": cp.asnumpy(stdG), "Z": cp.asnumpy(Z), "C": cp.asnumpy(C), + }, + index=genes, + ) diff --git a/hotspot/local_stats_pairs.py b/hotspot/local_stats_pairs.py index 035dfa0..4d1cde5 100644 --- a/hotspot/local_stats_pairs.py +++ b/hotspot/local_stats_pairs.py @@ -9,7 +9,7 @@ from . import bernoulli_model from . 
import normal_model from . import none_model -from .local_stats import compute_local_cov_max +from .local_stats import compute_local_cov_max, _fit_gene from .knn import compute_node_degree from .utils import center_values @@ -449,25 +449,7 @@ def _compute_hs_pairs_inner(row_i, counts, neighbors, weights, num_umi, lc_out = np.zeros(counts.shape[0]) lc_z_out = np.zeros(counts.shape[0]) - if model == 'bernoulli': - vals_x = (vals_x > 0).astype('double') - mu_x, var_x, x2_x = bernoulli_model.fit_gene_model( - vals_x, num_umi) - - elif model == 'danb': - mu_x, var_x, x2_x = danb_model.fit_gene_model( - vals_x, num_umi) - - elif model == 'normal': - mu_x, var_x, x2_x = normal_model.fit_gene_model( - vals_x, num_umi) - - elif model == 'none': - mu_x, var_x, x2_x = none_model.fit_gene_model( - vals_x, num_umi) - - else: - raise Exception("Invalid Model: {}".format(model)) + vals_x, mu_x, var_x, x2_x = _fit_gene(vals_x, model, num_umi) if centered: vals_x = center_values(vals_x, mu_x, var_x) @@ -479,25 +461,7 @@ def _compute_hs_pairs_inner(row_i, counts, neighbors, weights, num_umi, vals_y = counts[row_j] - if model == 'bernoulli': - vals_y = (vals_y > 0).astype('double') - mu_y, var_y, x2_y = bernoulli_model.fit_gene_model( - vals_y, num_umi) - - elif model == 'danb': - mu_y, var_y, x2_y = danb_model.fit_gene_model( - vals_y, num_umi) - - elif model == 'normal': - mu_y, var_y, x2_y = normal_model.fit_gene_model( - vals_y, num_umi) - - elif model == 'none': - mu_x, var_x, x2_x = none_model.fit_gene_model( - vals_x, num_umi) - - else: - raise Exception("Invalid Model: {}".format(model)) + vals_y, mu_y, var_y, x2_y = _fit_gene(vals_y, model, num_umi) if centered: vals_y = center_values(vals_y, mu_y, var_y) @@ -744,7 +708,12 @@ def initializer3(counts, neighbors, weights, eg2s): def compute_hs_pairs_centered_cond(counts, neighbors, weights, - num_umi, model, jobs=1): + num_umi, model, jobs=1, use_gpu=False): + + if use_gpu: + return _compute_hs_pairs_centered_cond_gpu( + 
counts, neighbors, weights, num_umi, model + ) genes = counts.index @@ -876,3 +845,68 @@ def compute_local_cov_pairs_max(node_degrees, counts): result = gene_maxs.reshape((-1, 1)) + gene_maxs.reshape((1, -1)) result = result / 2 return result + + +def _conditional_eg2_gpu(X, W_sym): + """GPU batch of conditional_eg2: eg2[g] = ||(W + W.T) @ x[g]||^2.""" + t1x_T = W_sym @ X.T + return (t1x_T ** 2).sum(axis=0) + + +def _local_cov_pair_all_gpu(X, W): + """GPU batch of local_cov_pair for ALL gene pairs via dense matmul. + + Returns the full G x G matrix of lc values (= local_cov_pair * 2). + Diagonal is zeroed (no self-pairs). + """ + import cupy as cp + smoothed_T = W @ X.T # (N, G) + M = X @ smoothed_T # (G, G): M[a,b] = x_a . (W @ x_b) + lc_matrix = M + M.T # symmetrize: lc[a,b] = x_a.(Wx_b) + x_b.(Wx_a) + cp.fill_diagonal(lc_matrix, 0) + return lc_matrix + + +def _compute_hs_pairs_centered_cond_gpu(counts, neighbors, weights, num_umi, model): + """ + GPU-accelerated version of compute_hs_pairs_centered_cond, batched + over all gene pairs via dense matmul: M = X @ (W @ X.T), lc = M + M.T + """ + import cupy as cp + from .gpu import _require_gpu, _build_sparse_weight_matrix + + _require_gpu() + + genes = counts.index + counts_np = counts.values + neighbors_np = neighbors.values + weights_np = weights.values + num_umi_np = num_umi.values + N_cells = counts_np.shape[1] + + D = compute_node_degree(neighbors_np, weights_np) + + centered_counts = create_centered_counts(counts_np, model, num_umi_np) + + X = cp.asarray(centered_counts) + D_gpu = cp.asarray(D) + W = _build_sparse_weight_matrix(neighbors_np, weights_np, shape=(N_cells, N_cells)) + W_sym = W + W.T + + eg2s = _conditional_eg2_gpu(X, W_sym) + + lc_matrix = _local_cov_pair_all_gpu(X, W) + + std_genes = eg2s ** 0.5 + Z_xy = lc_matrix / std_genes[:, None] + Z_yx = lc_matrix / std_genes[None, :] + lc_zs = cp.where(cp.abs(Z_xy) < cp.abs(Z_yx), Z_xy, Z_yx) + + gene_maxs = (D_gpu * X ** 2).sum(axis=1) / 2 + lc_maxs = 
(gene_maxs[:, None] + gene_maxs[None, :]) / 2 + lcs = lc_matrix / lc_maxs + + lcs_df = pd.DataFrame(cp.asnumpy(lcs), index=genes, columns=genes) + lc_zs_df = pd.DataFrame(cp.asnumpy(lc_zs), index=genes, columns=genes) + + return lcs_df, lc_zs_df diff --git a/pyproject.toml b/pyproject.toml index 173289e..588267d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,15 +51,23 @@ myst-nb = {version = "*", optional = true} sphinx = {version = ">=4.1", optional = true} ipython = {version = "*", optional = true} scanpy = {version = "*", optional = true} +cupy-cuda12x = {version = ">=12.0", optional = true} [tool.poetry.extras] test = ["pytest", "scanpy"] +gpu = ["cupy-cuda12x"] docs=["sphinx-book-theme", "myst-nb", "sphinx", "ipython"] [tool.poetry.dev-dependencies] +[tool.pytest.ini_options] +markers = [ + "gpu: tests requiring a CUDA GPU and CuPy (deselected by default, run with: pytest -m gpu)", +] +addopts = "-m 'not gpu'" + [build-system] build-backend = "poetry.masonry.api" requires = [ diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..a80d3fe --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,7 @@ +import pytest + + +def pytest_configure(config): + config.addinivalue_line( + "markers", "gpu: tests requiring a CUDA GPU and CuPy" + ) diff --git a/tests/test_gpu.py b/tests/test_gpu.py new file mode 100644 index 0000000..95a2885 --- /dev/null +++ b/tests/test_gpu.py @@ -0,0 +1,507 @@ +""" +GPU equivalence and performance tests for Hotspot. + +These tests verify that GPU-accelerated computations produce results +numerically equivalent to their CPU counterparts, and benchmark speedup. + +All tests are marked with @pytest.mark.gpu and are excluded from CI by +default (no GPU resources). 
Run explicitly with: + + pytest -m gpu -v + +or to include GPU tests alongside regular tests: + + pytest -m '' -v + +Tolerance rationale: + GPU sparse matmul and CPU sequential Numba loops accumulate + floating-point sums in different order, yielding ~1e-7 relative + differences at float64 precision. We use rtol=1e-6 (10x margin). + +Note: + All tests share a single KNN graph between CPU and GPU runs to + isolate the GPU computation from pynndescent's stochastic KNN + construction. +""" + +import time + +import numpy as np +import pandas as pd +import pytest +import anndata +from scipy.sparse import csc_matrix + +from hotspot import Hotspot +from hotspot.gpu import is_gpu_available + +# Skip the entire module if no GPU is available +pytestmark = pytest.mark.gpu + +GPU_AVAILABLE = is_gpu_available() +skip_no_gpu = pytest.mark.skipif(not GPU_AVAILABLE, reason="No GPU available") + +# Tolerance for GPU vs CPU comparison (see module docstring) +RTOL = 1e-6 +ATOL = 1e-10 + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_test_data(n_cells, n_genes, n_dim=10, seed=42): + """Create reproducible test data for GPU equivalence tests.""" + rng = np.random.RandomState(seed) + + latent = rng.normal(size=(n_cells, n_dim)) + latent_df = pd.DataFrame( + latent, + index=["Cell{}".format(i + 1) for i in range(n_cells)], + ) + + umi_counts = np.floor( + np.exp(rng.normal(loc=np.log(2000), scale=0.3, size=n_cells)) + ) + + gene_exp = rng.rand(n_genes, n_cells) * 10 + 0.1 + gene_names = ["Gene{}".format(i + 1) for i in range(n_genes)] + gene_exp_df = pd.DataFrame(gene_exp, index=gene_names, columns=latent_df.index) + + adata = anndata.AnnData(gene_exp_df.T) + adata.layers["sparse"] = csc_matrix(adata.X) + adata.obsm["latent"] = latent + adata.obs["umi_counts"] = umi_counts + + return adata, gene_names + + +def _make_cpu_gpu_pair(adata, model, 
layer_key=None, n_neighbors=30): + """Create CPU and GPU Hotspot objects sharing the same KNN graph.""" + hs_cpu = Hotspot( + adata, + model=model, + latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", + layer_key=layer_key, + use_gpu=False, + ) + hs_cpu.create_knn_graph(False, n_neighbors=n_neighbors) + + hs_gpu = Hotspot( + adata, + model=model, + latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", + layer_key=layer_key, + use_gpu=True, + ) + # Share the same KNN graph so we compare GPU vs CPU computation only + hs_gpu.neighbors = hs_cpu.neighbors + hs_gpu.weights = hs_cpu.weights + + return hs_cpu, hs_gpu + + +# --------------------------------------------------------------------------- +# compute_autocorrelations: GPU vs CPU equivalence (per model) +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "bernoulli", "normal", "none"]) +def test_compute_autocorrelations_equivalence(model): + """GPU and CPU compute_autocorrelations produce equivalent results.""" + adata, genes = _make_test_data(n_cells=200, n_genes=20) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, model) + + results_cpu = hs_cpu.compute_autocorrelations() + results_gpu = hs_gpu.compute_autocorrelations() + + # Align index ordering + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + + assert results_cpu.shape == results_gpu.shape + + np.testing.assert_allclose( + results_gpu["Z"].values, + results_cpu["Z"].values, + rtol=RTOL, + atol=ATOL, + err_msg=f"Z-score mismatch for model={model}", + ) + + np.testing.assert_allclose( + results_gpu["C"].values, + results_cpu["C"].values, + rtol=RTOL, + atol=ATOL, + err_msg=f"C value mismatch for model={model}", + ) + + np.testing.assert_allclose( + results_gpu["Pval"].values, + results_cpu["Pval"].values, + rtol=1e-4, + atol=1e-15, + err_msg=f"P-value mismatch for model={model}", + ) + + 
np.testing.assert_allclose( + results_gpu["FDR"].values, + results_cpu["FDR"].values, + rtol=1e-4, + atol=1e-15, + err_msg=f"FDR mismatch for model={model}", + ) + + +# --------------------------------------------------------------------------- +# compute_local_correlations: GPU vs CPU equivalence (per model) +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "bernoulli", "normal", "none"]) +def test_compute_local_correlations_equivalence(model): + """GPU and CPU compute_local_correlations produce equivalent results.""" + adata, genes = _make_test_data(n_cells=200, n_genes=15) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, model) + + hs_cpu.compute_autocorrelations() + lcz_cpu = hs_cpu.compute_local_correlations(genes) + + hs_gpu.compute_autocorrelations() + lcz_gpu = hs_gpu.compute_local_correlations(genes) + + assert lcz_cpu.shape == lcz_gpu.shape + + np.testing.assert_allclose( + lcz_gpu.values, + lcz_cpu.values, + rtol=RTOL, + atol=ATOL, + err_msg=f"Local correlation Z mismatch for model={model}", + ) + + lc_cpu = hs_cpu.local_correlation_c + lc_gpu = hs_gpu.local_correlation_c + np.testing.assert_allclose( + lc_gpu.values, + lc_cpu.values, + rtol=RTOL, + atol=ATOL, + err_msg=f"Local correlation C mismatch for model={model}", + ) + + # Verify symmetry of GPU output + np.testing.assert_allclose( + lcz_gpu.values, lcz_gpu.values.T, rtol=1e-12, atol=1e-12, + err_msg="GPU Z-score matrix should be symmetric", + ) + np.testing.assert_allclose( + lc_gpu.values, lc_gpu.values.T, rtol=1e-12, atol=1e-12, + err_msg="GPU correlation matrix should be symmetric", + ) + + +# --------------------------------------------------------------------------- +# Sparse input equivalence +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "normal"]) +def test_sparse_input_equivalence(model): + """GPU produces 
identical results with sparse vs dense input.""" + adata, genes = _make_test_data(n_cells=150, n_genes=12) + + # Dense + hs_dense = Hotspot( + adata, model=model, latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", layer_key=None, use_gpu=True, + ) + hs_dense.create_knn_graph(False, n_neighbors=30) + results_dense = hs_dense.compute_autocorrelations() + + # Sparse, sharing the same KNN graph + hs_sparse = Hotspot( + adata, model=model, latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", layer_key="sparse", use_gpu=True, + ) + hs_sparse.neighbors = hs_dense.neighbors + hs_sparse.weights = hs_dense.weights + results_sparse = hs_sparse.compute_autocorrelations() + + results_dense = results_dense.loc[sorted(results_dense.index)] + results_sparse = results_sparse.loc[sorted(results_sparse.index)] + + np.testing.assert_allclose( + results_sparse["Z"].values, results_dense["Z"].values, + rtol=RTOL, atol=ATOL, err_msg="Sparse vs dense GPU results differ", + ) + + +# --------------------------------------------------------------------------- +# Different data sizes +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize( + "n_cells,n_genes,n_neighbors", + [(50, 5, 10), (100, 10, 20), (300, 25, 30)], +) +def test_varying_sizes(n_cells, n_genes, n_neighbors): + """GPU matches CPU across different dataset sizes.""" + adata, genes = _make_test_data(n_cells=n_cells, n_genes=n_genes) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "normal", n_neighbors=n_neighbors) + + results_cpu = hs_cpu.compute_autocorrelations() + results_gpu = hs_gpu.compute_autocorrelations() + + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + + np.testing.assert_allclose( + results_gpu["Z"].values, results_cpu["Z"].values, + rtol=RTOL, atol=ATOL, + err_msg=f"Size {n_cells}x{n_genes} k={n_neighbors}: Z mismatch", + ) + + +# 
--------------------------------------------------------------------------- +# GPU determinism +# --------------------------------------------------------------------------- + + +@skip_no_gpu +def test_gpu_determinism(): + """Running GPU computation twice on the same graph produces identical results.""" + adata, genes = _make_test_data(n_cells=150, n_genes=15) + + hs = Hotspot( + adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True, + ) + hs.create_knn_graph(False, n_neighbors=30) + + # First run + r1 = hs.compute_autocorrelations() + lcz1 = hs.compute_local_correlations(genes) + + # Second run (same object, same graph) + r2 = hs.compute_autocorrelations() + lcz2 = hs.compute_local_correlations(genes) + + r1 = r1.loc[sorted(r1.index)] + r2 = r2.loc[sorted(r2.index)] + + # Allow tiny float64 slack (atol=1e-15, a few ULPs at unit scale) for GPU parallel reduction non-determinism + np.testing.assert_allclose(r1.values, r2.values, rtol=0, atol=1e-15, + err_msg="GPU autocorrelation results are not deterministic") + np.testing.assert_allclose(lcz1.values, lcz2.values, rtol=0, atol=1e-15, + err_msg="GPU local correlation results are not deterministic") + + +# --------------------------------------------------------------------------- +# Full pipeline equivalence (end-to-end including modules) +# --------------------------------------------------------------------------- + + +@skip_no_gpu +@pytest.mark.parametrize("model", ["danb", "bernoulli", "normal", "none"]) +def test_full_pipeline_equivalence(model): + """Full Hotspot pipeline produces equivalent results with GPU and CPU.""" + adata, genes = _make_test_data(n_cells=200, n_genes=20) + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, model) + + # CPU pipeline + hs_cpu.compute_autocorrelations() + hs_cpu.compute_local_correlations(genes) + hs_cpu.create_modules(min_gene_threshold=2, fdr_threshold=1) + hs_cpu.calculate_module_scores() + + # GPU pipeline + hs_gpu.compute_autocorrelations() + 
hs_gpu.compute_local_correlations(genes) + hs_gpu.create_modules(min_gene_threshold=2, fdr_threshold=1) + hs_gpu.calculate_module_scores() + + r_cpu = hs_cpu.results.loc[sorted(hs_cpu.results.index)] + r_gpu = hs_gpu.results.loc[sorted(hs_gpu.results.index)] + np.testing.assert_allclose(r_gpu["Z"].values, r_cpu["Z"].values, rtol=RTOL, atol=ATOL) + + np.testing.assert_allclose( + hs_gpu.local_correlation_z.values, hs_cpu.local_correlation_z.values, + rtol=RTOL, atol=ATOL, + ) + + pd.testing.assert_series_equal( + hs_gpu.modules.sort_index(), hs_cpu.modules.sort_index(), check_names=True, + ) + + np.testing.assert_allclose( + hs_gpu.module_scores.values, hs_cpu.module_scores.values, + rtol=1e-6, atol=1e-6, + ) + + +# --------------------------------------------------------------------------- +# Diagonal of local correlation matrix should be zero +# --------------------------------------------------------------------------- + + +@skip_no_gpu +def test_local_correlation_diagonal_zero(): + """GPU local correlation matrix diagonal should be zero.""" + adata, genes = _make_test_data(n_cells=100, n_genes=10) + hs = Hotspot( + adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True, + ) + hs.create_knn_graph(False, n_neighbors=30) + hs.compute_autocorrelations() + lcz = hs.compute_local_correlations(genes) + + np.testing.assert_array_equal(np.diag(lcz.values), np.zeros(len(genes))) + np.testing.assert_array_equal(np.diag(hs.local_correlation_c.values), np.zeros(len(genes))) + + +# --------------------------------------------------------------------------- +# Error handling +# --------------------------------------------------------------------------- + + +def test_use_gpu_without_gpu(): + """Setting use_gpu=True without GPU raises RuntimeError.""" + if GPU_AVAILABLE: + pytest.skip("GPU is available, cannot test missing-GPU error") + + adata, genes = _make_test_data(n_cells=50, n_genes=5) + with pytest.raises(RuntimeError, match="GPU 
is not available"): + Hotspot( + adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True, + ) + + +# --------------------------------------------------------------------------- +# Benchmarks: quantify GPU speedup +# --------------------------------------------------------------------------- + + +@skip_no_gpu +def test_benchmark_compute_autocorrelations(): + """Benchmark GPU vs CPU speed for compute_autocorrelations.""" + adata, genes = _make_test_data(n_cells=500, n_genes=100, seed=123) + + # Warmup GPU + hs_w = Hotspot(adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True) + hs_w.create_knn_graph(False, n_neighbors=30) + hs_w.compute_autocorrelations() + + # Shared graph for fair comparison + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "normal") + + t0 = time.perf_counter() + results_cpu = hs_cpu.compute_autocorrelations() + cpu_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + results_gpu = hs_gpu.compute_autocorrelations() + gpu_time = time.perf_counter() - t0 + + speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf") + print(f"\n--- compute_autocorrelations benchmark (500 cells, 100 genes) ---") + print(f" CPU time: {cpu_time:.3f}s") + print(f" GPU time: {gpu_time:.3f}s") + print(f" Speedup: {speedup:.1f}x") + + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + np.testing.assert_allclose( + results_gpu["Z"].values, results_cpu["Z"].values, rtol=RTOL, atol=ATOL, + ) + + +@skip_no_gpu +def test_benchmark_compute_local_correlations(): + """Benchmark GPU vs CPU speed for compute_local_correlations.""" + adata, genes = _make_test_data(n_cells=500, n_genes=50, seed=456) + + # Warmup + hs_w = Hotspot(adata, model="normal", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True) + hs_w.create_knn_graph(False, n_neighbors=30) + hs_w.compute_autocorrelations() + 
hs_w.compute_local_correlations(genes) + + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "normal") + hs_cpu.compute_autocorrelations() + hs_gpu.compute_autocorrelations() + + t0 = time.perf_counter() + lcz_cpu = hs_cpu.compute_local_correlations(genes) + cpu_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + lcz_gpu = hs_gpu.compute_local_correlations(genes) + gpu_time = time.perf_counter() - t0 + + speedup = cpu_time / gpu_time if gpu_time > 0 else float("inf") + print(f"\n--- compute_local_correlations benchmark (500 cells, 50 genes) ---") + print(f" CPU time: {cpu_time:.3f}s") + print(f" GPU time: {gpu_time:.3f}s") + print(f" Speedup: {speedup:.1f}x") + + np.testing.assert_allclose(lcz_gpu.values, lcz_cpu.values, rtol=RTOL, atol=ATOL) + + +@skip_no_gpu +def test_benchmark_large_scale(): + """Benchmark at realistic scale: 1000 cells, 200 genes (danb model).""" + adata, genes = _make_test_data(n_cells=1000, n_genes=200, seed=789) + + # Warmup + hs_w = Hotspot(adata, model="danb", latent_obsm_key="latent", + umi_counts_obs_key="umi_counts", use_gpu=True) + hs_w.create_knn_graph(False, n_neighbors=30) + hs_w.compute_autocorrelations() + + hs_cpu, hs_gpu = _make_cpu_gpu_pair(adata, "danb") + + t0 = time.perf_counter() + results_cpu = hs_cpu.compute_autocorrelations() + cpu_auto_time = time.perf_counter() - t0 + + subset_genes = genes[:80] + t0 = time.perf_counter() + lcz_cpu = hs_cpu.compute_local_correlations(subset_genes) + cpu_lc_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + results_gpu = hs_gpu.compute_autocorrelations() + gpu_auto_time = time.perf_counter() - t0 + + t0 = time.perf_counter() + lcz_gpu = hs_gpu.compute_local_correlations(subset_genes) + gpu_lc_time = time.perf_counter() - t0 + + auto_speedup = cpu_auto_time / gpu_auto_time if gpu_auto_time > 0 else float("inf") + lc_speedup = cpu_lc_time / gpu_lc_time if gpu_lc_time > 0 else float("inf") + + print(f"\n--- Large-scale benchmark (1000 cells, 200/80 genes, danb model) ---") 
+ print(f" compute_autocorrelations:") + print(f" CPU: {cpu_auto_time:.3f}s | GPU: {gpu_auto_time:.3f}s | Speedup: {auto_speedup:.1f}x") + print(f" compute_local_correlations (80 genes, {80*79//2} pairs):") + print(f" CPU: {cpu_lc_time:.3f}s | GPU: {gpu_lc_time:.3f}s | Speedup: {lc_speedup:.1f}x") + + results_cpu = results_cpu.loc[sorted(results_cpu.index)] + results_gpu = results_gpu.loc[sorted(results_gpu.index)] + np.testing.assert_allclose( + results_gpu["Z"].values, results_cpu["Z"].values, rtol=RTOL, atol=ATOL, + ) + np.testing.assert_allclose(lcz_gpu.values, lcz_cpu.values, rtol=RTOL, atol=ATOL)