Simplify GPU code after review

justjhong · justjhong · commit 5595196aa0ec · 2026-03-15T22:15:41.000-04:00
- Merge _build_sparse_weight_sq_matrix into _build_sparse_weight_matrix via a new square=False parameter
- Remove cp module parameter from GPU helpers (import locally)
- Skip all_mu/all_x2 allocation when centered=True
- Densify sparse counts once upfront instead of per-gene
- Extract _postprocess_results to share between CPU and GPU paths
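- Fix pre-existing bug: in the per-row_j loop of _compute_hs_pairs_inner, the
  'none' model branch refit vals_x into mu_x/var_x/x2_x instead of fitting
  vals_y into mu_y/var_y/x2_y, so the y-side moments were never assigned there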
- Replace inline model dispatch in _compute_hs_pairs_inner with _fit_gene
diff --git a/hotspot/gpu.py b/hotspot/gpu.py
@@ -44,31 +44,18 @@ def _require_gpu():
         ) from e
 
 
-def _build_sparse_weight_matrix(neighbors, weights, shape):
+def _build_sparse_weight_matrix(neighbors, weights, shape, square=False):
     """Build a CuPy sparse CSR matrix from neighbor/weight arrays.
 
     W[i, neighbors[i,k]] = weights[i,k]  for all i, k where weights[i,k] != 0.
+    If square=True, uses weights^2 instead (for moment computations).
     """
     N, K = neighbors.shape
     rows = np.repeat(np.arange(N, dtype=np.int32), K)
     cols = neighbors.ravel().astype(np.int32)
     vals = weights.ravel().astype(np.float64)
-
-    mask = vals != 0
-    rows, cols, vals = rows[mask], cols[mask], vals[mask]
-
-    return cp_sparse.csr_matrix(
-        (cp.asarray(vals), (cp.asarray(rows), cp.asarray(cols))),
-        shape=shape,
-    )
-
-
-def _build_sparse_weight_sq_matrix(neighbors, weights, shape):
-    """Build sparse matrix with squared weights: W_sq[i,j] = weights[i,k]^2."""
-    N, K = neighbors.shape
-    rows = np.repeat(np.arange(N, dtype=np.int32), K)
-    cols = neighbors.ravel().astype(np.int32)
-    vals = (weights.ravel().astype(np.float64)) ** 2
+    if square:
+        vals = vals ** 2
 
     mask = vals != 0
     rows, cols, vals = rows[mask], cols[mask], vals[mask]
diff --git a/hotspot/local_stats.py b/hotspot/local_stats.py
@@ -187,9 +187,10 @@ def compute_hs(
 ):
 
     if use_gpu:
-        return _compute_hs_gpu(
+        results = _compute_hs_gpu(
             counts, neighbors, weights, num_umi, model, genes, centered
         )
+        return _postprocess_results(results)
 
     neighbors = neighbors.values
     weights = weights.values
@@ -232,14 +233,15 @@ def _map_fun(vals):
 
     results = pd.DataFrame(results, index=genes, columns=["G", "EG", "stdG", "Z", "C"])
 
+    return _postprocess_results(results)
+
+
+def _postprocess_results(results):
     results["Pval"] = norm.sf(results["Z"].values)
     results["FDR"] = multipletests(results["Pval"], method="fdr_bh")[1]
-
     results = results.sort_values("Z", ascending=False)
     results.index.name = "Gene"
-
-    results = results[["C", "Z", "Pval", "FDR"]]  # Remove other columns
-
+    results = results[["C", "Z", "Pval", "FDR"]]
     return results
 
 
@@ -312,7 +314,7 @@ def _local_cov_weights_gpu(vals_gpu, W):
     return (vals_gpu * smoothed_T.T).sum(axis=1)
 
 
-def _compute_moments_weights_gpu(cp, mu_gpu, x2_gpu, W, W_sq):
+def _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq):
     """GPU batch of compute_moments_weights for all genes at once."""
     # EG[g] = mu[g] . (W @ mu[g])
     EG = (mu_gpu * (W @ mu_gpu.T).T).sum(axis=1)
@@ -348,7 +350,7 @@ def _compute_hs_gpu(counts, neighbors, weights, num_umi, model, genes, centered)
     All genes are processed in parallel via sparse matrix multiplication.
     """
     import cupy as cp
-    from .gpu import _require_gpu, _build_sparse_weight_matrix, _build_sparse_weight_sq_matrix
+    from .gpu import _require_gpu, _build_sparse_weight_matrix
 
     _require_gpu()
 
@@ -362,22 +364,26 @@ def _compute_hs_gpu(counts, neighbors, weights, num_umi, model, genes, centered)
     D = compute_node_degree(neighbors_np, weights_np)
     Wtot2 = (weights_np ** 2).sum()
 
+    if issparse(counts):
+        counts_dense = counts.toarray()
+    else:
+        counts_dense = np.asarray(counts)
+
     all_vals = np.zeros((N_genes, N_cells), dtype="double")
-    all_mu = np.zeros((N_genes, N_cells), dtype="double")
-    all_x2 = np.zeros((N_genes, N_cells), dtype="double")
+    if not centered:
+        all_mu = np.zeros((N_genes, N_cells), dtype="double")
+        all_x2 = np.zeros((N_genes, N_cells), dtype="double")
 
     for i in range(N_genes):
-        raw = counts[i]
-        if issparse(raw):
-            raw = raw.toarray().ravel()
-        raw = np.asarray(raw).ravel().astype("double")
+        raw = counts_dense[i].astype("double")
 
         vals, mu, var, x2 = _fit_gene(raw, model, num_umi_np)
         if centered:
             vals = center_values(vals, mu, var)
+        else:
+            all_mu[i] = mu
+            all_x2[i] = x2
         all_vals[i] = vals
-        all_mu[i] = mu
-        all_x2[i] = x2
 
     vals_gpu = cp.asarray(all_vals)
     D_gpu = cp.asarray(D)
@@ -391,29 +397,21 @@ def _compute_hs_gpu(counts, neighbors, weights, num_umi, model, genes, centered)
     else:
         mu_gpu = cp.asarray(all_mu)
         x2_gpu = cp.asarray(all_x2)
-        W_sq = _build_sparse_weight_sq_matrix(
-            neighbors_np, weights_np, shape=(N_cells, N_cells)
+        W_sq = _build_sparse_weight_matrix(
+            neighbors_np, weights_np, shape=(N_cells, N_cells), square=True
         )
-        EG, EG2 = _compute_moments_weights_gpu(cp, mu_gpu, x2_gpu, W, W_sq)
+        EG, EG2 = _compute_moments_weights_gpu(mu_gpu, x2_gpu, W, W_sq)
 
     stdG = (EG2 - EG * EG) ** 0.5
     Z = (G_stats - EG) / stdG
 
     G_max = _compute_local_cov_max_gpu(D_gpu, vals_gpu)
     C = (G_stats - EG) / G_max
 
-    results = pd.DataFrame(
+    return pd.DataFrame(
         {
             "G": cp.asnumpy(G_stats), "EG": cp.asnumpy(EG),
             "stdG": cp.asnumpy(stdG), "Z": cp.asnumpy(Z), "C": cp.asnumpy(C),
         },
         index=genes,
     )
-
-    results["Pval"] = norm.sf(results["Z"].values)
-    results["FDR"] = multipletests(results["Pval"], method="fdr_bh")[1]
-    results = results.sort_values("Z", ascending=False)
-    results.index.name = "Gene"
-    results = results[["C", "Z", "Pval", "FDR"]]
-
-    return results
diff --git a/hotspot/local_stats_pairs.py b/hotspot/local_stats_pairs.py
@@ -9,7 +9,7 @@
 from . import bernoulli_model
 from . import normal_model
 from . import none_model
-from .local_stats import compute_local_cov_max
+from .local_stats import compute_local_cov_max, _fit_gene
 from .knn import compute_node_degree
 from .utils import center_values
 
@@ -449,25 +449,7 @@ def _compute_hs_pairs_inner(row_i, counts, neighbors, weights, num_umi,
     lc_out = np.zeros(counts.shape[0])
     lc_z_out = np.zeros(counts.shape[0])
 
-    if model == 'bernoulli':
-        vals_x = (vals_x > 0).astype('double')
-        mu_x, var_x, x2_x = bernoulli_model.fit_gene_model(
-            vals_x, num_umi)
-
-    elif model == 'danb':
-        mu_x, var_x, x2_x = danb_model.fit_gene_model(
-            vals_x, num_umi)
-
-    elif model == 'normal':
-        mu_x, var_x, x2_x = normal_model.fit_gene_model(
-            vals_x, num_umi)
-
-    elif model == 'none':
-        mu_x, var_x, x2_x = none_model.fit_gene_model(
-            vals_x, num_umi)
-
-    else:
-        raise Exception("Invalid Model: {}".format(model))
+    vals_x, mu_x, var_x, x2_x = _fit_gene(vals_x, model, num_umi)
 
     if centered:
         vals_x = center_values(vals_x, mu_x, var_x)
@@ -479,25 +461,7 @@ def _compute_hs_pairs_inner(row_i, counts, neighbors, weights, num_umi,
 
         vals_y = counts[row_j]
 
-        if model == 'bernoulli':
-            vals_y = (vals_y > 0).astype('double')
-            mu_y, var_y, x2_y = bernoulli_model.fit_gene_model(
-                vals_y, num_umi)
-
-        elif model == 'danb':
-            mu_y, var_y, x2_y = danb_model.fit_gene_model(
-                vals_y, num_umi)
-
-        elif model == 'normal':
-            mu_y, var_y, x2_y = normal_model.fit_gene_model(
-                vals_y, num_umi)
-
-        elif model == 'none':
-            mu_x, var_x, x2_x = none_model.fit_gene_model(
-                vals_x, num_umi)
-
-        else:
-            raise Exception("Invalid Model: {}".format(model))
+        vals_y, mu_y, var_y, x2_y = _fit_gene(vals_y, model, num_umi)
 
         if centered:
             vals_y = center_values(vals_y, mu_y, var_y)
@@ -889,12 +853,13 @@ def _conditional_eg2_gpu(X, W_sym):
     return (t1x_T ** 2).sum(axis=0)
 
 
-def _local_cov_pair_all_gpu(cp, X, W):
+def _local_cov_pair_all_gpu(X, W):
     """GPU batch of local_cov_pair for ALL gene pairs via dense matmul.
 
     Returns the full G x G matrix of lc values (= local_cov_pair * 2).
     Diagonal is zeroed (no self-pairs).
     """
+    import cupy as cp
     smoothed_T = W @ X.T                # (N, G)
     M = X @ smoothed_T                  # (G, G):  M[a,b] = x_a . (W @ x_b)
     lc_matrix = M + M.T                 # symmetrize: lc[a,b] = x_a.(Wx_b) + x_b.(Wx_a)
@@ -930,7 +895,7 @@ def _compute_hs_pairs_centered_cond_gpu(counts, neighbors, weights, num_umi, mod
 
     eg2s = _conditional_eg2_gpu(X, W_sym)
 
-    lc_matrix = _local_cov_pair_all_gpu(cp, X, W)
+    lc_matrix = _local_cov_pair_all_gpu(X, W)
 
     std_genes = eg2s ** 0.5
     Z_xy = lc_matrix / std_genes[:, None]