LISI implementation (#20)

justjhong · pre-commit-ci[bot] · adamgayoso · web-flow · commit 0940e937c931 · 2022-10-10T20:28:56.000-07:00
* simpson index compute * running lisi * fix bug with lisi and expose bug in silhouette test * add docs * flake8 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * adjust atol for tiny silhouette val differences * flake * address flax comment * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use nneighbors to convert graph * Update pyproject.toml * move dep to test * use chex * add ilisi and clisi * add to docs * add references in docs * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adam Gayoso <adamgayoso@users.noreply.github.com>
diff --git a/docs/api.md b/docs/api.md
@@ -14,6 +14,8 @@
     nmi_ari_cluster_labels_leiden
     silhouette_label
     silhouette_batch
+    ilisi_knn
+    clisi_knn
 ```
 
 ## Utils
diff --git a/docs/references.bib b/docs/references.bib
@@ -8,3 +8,18 @@ @article{luecken2022benchmarking
   year      = {2022},
   publisher = {Nature Publishing Group}
 }
+
+
+@article{korsunsky2019harmony,
+  title    = {Fast, sensitive and accurate integration of single-cell data with
+              Harmony},
+  author   = {Korsunsky, Ilya and Millard, Nghia and Fan, Jean and Slowikowski,
+              Kamil and Zhang, Fan and Wei, Kevin and Baglaenko, Yuriy and
+              Brenner, Michael and Loh, Po-Ru and Raychaudhuri, Soumya},
+  journal  = {Nat. Methods},
+  volume   = {16},
+  number   = {12},
+  pages    = {1289--1296},
+  month    = {dec},
+  year     = {2019},
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,7 @@ urls.Source = "https://github.com/yoseflab/scib-metrics"
 urls.Home-page = "https://github.com/yoseflab/scib-metrics"
 dependencies = [
     "anndata",
+    "chex",
     "jax",
     "jaxlib",
     "numpy",
@@ -51,6 +52,7 @@ doc = [
 test = [
     "pytest",
     "pytest-cov",
+    "harmonypy",
     "joblib",
 ]
 parallel = [
diff --git a/src/scib_metrics/__init__.py b/src/scib_metrics/__init__.py
@@ -4,6 +4,7 @@
 from . import utils
 from ._ari_nmi import nmi_ari_cluster_labels_kmeans, nmi_ari_cluster_labels_leiden
 from ._isolated_labels import isolated_labels
+from ._lisi import clisi_knn, ilisi_knn, lisi_knn
 from ._settings import settings
 from ._silhouette import silhouette_batch, silhouette_label
 
@@ -12,6 +13,8 @@
     "isolated_labels",
     "silhouette_label",
     "silhouette_batch",
+    "ilisi_knn",
+    "clisi_knn",
     "nmi_ari_cluster_labels_kmeans",
     "nmi_ari_cluster_labels_leiden",
 ]
diff --git a/src/scib_metrics/_lisi.py b/src/scib_metrics/_lisi.py
@@ -0,0 +1,116 @@
+from typing import Tuple
+
+import numpy as np
+from scipy.sparse import csr_matrix
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_array
+
+from scib_metrics.utils import compute_simpson_index
+
+
+def _convert_knn_graph_to_idx(knn_graph: csr_matrix) -> Tuple[np.ndarray, np.ndarray]:
+    """Convert a sparse kNN distance graph into neighbor distance and index arrays.
+
+    Parameters
+    ----------
+    knn_graph
+        Sparse array of shape (n_cells, n_cells) whose non-zero entries are the
+        distances from each cell to its nearest neighbors.
+
+    Returns
+    -------
+    knn_dists, knn_idx
+        The pair returned by ``NearestNeighbors.kneighbors`` on the precomputed
+        graph: neighbor distances and neighbor indices.
+
+    Raises
+    ------
+    ValueError
+        If the cells do not all have the same, non-zero number of neighbors.
+    """
+    # Keep the validated matrix: ``check_array`` returns the (possibly converted)
+    # array rather than modifying its argument in place.
+    knn_graph = check_array(knn_graph, accept_sparse="csr")
+
+    # Count neighbors for every row, including rows without any non-zero entry;
+    # taking ``np.unique`` of the non-zero row indices would silently omit them.
+    neighbor_counts = np.bincount(knn_graph.nonzero()[0], minlength=knn_graph.shape[0])
+    distinct_counts = np.unique(neighbor_counts)
+    if len(distinct_counts) != 1 or distinct_counts[0] == 0:
+        raise ValueError("Each cell must have the same number of neighbors.")
+
+    n_neighbors = int(distinct_counts[0])
+
+    nn_obj = NearestNeighbors(n_neighbors=n_neighbors, metric="precomputed").fit(knn_graph)
+    return nn_obj.kneighbors(knn_graph)
+
+
+def lisi_knn(knn_graph: csr_matrix, labels: np.ndarray, perplexity: float = None) -> np.ndarray:
+    """Compute the local inverse simpson index (LISI) for each cell :cite:p:`korsunsky2019harmony`.
+
+    Parameters
+    ----------
+    knn_graph
+        Sparse array of shape (n_cells, n_cells) with non-zero values for
+        exactly each cell's k nearest neighbors.
+    labels
+        Array of shape (n_cells,) representing label values
+        for each cell. Any values accepted by ``np.unique`` may be used
+        (e.g. strings or non-contiguous integers); they are encoded internally.
+    perplexity
+        Parameter controlling effective neighborhood size. If None, the
+        perplexity is set to the number of neighbors // 3.
+
+    Returns
+    -------
+    lisi
+        Array of shape (n_cells,) with the LISI score for each cell.
+    """
+    knn_dists, knn_idx = _convert_knn_graph_to_idx(knn_graph)
+
+    if perplexity is None:
+        perplexity = np.floor(knn_idx.shape[1] / 3)
+
+    # Encode labels as consecutive integer codes 0..n_labels-1. The Simpson index
+    # one-hot encodes the labels with ``n_labels`` classes, so raw string labels or
+    # integer labels >= n_labels would fail or be left out of the class counts.
+    unique_labels, label_codes = np.unique(np.asarray(labels), return_inverse=True)
+    n_labels = len(unique_labels)
+
+    simpson = compute_simpson_index(knn_dists, knn_idx, label_codes, n_labels, perplexity=perplexity)
+    return 1 / simpson
+
+
+def ilisi_knn(knn_graph: csr_matrix, batches: np.ndarray, perplexity: float = None, scale: bool = True) -> float:
+    """Compute the median integration local inverse simpson index (iLISI) :cite:p:`korsunsky2019harmony`.
+
+    The per-cell LISI scores computed on the batch labels are summarized by their
+    median (NaNs ignored). By default the median is rescaled :cite:p:`luecken2022benchmarking`.
+
+    Parameters
+    ----------
+    knn_graph
+        Sparse array of shape (n_cells, n_cells) with non-zero values for
+        exactly each cell's k nearest neighbors.
+    batches
+        Array of shape (n_cells,) representing batch values
+        for each cell.
+    perplexity
+        Parameter controlling effective neighborhood size. If None, the
+        perplexity is set to the number of neighbors // 3.
+    scale
+        Scale lisi into the range [0, 1]. If True, higher values are better.
+
+    Returns
+    -------
+    ilisi
+        A single score (not a per-cell array): the median LISI over all cells,
+        rescaled as ``(median - 1) / (n_batches - 1)`` when ``scale`` is True.
+
+    Notes
+    -----
+    With a single unique batch the scaling denominator ``n_batches - 1`` is zero.
+    """
+    lisi = lisi_knn(knn_graph, batches, perplexity=perplexity)
+    ilisi = np.nanmedian(lisi)
+    if scale:
+        nbatches = len(np.unique(batches))
+        # Map the LISI range [1, nbatches] onto [0, 1].
+        ilisi = (ilisi - 1) / (nbatches - 1)
+    return ilisi
+
+
+def clisi_knn(knn_graph: csr_matrix, labels: np.ndarray, perplexity: float = None, scale: bool = True) -> float:
+    """Compute the median cell-type local inverse simpson index (cLISI) :cite:p:`korsunsky2019harmony`.
+
+    The per-cell LISI scores computed on the cell-type labels are summarized by their
+    median (NaNs ignored). By default the median is rescaled :cite:p:`luecken2022benchmarking`.
+
+    Parameters
+    ----------
+    knn_graph
+        Sparse array of shape (n_cells, n_cells) with non-zero values for
+        exactly each cell's k nearest neighbors.
+    labels
+        Array of shape (n_cells,) representing cell type label values
+        for each cell.
+    perplexity
+        Parameter controlling effective neighborhood size. If None, the
+        perplexity is set to the number of neighbors // 3.
+    scale
+        Scale lisi into the range [0, 1]. If True, higher values are better.
+
+    Returns
+    -------
+    clisi
+        A single score (not a per-cell array): the median LISI over all cells,
+        rescaled as ``(n_labels - median) / (n_labels - 1)`` when ``scale`` is True.
+
+    Notes
+    -----
+    With a single unique label the scaling denominator ``n_labels - 1`` is zero.
+    """
+    lisi = lisi_knn(knn_graph, labels, perplexity=perplexity)
+    clisi = np.nanmedian(lisi)
+    if scale:
+        nlabels = len(np.unique(labels))
+        # Map the LISI range [1, nlabels] onto [0, 1], flipped so that a
+        # neighborhood containing a single label scores highest.
+        clisi = (nlabels - clisi) / (nlabels - 1)
+    return clisi
diff --git a/src/scib_metrics/utils/__init__.py b/src/scib_metrics/utils/__init__.py
@@ -1,5 +1,6 @@
 from ._dist import cdist
 from ._kmeans import KMeansJax
+from ._lisi import compute_simpson_index
 from ._silhouette import silhouette_samples
 
-__all__ = ["silhouette_samples", "cdist", "KMeansJax"]
+__all__ = ["silhouette_samples", "cdist", "KMeansJax", "compute_simpson_index"]
diff --git a/src/scib_metrics/utils/_lisi.py b/src/scib_metrics/utils/_lisi.py
@@ -0,0 +1,122 @@
+from typing import Tuple, Union
+
+import chex
+import jax
+import jax.numpy as jnp
+import numpy as np
+
+NdArray = Union[np.ndarray, jnp.ndarray]
+
+
+@chex.dataclass
+class _NeighborProbabilityState:
+    """Loop state carried through the ``jax.lax.while_loop`` in ``_get_neighbor_probability``."""
+
+    # First value returned by ``_Hbeta`` for the current ``beta``.
+    H: float
+    # Second value returned by ``_Hbeta``: the normalized neighbor weights.
+    P: chex.ArrayDevice
+    # ``H - log(perplexity)``; the search stops once its magnitude is within ``tol``.
+    Hdiff: float
+    # Current value of the searched parameter.
+    beta: float
+    # Lower search bound for ``beta`` (``-inf`` until one has been found).
+    betamin: float
+    # Upper search bound for ``beta`` (``inf`` until one has been found).
+    betamax: float
+    # Number of search steps taken so far.
+    tries: int
+
+
+@jax.jit
+def _Hbeta(knn_dists_row: jnp.ndarray, beta: float) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    """Return ``(H, P)`` for one row of neighbor distances at a given ``beta``.
+
+    ``P`` holds the weights ``exp(-distance * beta)`` normalized by their NaN-ignoring
+    sum, and ``H`` is ``log(sum) + beta * sum(distance * weight) / sum``. When the
+    weights sum to zero, ``H`` is 0 and ``P`` is all zeros.
+    """
+    weights = jnp.exp(-knn_dists_row * beta)
+    total = jnp.nansum(weights)
+    no_mass = total == 0
+    weighted_dist = jnp.nansum(knn_dists_row * weights)
+    # Both branches of ``where`` are evaluated; the guard only selects the result.
+    H = jnp.where(no_mass, 0, jnp.log(total) + beta * weighted_dist / total)
+    P = jnp.where(no_mass, jnp.zeros_like(knn_dists_row), weights / total)
+    return H, P
+
+
+@jax.jit
+def _get_neighbor_probability(
+    knn_dists_row: jnp.ndarray, perplexity: float, tol: float
+) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    """Search for the ``beta`` whose ``H`` matches ``log(perplexity)`` and return ``(H, P)``.
+
+    Starting from ``beta = 1``, ``beta`` is adjusted until ``|H - log(perplexity)|``
+    is no larger than ``tol`` or 50 steps have been taken.
+
+    Parameters
+    ----------
+    knn_dists_row
+        Distances from one cell to its neighbors.
+    perplexity
+        Target perplexity; the search compares ``H`` against its logarithm.
+    tol
+        Tolerance on ``|H - log(perplexity)|`` at which the search stops.
+
+    Returns
+    -------
+    H, P
+        The values returned by ``_Hbeta`` for the final ``beta``.
+    """
+    beta = 1
+    betamin = -jnp.inf
+    betamax = jnp.inf
+    H, P = _Hbeta(knn_dists_row, beta)
+    Hdiff = H - jnp.log(perplexity)
+
+    def _get_neighbor_probability_step(state):
+        # One search step: tighten the bound on the side indicated by the sign of
+        # Hdiff, then move beta and recompute H and P.
+        Hdiff = state.Hdiff
+        beta = state.beta
+        betamin = state.betamin
+        betamax = state.betamax
+        tries = state.tries
+
+        # Hdiff > 0: the current beta becomes the new lower bound;
+        # otherwise it becomes the new upper bound.
+        new_betamin = jnp.where(Hdiff > 0, beta, betamin)
+        new_betamax = jnp.where(Hdiff > 0, betamax, beta)
+        # While the relevant bound is still infinite, double or halve beta;
+        # once it is finite, move to the midpoint between beta and that bound.
+        new_beta = jnp.where(
+            Hdiff > 0,
+            jnp.where(betamax == jnp.inf, beta * 2, (beta + betamax) / 2),
+            jnp.where(betamin == -jnp.inf, beta / 2, (beta + betamin) / 2),
+        )
+        new_H, new_P = _Hbeta(knn_dists_row, new_beta)
+        new_Hdiff = new_H - jnp.log(perplexity)
+        return _NeighborProbabilityState(
+            H=new_H, P=new_P, Hdiff=new_Hdiff, beta=new_beta, betamin=new_betamin, betamax=new_betamax, tries=tries + 1
+        )
+
+    def _get_neighbor_probability_convergence(state):
+        # Loop condition: keep iterating while outside the tolerance and under 50 steps.
+        Hdiff, tries = state.Hdiff, state.tries
+        return jnp.logical_and(jnp.abs(Hdiff) > tol, tries < 50)
+
+    init_state = _NeighborProbabilityState(H=H, P=P, Hdiff=Hdiff, beta=beta, betamin=betamin, betamax=betamax, tries=0)
+    final_state = jax.lax.while_loop(_get_neighbor_probability_convergence, _get_neighbor_probability_step, init_state)
+    return final_state.H, final_state.P
+
+
+def _compute_simpson_index_cell(
+    knn_dists_row: jnp.ndarray, knn_row: jnp.ndarray, labels: jnp.ndarray, n_batches: int, perplexity: float, tol: float
+) -> jnp.ndarray:
+    """Compute the Simpson index of a single cell from its neighbors.
+
+    Parameters
+    ----------
+    knn_dists_row
+        Distances from the cell to its neighbors.
+    knn_row
+        Indices of those neighbors, used to look up entries of ``labels``.
+    labels
+        Labels of all cells; neighbor labels are one-hot encoded, so they are
+        expected to be integer codes below ``n_batches``.
+    n_batches
+        Number of classes used for the one-hot encoding.
+    perplexity
+        Target perplexity passed to ``_get_neighbor_probability``.
+    tol
+        Search tolerance passed to ``_get_neighbor_probability``.
+
+    Returns
+    -------
+    The sum over labels of the squared per-label neighbor weight, or -1 when the
+    ``H`` returned by ``_get_neighbor_probability`` is 0.
+    """
+    H, P = _get_neighbor_probability(knn_dists_row, perplexity, tol)
+
+    def _non_zero_H_simpson():
+        knn_labels = jnp.take(labels, knn_row)
+        L = jax.nn.one_hot(knn_labels, n_batches)
+        # Total neighbor weight assigned to each label.
+        sumP = P @ L
+        # NOTE(review): this compares two static shapes and looks always true when
+        # knn_row and knn_dists_row have the same length; confirm whether the
+        # fallback value of 1 is reachable.
+        return jnp.where(knn_labels.shape[0] == P.shape[0], jnp.dot(sumP, sumP), 1)
+
+    # -1 is the sentinel for H == 0; both sides of ``where`` are computed regardless.
+    return jnp.where(H == 0, -1, _non_zero_H_simpson())
+
+
+def compute_simpson_index(
+    knn_dists: NdArray,
+    knn_idx: NdArray,
+    labels: NdArray,
+    n_labels: int,
+    perplexity: float = 30,
+    tol: float = 1e-5,
+) -> np.ndarray:
+    """Compute the Simpson index of every cell from its nearest neighbors.
+
+    Parameters
+    ----------
+    knn_dists
+        Neighbor distances, one row per cell: shape (n_cells, n_neighbors).
+    knn_idx
+        Neighbor indices matching ``knn_dists``: shape (n_cells, n_neighbors).
+    labels
+        Label of each cell, shape (n_cells,).
+    n_labels
+        Number of distinct labels.
+    perplexity
+        Measure of the effective number of neighbors.
+    tol
+        Tolerance at which the per-cell search stops.
+
+    Returns
+    -------
+    simpson_index
+        Simpson index per cell, shape (n_cells,), fetched back to the host.
+    """
+    dists_dev = jnp.array(knn_dists)
+    idx_dev = jnp.array(knn_idx)
+    labels_dev = jnp.array(labels)
+
+    def _simpson_for_cell(cell):
+        # Slice out one cell's neighbor row; vmap batches this over all cells.
+        return _compute_simpson_index_cell(dists_dev[cell, :], idx_dev[cell, :], labels_dev, n_labels, perplexity, tol)
+
+    cell_ids = jnp.arange(dists_dev.shape[0])
+    per_cell = jax.vmap(_simpson_for_cell)(cell_ids)
+    return jax.device_get(per_cell)
diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -1,22 +1,31 @@
+import sys
+
 import jax.numpy as jnp
 import numpy as np
+import pandas as pd
+from harmonypy import compute_lisi as harmonypy_lisi
+from scipy.sparse import csr_matrix
 from scipy.spatial.distance import cdist as sp_cdist
 from sklearn.metrics import silhouette_samples as sk_silhouette_samples
+from sklearn.neighbors import NearestNeighbors
 
 import scib_metrics
 
+sys.path.append("../src/")
+
 
 def dummy_x_labels(return_symmetric_positive=False):
-    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
+    """Return a seeded random (100, 10) matrix and 100 binary labels.
+
+    With ``return_symmetric_positive`` the matrix is replaced by ``abs(X @ X.T)``.
+    """
+    # Re-seed the global NumPy RNG so every call yields the same data.
+    np.random.seed(1)
+    X = np.random.normal(size=(100, 10))
+    labels = np.random.randint(0, 2, size=(100,))
     if return_symmetric_positive:
         X = np.abs(X @ X.T)
-    labels = np.array([0, 0, 1, 1, 0, 1])
     return X, labels
 
 
 def dummy_x_labels_batch():
+    """Return the dummy matrix and labels plus 100 random binary batch assignments."""
     X, labels = dummy_x_labels()
-    batch = np.array([0, 1, 0, 1, 0, 1])
+    # Drawn from the global RNG right after dummy_x_labels() seeded it.
+    batch = np.random.randint(0, 2, size=(100,))
     return X, labels, batch
 
 
@@ -32,7 +41,7 @@ def test_cdist():
 
 def test_silhouette_samples():
+    """Compare ``silhouette_samples`` against the scikit-learn implementation."""
     X, labels = dummy_x_labels()
-    assert np.allclose(scib_metrics.utils.silhouette_samples(X, labels), sk_silhouette_samples(X, labels))
+    # The absolute tolerance absorbs tiny numerical differences between the two implementations.
+    assert np.allclose(scib_metrics.utils.silhouette_samples(X, labels), sk_silhouette_samples(X, labels), atol=1e-5)
 
 
 def test_silhouette_label():
@@ -49,6 +58,39 @@ def test_silhouette_batch():
     scib_metrics.silhouette_batch(X, labels, batch)
 
 
+def test_compute_simpson_index():
+    """Run ``compute_simpson_index`` on a 30-nearest-neighbor search and check the output shape."""
+    X, labels = dummy_x_labels()
+    # The pairwise ``cdist`` call was dropped: its result was overwritten by the
+    # kNN distances below before ever being used.
+    nbrs = NearestNeighbors(n_neighbors=30, algorithm="kd_tree").fit(X)
+    D, knn_idx = nbrs.kneighbors(X)
+    simpson = scib_metrics.utils.compute_simpson_index(
+        jnp.array(D), jnp.array(knn_idx), jnp.array(labels), len(np.unique(labels))
+    )
+    # One Simpson index per cell.
+    assert simpson.shape == (X.shape[0],)
+
+
+def test_lisi_knn():
+    """Check per-cell LISI against the harmonypy reference implementation."""
+    X, labels = dummy_x_labels()
+    pairwise = csr_matrix(scib_metrics.utils.cdist(X, X))
+    neighbor_model = NearestNeighbors(n_neighbors=30, algorithm="kd_tree").fit(X)
+    # Weight the binary connectivity graph by the pairwise distances.
+    weighted_graph = neighbor_model.kneighbors_graph(X).multiply(pairwise)
+    observed = scib_metrics.lisi_knn(weighted_graph, labels, perplexity=10)
+    label_frame = pd.DataFrame(labels, columns=["labels"])
+    expected = harmonypy_lisi(X, label_frame, label_colnames=["labels"], perplexity=10)[:, 0]
+    assert np.allclose(observed, expected)
+
+
+def test_ilisi_clisi_knn():
+    """Smoke-test ``ilisi_knn`` and ``clisi_knn`` on a distance-weighted kNN graph."""
+    X, labels, batches = dummy_x_labels_batch()
+    pairwise = csr_matrix(scib_metrics.utils.cdist(X, X))
+    neighbor_model = NearestNeighbors(n_neighbors=30, algorithm="kd_tree").fit(X)
+    # Weight the binary connectivity graph by the pairwise distances.
+    weighted_graph = neighbor_model.kneighbors_graph(X).multiply(pairwise)
+    scib_metrics.ilisi_knn(weighted_graph, batches, perplexity=10)
+    scib_metrics.clisi_knn(weighted_graph, labels, perplexity=10)
+
+
 def test_nmi_ari_cluster_labels_kmeans():
     X, labels = dummy_x_labels()
     nmi, ari = scib_metrics.nmi_ari_cluster_labels_kmeans(X, labels)