Merge pull request #21 from icbi-lab/copykat

grst · web-flow · commit b86c81256d95 · 2021-09-13T13:54:17.000+02:00
copyKAT function
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -8,37 +8,62 @@ on:
 
 jobs:
   test:
-    runs-on: ${{ matrix.os }}
+    runs-on: ${{ matrix.os.os }}
+    name: ${{ matrix.os.os }} (R=${{ matrix.R }}, Python=${{ matrix.python-version }})
     strategy:
       fail-fast: false
       matrix:
         python-version: [3.8]
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        R: ['release']
+        os:
+         - {os: ubuntu-latest, rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"}
+         - {os: macos-latest}
+         - {os: windows-latest}
+
+    env:
+      R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+      RSPM: ${{ matrix.os.rspm }}
 
     steps:
       - uses: actions/checkout@v2
         with:
           fetch-depth: 0 # required for setuptools-scm
+
       - uses: actions/cache@v1
         with:
           path: ~/.cache/pip
           key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
           restore-keys: |
             ${{ runner.os }}-pip-
+
       - name: Install macOS system dependencies
-        if: matrix.os == 'macos-latest'
+        if: matrix.os.os == 'macos-latest'  # matrix.os is an object ({os, rspm})
         run: |
           brew install cairo pkg-config autoconf automake libtool
+
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v1
         with:
           python-version: ${{ matrix.python-version }}
+
+      - name: Setup R
+        uses: r-lib/actions/setup-r@v1
+        with:
+          r-version: ${{ matrix.R }}
+
       - name: Install dependencies
         run: |
-          pip install .[test]
+          pip install .[test,copykat]
+
+      - name: Install R dependencies
+        run: |
+          Rscript -e "install.packages('remotes')" -e "remotes::install_github('navinlabcode/copykat')"
+
       - name: Check black formatting
         run: |
           black --check .
+
       - name: Test with pytest
         run: |
           pytest
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+test_copykat*
 *.code-workspace
 .vscode/*
 !.vscode/settings.json.default
diff --git a/README.rst b/README.rst
@@ -29,10 +29,10 @@ but plays nicely with `scanpy <https://scanpy.readthedocs.io/en/stable/index.htm
 .. image:: img/infercnv_heatmap.png
     :align: center
     :alt: The main result of infercnv
-    
-    
+
+
 **WARNING**:
-    
+
 **This package is still experimental. The results have not been validated,
 except in that they look similar, but not identical, to the results of InferCNV.**
 
@@ -84,6 +84,19 @@ There are several alternative options to install infercnvpy:
 .. where `tag` is one of `these tags <https://quay.io/repository/biocontainers/infercnvpy?tab=tags>`_.
 
 
+To (optionally) run the :code:`copyKAT` algorithm, you need a working R installation
+and the `copykat <https://github.com/navinlabcode/copykat#step-1-installation>`_ package
+installed. Usually, if :code:`R` is in your :code:`PATH`, `rpy2 <https://rpy2.github.io/>`_ automatically
+detects your R installation. If you get an error message while importing :code:`infercnvpy`,
+try setting the :code:`R_HOME` environment variable before importing infercnvpy:
+
+.. code-block:: python
+
+   import os
+   os.environ["R_HOME"] = "/usr/lib/R"
+   import infercnvpy
+
+
 Release notes
 ^^^^^^^^^^^^^
 See the `release section <https://github.com/icbi-lab/infercnvpy/releases>`_.
diff --git a/docs/api.rst b/docs/api.rst
@@ -55,6 +55,7 @@ InferCNV
    :toctree: ./generated
 
    infercnv
+   copykat
    cnv_score
 
 Embeddings
diff --git a/docs/references.bib b/docs/references.bib
@@ -23,3 +23,15 @@ @article{Tirosh2016
   month   = {11}
 }
 
+@article{Gao2021,
+  doi       = {10.1038/s41587-020-00795-2},
+  url       = {https://doi.org/10.1038/s41587-020-00795-2},
+  year      = {2021},
+  month     = jan,
+  publisher = {Nature},
+  volume    = {39},
+  pages     = {599--608},
+  author    = {Gao, R. and Bai, S. and Henderson, Y. C. and Lin, Y. and Schalck, A. and Yan, Y. and Kumar, T. and Hu, M. and Sei, E. and Davis, A. and Wang, F. and Shaitelman, S. F. and Wang, J. R. and Chen, K. and Moulder, S. and Lai, S. Y. and Navin, N. E.},
+  title     = {Delineating copy number and clonal substructure in human tumors from single-cell transcriptomes},
+  journal   = {Nature Biotechnology}
+}
diff --git a/infercnvpy/tests/test_tools.py b/infercnvpy/tests/test_tools.py
@@ -3,6 +3,7 @@
 from infercnvpy.tl._infercnv import _get_reference
 import pytest
 import numpy as np
+import scanpy as sc
 import numpy.testing as npt
 
 
@@ -50,6 +51,11 @@ def test_infercnv(adata_oligodendroma, reference_key, reference_cat):
     )
 
 
+def test_copykat(adata_oligodendroma):
+    sc.pp.subsample(adata_oligodendroma, n_obs=50)  # presumably to keep the run short
+    cnv.tl.copykat(adata_oligodendroma)  # smoke test: only checks that it runs
+
+
 def test_workflow(adata_oligodendroma):
     cnv.tl.infercnv(adata_oligodendroma)
     cnv.tl.pca(adata_oligodendroma)
diff --git a/infercnvpy/tl/__init__.py b/infercnvpy/tl/__init__.py
@@ -1,5 +1,6 @@
 from typing import Union
 from ._infercnv import infercnv, cnv_score
+from ._copykat import copykat
 import numpy as np
 from anndata import AnnData
 import scanpy as sc
diff --git a/infercnvpy/tl/_copykat.py b/infercnvpy/tl/_copykat.py
@@ -0,0 +1,148 @@
+from typing import Optional
+import pandas as pd
+from scipy.sparse import issparse
+from anndata import AnnData
+from scanpy import logging
+import os
+from multiprocessing import cpu_count
+
+
+def copykat(
+    adata: AnnData,
+    gene_ids: str = "S",
+    segmentation_cut: float = 0.1,
+    distance: str = "euclidean",
+    s_name: str = "copykat_result",
+    min_genes_chr: int = 5,
+    key_added: str = "cnv",
+    inplace: bool = True,
+    layer: Optional[str] = None,
+    n_jobs: Optional[int] = None,
+) -> pd.DataFrame:
+    """Inference of genomic copy number and subclonal structure.
+
+    Runs CopyKAT (Copynumber Karyotyping of Tumors) :cite:`Gao2021` based on integrative
+    Bayesian approaches to identify genome-wide aneuploidy at 5MB resolution
+    in single cells to separate tumor cells from normal cells, and tumor
+    subclones using high-throughput sc-RNAseq data.
+
+    Note on input data from the original authors:
+
+        The matrix values are often the count of unique molecular identifier (UMI)
+        from nowadays high throughput single cell RNAseq data. The early generation of
+        scRNAseq data may be summarized as TPM values or total read counts,
+        which should also work.
+
+    This means that unlike for :func:`infercnvpy.tl.infercnv` the input data
+    should not be log-transformed.
+
+    CopyKAT also does NOT require running :func:`infercnvpy.io.genomic_position_from_gtf`,
+    it infers the genomic position from the gene symbols in `adata.var_names`.
+
+    You can find more info on GitHub: https://github.com/navinlabcode/copykat
+
+    Parameters
+    ----------
+    adata
+        annotated data matrix
+    key_added
+        Key under which the copyKAT scores will be stored in `adata.obsm` and `adata.uns`.
+    inplace
+        If True, store the result in adata, otherwise return it.
+    gene_ids
+        gene id type: Symbol ("S") or Ensembl ("E").
+    segmentation_cut
+        segmentation parameters, input 0 to 1; larger looser criteria.
+    distance
+        distance methods include "euclidean", and correlation converted distance include "pearson" and "spearman".
+    s_name
+        sample (output file) name.
+    min_genes_chr
+        minimal number of genes per chromosome for cell filtering.
+    layer
+        Layer of `adata` to use as input. If `None`, `adata.X` is used.
+    n_jobs
+        Number of cores to use for copyKAT analysis. Per default, uses all cores
+        available on the system. Multithreading does not work on Windows and this
+        value will be ignored.
+
+    Returns
+    -------
+    Depending on the value of `inplace`, either returns `None` or the copyKAT scores
+    as a NumPy array with one row per cell (not a DataFrame, despite the annotation).
+    """
+
+    if n_jobs is None:
+        n_jobs = cpu_count()
+    # Multithreading does not work outside POSIX systems (e.g. Windows): use one core.
+    if os.name != "posix":
+        n_jobs = 1
+
+    try:
+        from rpy2.robjects.packages import importr
+        from rpy2.robjects import pandas2ri, numpy2ri
+        from rpy2.robjects.conversion import localconverter
+        from rpy2 import robjects as ro
+    except ImportError:
+        raise ImportError("copyKAT requires rpy2 to be installed. ")
+
+    try:
+        copyKAT = importr("copykat")
+    except ImportError:
+        raise ImportError(
+            "copyKAT requires a valid R installation with the following packages: "
+            "copykat"
+        )
+
+    logging.info("Preparing R objects")
+    with localconverter(ro.default_converter + numpy2ri.converter):
+        expr = adata.X if layer is None else adata.layers[layer]
+        # copyKAT expects genes in rows and cells in columns, hence the transpose.
+        if issparse(expr):
+            expr = expr.T.toarray()
+        else:
+            expr = expr.T
+        ro.globalenv["expr_r"] = ro.conversion.py2rpy(expr)
+    ro.globalenv["gene_names"] = ro.conversion.py2rpy(list(adata.var.index))
+    ro.globalenv["cell_IDs"] = ro.conversion.py2rpy(list(adata.obs.index))
+    ro.globalenv["n_jobs"] = ro.conversion.py2rpy(n_jobs)
+    ro.globalenv["gene_ids"] = ro.conversion.py2rpy(gene_ids)
+    ro.globalenv["segmentation_cut"] = ro.conversion.py2rpy(segmentation_cut)
+    ro.globalenv["distance"] = ro.conversion.py2rpy(distance)
+    ro.globalenv["s_name"] = ro.conversion.py2rpy(s_name)
+    ro.globalenv["min_gene_chr"] = ro.conversion.py2rpy(min_genes_chr)
+
+    logging.info("Running copyKAT")
+    ro.r(
+        """
+        rownames(expr_r) <- gene_names
+        colnames(expr_r) <- cell_IDs
+        copyKAT_run <- copykat(rawmat = expr_r, id.type = gene_ids, ngene.chr = min_gene_chr, win.size = 25, 
+                                KS.cut = segmentation_cut, sam.name = s_name, distance = distance, norm.cell.names = "", 
+                                n.cores = n_jobs, output.seg = FALSE)
+        copyKAT_result <- copyKAT_run$CNAmat
+        colnames(copyKAT_result) <- c('chrom', 'chrompos', 'abspos', cell_IDs)
+        """
+    )
+
+    with localconverter(
+        ro.default_converter + numpy2ri.converter + pandas2ri.converter
+    ):
+        copyKAT_result = ro.conversion.rpy2py(ro.globalenv["copyKAT_result"])
+
+    # Map each chromosome to the index label of its first row in the copyKAT result.
+    # NOTE(review): labels stem from the R data.frame; confirm 0- vs 1-based offset.
+    first_rows = copyKAT_result.loc[:, ["chrom"]].drop_duplicates()
+    chrom_pos = {
+        "chr_pos": {f"chr{chrom}": int(pos) for pos, chrom in first_rows.itertuples()}
+    }
+
+    # Drop the position columns and transpose so that each row is one cell.
+    new_cpkat = copyKAT_result.drop(["chrom", "chrompos", "abspos"], axis=1).values
+    new_cpkat_trans = new_cpkat.T
+
+    if inplace:
+        adata.uns[key_added] = chrom_pos
+        adata.obsm["X_%s" % key_added] = new_cpkat_trans
+    else:
+        return new_cpkat_trans
diff --git a/pyproject.toml b/pyproject.toml
@@ -36,9 +36,13 @@ requires = [
 ]
 
 [tool.flit.metadata.requires-extra]
+copykat = [
+    'rpy2'
+]
 test = [
     'pytest',
-    'black'
+    'black',
+    'pre-commit',
 ]
 doc = [
     'sphinx>=3.0.1,<3.1',

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+test_copykat*`
`1`	`2`	`*.code-workspace`
`2`	`3`	`.vscode/*`
`3`	`4`	`!.vscode/settings.json.default`
Original file line number	Diff line number	Diff line change
`@@ -36,9 +36,13 @@ requires = [`
`36`	`36`	`]`
`37`	`37`
`38`	`38`	`[tool.flit.metadata.requires-extra]`
	`39`	`+copykat = [`
	`40`	`+ 'rpy2'`
	`41`	`+]`
`39`	`42`	`test = [`
`40`	`43`	`'pytest',`
`41`		`- 'black'`
	`44`	`+ 'black',`
	`45`	`+ 'pre-commit',`
`42`	`46`	`]`
`43`	`47`	`doc = [`
`44`	`48`	`'sphinx>=3.0.1,<3.1',`