Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f99e5ac
Add permutation test
maltekuehl Mar 18, 2025
66f228e
Fix test kwargs update
maltekuehl Mar 18, 2025
74e3ec1
Add n_jobs argument and change test to check for significance agreeme…
maltekuehl Mar 18, 2025
4fdd140
Merge branch 'scverse:main' into main
maltekuehl Mar 18, 2025
64b7114
Simplify and generalize compare_groups by adding most important permu…
maltekuehl Mar 18, 2025
af7ca5d
Merge branch 'main' of https://github.com/complextissue/pertpy
maltekuehl Mar 18, 2025
325f1db
Make permutation_test argument optional but raise warning if not prov…
maltekuehl Mar 18, 2025
25d1689
Merge branch 'main' into main
maltekuehl Mar 18, 2025
1b6c65e
Merge branch 'main' of https://github.com/complextissue/pertpy
maltekuehl Mar 18, 2025
3e81976
Make test case a bit stricter again for significant values, enable re…
maltekuehl Mar 18, 2025
14736b3
Remove unnecessary import
maltekuehl Mar 18, 2025
5873b87
Remove parallelization and return statistic and p-value everywhere
maltekuehl Mar 19, 2025
676b4f0
Remove parallelization and return statistic and p-value everywhere
maltekuehl Mar 19, 2025
442b603
Remove parallelization and return statistic and p-value everywhere
maltekuehl Mar 19, 2025
8ae69ce
Fix docstring and examples of permutation test
maltekuehl Apr 7, 2025
ebc30fb
Merge remote-tracking branch 'origin/remote' into dev
maltekuehl Sep 11, 2025
1336ddb
Simplify permutation test with callable only
maltekuehl Sep 11, 2025
95d7da4
Default on user facing function only
maltekuehl Sep 12, 2025
e2c53fb
Undo last commit
maltekuehl Sep 12, 2025
412ab3b
Merge branch 'main' of https://github.com/complextissue/pertpy
maltekuehl Sep 12, 2025
52d2d58
Actually revert
maltekuehl Sep 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions docs/usage/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ Pertpy provides utilities to conduct differential gene expression tests through
tools.EdgeR
tools.WilcoxonTest
tools.TTest
tools.PermutationTest
tools.Statsmodels
```

Expand Down Expand Up @@ -563,33 +564,33 @@ including cell line annotation, bulk RNA and protein expression data.

Available databases for cell line metadata:

- [The Cancer Dependency Map Project at Broad](https://depmap.org/portal/)
- [The Cancer Dependency Map Project at Sanger](https://depmap.sanger.ac.uk/)
- [Genomics of Drug Sensitivity in Cancer (GDSC)](https://www.cancerrxgene.org/)
- [The Cancer Dependency Map Project at Broad](https://depmap.org/portal/)
- [The Cancer Dependency Map Project at Sanger](https://depmap.sanger.ac.uk/)
- [Genomics of Drug Sensitivity in Cancer (GDSC)](https://www.cancerrxgene.org/)

### Compound

The Compound module enables the retrieval of various types of information related to compounds of interest, including the most common synonym, pubchemID and canonical SMILES.

Available databases for compound metadata:

- [PubChem](https://pubchem.ncbi.nlm.nih.gov/)
- [PubChem](https://pubchem.ncbi.nlm.nih.gov/)

### Mechanism of Action

This module aims to retrieve metadata of mechanism of action studies related to perturbagens of interest, depending on the molecular targets.

Available databases for mechanism of action metadata:

- [CLUE](https://clue.io/)
- [CLUE](https://clue.io/)

### Drug

This module allows for the retrieval of drug target information.

Available databases for drug metadata:

- [chembl](https://www.ebi.ac.uk/chembl/)
- [chembl](https://www.ebi.ac.uk/chembl/)

```{eval-rst}
.. autosummary::
Expand Down
2 changes: 2 additions & 0 deletions pertpy/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(self, *args, **kwargs):

# Extras list shared by the differential gene expression tools below; it is handed to every
# `lazy_import` call (Statsmodels additionally appends "statsmodels").
# NOTE(review): `lazy_import` is defined outside this view - presumably it defers the real import
# until first use and reports the listed extras when they are missing; confirm against its definition.
DE_EXTRAS = ["formulaic", "pydeseq2"]
EdgeR = lazy_import("pertpy.tools._differential_gene_expression", "EdgeR", DE_EXTRAS) # edgeR will be imported via rpy2
PermutationTest = lazy_import("pertpy.tools._differential_gene_expression", "PermutationTest", DE_EXTRAS)
PyDESeq2 = lazy_import("pertpy.tools._differential_gene_expression", "PyDESeq2", DE_EXTRAS)
Statsmodels = lazy_import("pertpy.tools._differential_gene_expression", "Statsmodels", DE_EXTRAS + ["statsmodels"])
TTest = lazy_import("pertpy.tools._differential_gene_expression", "TTest", DE_EXTRAS)
Expand All @@ -62,6 +63,7 @@ def __init__(self, *args, **kwargs):
"PyDESeq2",
"WilcoxonTest",
"TTest",
"PermutationTest",
"Statsmodels",
"DistanceTest",
"Distance",
Expand Down
5 changes: 3 additions & 2 deletions pertpy/tools/_differential_gene_expression/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from ._dge_comparison import DGEEVAL
from ._edger import EdgeR
from ._pydeseq2 import PyDESeq2
from ._simple_tests import SimpleComparisonBase, TTest, WilcoxonTest
from ._simple_tests import PermutationTest, SimpleComparisonBase, TTest, WilcoxonTest
from ._statsmodels import Statsmodels

__all__ = [
Expand All @@ -14,6 +14,7 @@
"SimpleComparisonBase",
"WilcoxonTest",
"TTest",
"PermutationTest",
]

AVAILABLE_METHODS = [Statsmodels, EdgeR, PyDESeq2, WilcoxonTest, TTest]
AVAILABLE_METHODS = [Statsmodels, EdgeR, PyDESeq2, WilcoxonTest, TTest, PermutationTest]
2 changes: 1 addition & 1 deletion pertpy/tools/_differential_gene_expression/_pydeseq2.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def fit(self, **kwargs) -> pd.DataFrame:
**kwargs: Keyword arguments specific to DeseqDataSet(), except for `n_cpus` which will use all available CPUs minus one if the argument is not passed.
"""
try:
usable_cpus = len(os.sched_getaffinity(0))
usable_cpus = len(os.sched_getaffinity(0)) # type: ignore # os.sched_getaffinity is not available on Windows and macOS
except AttributeError:
usable_cpus = os.cpu_count()

Expand Down
146 changes: 145 additions & 1 deletion pertpy/tools/_differential_gene_expression/_simple_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import scipy.stats
import statsmodels
from anndata import AnnData
from joblib import Parallel, delayed
from pandas.core.api import DataFrame as DataFrame
from scipy.sparse import diags, issparse
from tqdm.auto import tqdm
Expand Down Expand Up @@ -152,11 +153,154 @@ def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:


class TTest(SimpleComparisonBase):
    """Perform an unpaired or paired T-test."""

    @staticmethod
    def _test(x0: np.ndarray, x1: np.ndarray, paired: bool, **kwargs) -> float:
        """Return the p-value of a t-test comparing `x0` and `x1`.

        Args:
            x0: Array with baseline values.
            x1: Array with values to compare.
            paired: Whether the observations in `x0` and `x1` are paired.
            **kwargs: Keyword arguments forwarded to the selected scipy t-test.
        """
        # Related-samples test for paired data, independent-samples test otherwise.
        t_test = scipy.stats.ttest_rel if paired else scipy.stats.ttest_ind
        return t_test(x0, x1, **kwargs).pvalue


class PermutationTest(SimpleComparisonBase):
    """Perform a permutation test.

    The permutation test relies on another test (e.g. WilcoxonTest) to perform the actual comparison
    based on permuted data. The p-value is then calculated based on the distribution of the test
    statistic under the null hypothesis.

    For paired tests, each paired observation is permuted together and distributed randomly between
    the two groups. For unpaired tests, all observations are permuted independently.

    The null hypothesis for the unpaired test is that all observations come from the same underlying
    distribution and have been randomly assigned to one of the samples.

    The null hypothesis for the paired permutation test is that the observations within each pair are
    drawn from the same underlying distribution and that their assignment to a sample is random.
    """

    @staticmethod
    def _test(
        x0: np.ndarray,
        x1: np.ndarray,
        paired: bool,
        test: type["SimpleComparisonBase"] = WilcoxonTest,
        n_permutations: int = 100,
        seed: int = 0,
        **kwargs,
    ) -> float:
        """Perform a permutation test.

        Args:
            x0: Array with baseline values.
            x1: Array with values to compare.
            paired: Indicates whether to perform a paired test.
            test: The test whose `_test` return value is used as the permutation statistic.
            n_permutations: Number of permutations (resamples) to perform.
            seed: Seed for the random number generator, passed to scipy as `rng`.
            **kwargs: Keyword arguments forwarded to `scipy.stats.permutation_test`.

        Returns:
            The p-value reported by `scipy.stats.permutation_test`.
        """

        def call_test(data0, data1, **statistic_kwargs):
            """Evaluate the wrapped test on one (permuted) pair of samples."""
            return test._test(data0, data1, paired, **statistic_kwargs)

        # Paired data: swap observations within each pair ("samples").
        # Unpaired data: reassign observations freely between the two groups ("independent").
        return scipy.stats.permutation_test(
            [x0, x1],
            statistic=call_test,
            n_resamples=n_permutations,
            permutation_type="samples" if paired else "independent",
            rng=seed,
            **kwargs,
        ).pvalue

    @classmethod
    def compare_groups(
        cls,
        adata: AnnData,
        column: str,
        baseline: str,
        groups_to_compare: str | Sequence[str],
        test: type["SimpleComparisonBase"] = WilcoxonTest,
        n_permutations: int = 100,
        n_jobs: int = -1,
        seed: int = 0,
        *,
        paired_by: str | None = None,
        mask: str | None = None,
        layer: str | None = None,
        fit_kwargs: Mapping = MappingProxyType({}),
        test_kwargs: Mapping = MappingProxyType({}),
    ) -> DataFrame:
        """Perform a comparison between groups using a permutation test.

        Args:
            adata: Annotated data object.
            column: Column in `adata.obs` that contains the groups to compare.
            baseline: Reference group.
            groups_to_compare: Groups to compare against the baseline. If `None`, every other
                group found in `column` is compared against the baseline.
            test: The test to use for the actual comparison after permutation. Default is WilcoxonTest.
            n_permutations: Number of permutations to perform.
            n_jobs: Number of parallel jobs to use; one job is scheduled per compared group.
            seed: Seed for the random number generator used to draw the permutations.
            paired_by: Column in `adata.obs` to use for pairing.
            mask: Mask to apply to the data.
            layer: Layer to use for the comparison.
            fit_kwargs: Not used for simple tests; a warning is raised if it is non-empty.
            test_kwargs: Additional kwargs forwarded with each comparison. The `test`,
                `n_permutations` and `seed` arguments override entries with the same key.

        Returns:
            The concatenated per-comparison results after FDR correction, with a `comparison`
            column of the form `<group>_vs_<baseline>`.

        Raises:
            ValueError: If pairing is requested and a pair does not consist of exactly one
                observation in each of the two compared groups.
        """
        if fit_kwargs:
            warnings.warn("fit_kwargs not used for simple tests.", UserWarning, stacklevel=2)
        paired = paired_by is not None
        model = cls(adata, mask=mask, layer=layer)
        if groups_to_compare is None:
            # compare against all other
            groups_to_compare = sorted(set(model.adata.obs[column]) - {baseline})
        elif isinstance(groups_to_compare, str):
            groups_to_compare = [groups_to_compare]
        else:
            # Materialize once: the groups are iterated twice below (indices, then labels).
            groups_to_compare = list(groups_to_compare)

        # The pair-membership matrix does not depend on the group, so build and validate it once.
        pair_dummies = None
        if paired:
            pair_dummies = pd.get_dummies(model.adata.obs[paired_by], sparse=True).sparse.to_coo().tocsr()
            if not np.all(np.sum(pair_dummies, axis=0) == 2):
                raise ValueError("Pairing is only possible with exactly two values per group")

        def _get_idx(column, value):
            """Return the row indices of observations with `column == value` (ordered by pair if paired)."""
            in_group = model.adata.obs[column] == value
            if not paired:
                return np.where(in_group)[0]
            # Use matrix multiplication to only retrieve those dummy entries that are associated with the current `value`.
            # Convert to COO matrix to get rows/cols
            # row indices refers to the indices of rows that have `column == value` (equivalent to np.where(mask)[0])
            # col indices refers to the numeric index of each "pair" in obs_names
            ind_mat = diags(in_group.values, dtype=bool) @ pair_dummies
            if not np.all(np.sum(ind_mat, axis=0) == 1):
                raise ValueError("Pairing is only possible with exactly two values per group")
            ind_mat = ind_mat.tocoo()
            # Sorting by pair index aligns the observations of different groups pair by pair.
            return ind_mat.row[np.argsort(ind_mat.col)]

        # Explicit arguments take precedence over same-named entries in `test_kwargs`.
        test_kwargs_mutable = dict(test_kwargs)
        test_kwargs_mutable.update({"test": test, "n_permutations": n_permutations, "seed": seed})

        baseline_idx = _get_idx(column, baseline)
        comparison_indices = [_get_idx(column, group_to_compare) for group_to_compare in groups_to_compare]
        res_dfs = Parallel(n_jobs=n_jobs)(
            delayed(model._compare_single_group)(baseline_idx, comparison_idx, paired=paired, **test_kwargs_mutable)
            for comparison_idx in comparison_indices
        )

        baseline_label = baseline if baseline is not None else "rest"
        res_dfs = [
            df.assign(comparison=f"{group_to_compare}_vs_{baseline_label}")
            for df, group_to_compare in zip(res_dfs, groups_to_compare, strict=True)
        ]
        return fdr_correction(pd.concat(res_dfs))
40 changes: 39 additions & 1 deletion tests/tools/_differential_gene_expression/test_simple_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd
import pytest
from pandas.core.api import DataFrame as DataFrame
from pertpy.tools._differential_gene_expression import SimpleComparisonBase, TTest, WilcoxonTest
from pertpy.tools._differential_gene_expression import PermutationTest, SimpleComparisonBase, TTest, WilcoxonTest


@pytest.mark.parametrize(
Expand Down Expand Up @@ -61,6 +61,44 @@ def test_t(test_adata_minimal, paired_by, expected):
assert actual[gene] == pytest.approx(expected[gene], abs=0.02)


@pytest.mark.parametrize("test", [TTest, WilcoxonTest])
@pytest.mark.parametrize(
    "paired_by,expected",
    [
        pytest.param(
            None,
            {"gene1": {"p_value": 2.13e-26, "log_fc": -5.14}, "gene2": {"p_value": 0.96, "log_fc": -0.016}},
            id="unpaired",
        ),
        pytest.param(
            "pairing",
            {"gene1": {"p_value": 1.63e-26, "log_fc": -5.14}, "gene2": {"p_value": 0.85, "log_fc": -0.016}},
            id="paired",
        ),
    ],
)
def test_permutation(test_adata_minimal, paired_by, expected, test):
    """Test that the permutation test agrees with the reference on significance and fold change.

    Reference values have been computed in R using wilcox.test. A permutation test with 100
    resamples cannot resolve p-values as small as the reference ones, so the p-values are only
    required to fall on the same side of the 0.05 significance threshold; the log fold change
    is compared numerically.
    """
    alpha = 0.05
    res_df = PermutationTest.compare_groups(
        adata=test_adata_minimal,
        column="condition",
        baseline="A",
        groups_to_compare="B",
        paired_by=paired_by,
        n_permutations=100,
        test=test,
        seed=0,
    )
    assert isinstance(res_df, DataFrame), "PermutationTest.compare_groups should return a DataFrame"
    actual = res_df.loc[:, ["variable", "p_value", "log_fc"]].set_index("variable").to_dict(orient="index")
    for gene, reference in expected.items():
        assert (reference["p_value"] < alpha) == (actual[gene]["p_value"] < alpha)
        assert actual[gene]["log_fc"] == pytest.approx(reference["log_fc"], abs=0.02)


@pytest.mark.parametrize("seed", range(10))
def test_simple_comparison_pairing(test_adata_minimal, seed):
"""Test that paired samples are properly matched in a paired test"""
Expand Down
Loading