diff --git a/src/decoupler/_docs.py b/src/decoupler/_docs.py
index 791dd09..525b228 100644
--- a/src/decoupler/_docs.py
+++ b/src/decoupler/_docs.py
@@ -194,6 +194,14 @@
 tval
     Whether to return the t-value (``tval=True``) the coefficient of the fitted model (``tval=False``)."""
 
+_n_bg = """\
+n_bg
+    Number indicating the background size."""
+
+_ha_corr = """\
+ha_corr
+    Haldane-Anscombe correction of odds ratio."""
+
 _params = f"""\
 Parameters
 ----------
@@ -252,4 +260,6 @@
     notest=_notest,
     returns=_returns,
     tval=_tval,
+    n_bg=_n_bg,
+    ha_corr=_ha_corr,
 )
diff --git a/src/decoupler/mt/__init__.py b/src/decoupler/mt/__init__.py
index e5da4c7..5d2ea45 100644
--- a/src/decoupler/mt/__init__.py
+++ b/src/decoupler/mt/__init__.py
@@ -2,6 +2,7 @@
 from decoupler.mt._consensus import consensus
 from decoupler.mt._decouple import decouple
 from decoupler.mt._methods import _methods, aucell, gsea, gsva, mdt, mlm, ora, udt, ulm, viper, waggr, zscore
+from decoupler.mt._query_set import query_set
 
 
 def show() -> None:
diff --git a/src/decoupler/mt/_ora.py b/src/decoupler/mt/_ora.py
index a405f01..58dc93b 100644
--- a/src/decoupler/mt/_ora.py
+++ b/src/decoupler/mt/_ora.py
@@ -234,8 +234,8 @@ def _func_ora(
         If ``None``, the top 5% of positive features are selected.
     n_bm
         Number of bottom-ranked features, based on their magnitude, to select as observed features.
-    n_bg
-        Number indicating the background size.
+    %(n_bg)s
+    %(ha_corr)s
 
     %(returns)s
 
diff --git a/src/decoupler/mt/_query_set.py b/src/decoupler/mt/_query_set.py
new file mode 100644
index 0000000..03b5a73
--- /dev/null
+++ b/src/decoupler/mt/_query_set.py
@@ -0,0 +1,92 @@
+from collections.abc import Iterable
+
+import pandas as pd
+import scipy.stats as sts
+from tqdm.auto import tqdm
+
+from decoupler._docs import docs
+from decoupler._log import _log
+from decoupler.mt._ora import _oddsr, _test1t
+from decoupler.pp.net import prune
+
+
+@docs.dedent
+def query_set(
+    features: Iterable,
+    net: pd.DataFrame,
+    n_bg: int | float | None = 20_000,
+    ha_corr: int | float = 0.5,
+    tmin: int | float = 5,
+    verbose: bool = False,
+) -> pd.DataFrame:
+    """
+    Test the overlap between a given feature set and each set in a database of sets.
+
+    If ``n_bg`` is ``None``, the background is built from all targets left in ``net`` after pruning
+    instead of from a fixed background size.
+
+    Parameters
+    ----------
+    features
+        Collection of features to query, such as a list or a set.
+    %(net)s
+    %(n_bg)s
+    %(ha_corr)s
+    %(tmin)s
+    %(verbose)s
+
+    Returns
+    -------
+    Dataframe with one row per source in ``net`` holding the odds ratio statistic (``stat``), the fisher
+    exact test p-value (``pval``) and its Benjamini-Hochberg adjusted value (``padj``) for the overlap
+    with the given features, sorted by ``padj`` and then ``pval``.
+
+    Example
+    -------
+    .. code-block:: python
+
+        import decoupler as dc
+
+        ct = dc.op.collectri()
+        ft = set(ct[ct["source"] == "SMAD4"]["target"])
+        dc.mt.query_set(features=ft, net=ct)
+    """
+    # Validate
+    assert hasattr(features, "__iter__") and not isinstance(features, str | bytes), (
+        "features must be an iterable collection of items such as a list"
+    )
+    features_set: set = set(features)
+    if n_bg is None:
+        n_bg = 0
+        m = "query_set - not using n_bg, a feature specific background will be used instead"
+        _log(m, level="info", verbose=verbose)
+    assert isinstance(n_bg, int | float) and n_bg >= 0, "n_bg must be numeric and >= 0"
+    # Prune
+    net = prune(features=None, net=net, tmin=tmin, verbose=verbose)
+    # Collect the targets of every source in one pass instead of masking the whole net per source
+    grouped = net.groupby("source", sort=False, observed=True)["target"]
+    targets_by_source = {source: set(targets) for source, targets in grouped}
+    # Loop invariant, only needed when the background is derived from the net itself
+    all_targets = set(net["target"]) if n_bg == 0 else None
+    # Test each set against given set
+    rows = []
+    for source, targets in tqdm(targets_by_source.items(), disable=not verbose):
+        a = len(features_set & targets)
+        b = len(targets - features_set)
+        c = len(features_set - targets)
+        if all_targets is not None:
+            # Targets of the net that belong neither to this set nor to the queried features
+            d = len(all_targets - targets - features_set)
+        else:
+            # NOTE(review): d is negative when n_bg is smaller than a + b + c, confirm _oddsr and _test1t cope
+            d = int(n_bg - a - b - c)
+        od = _oddsr(a=a, b=b, c=c, d=d, ha_corr=ha_corr, log=True)
+        pv = _test1t(a=a, b=b, c=c, d=d)
+        rows.append([source, od, pv])
+    if not rows:
+        # No set left to test, so there are no p-values to correct
+        return pd.DataFrame(columns=["source", "stat", "pval", "padj"])
+    df = pd.DataFrame(rows, columns=["source", "stat", "pval"])
+    df["padj"] = sts.false_discovery_control(df["pval"], method="bh")
+    df = df.sort_values(["padj", "pval"]).reset_index(drop=True)
+    return df
diff --git a/src/decoupler/pp/net.py b/src/decoupler/pp/net.py
index 1ff4b8e..0937112 100644
--- a/src/decoupler/pp/net.py
+++ b/src/decoupler/pp/net.py
@@ -79,7 +79,7 @@ def _validate_net(
 
 @docs.dedent
 def prune(
-    features: np.ndarray,
+    features: np.ndarray | None,
     net: pd.DataFrame,
     tmin: int = 5,
     verbose: bool = False,
@@ -110,11 +110,12 @@ def prune(
     """
     # Validate
     vnet = _validate_net(net, verbose=verbose)
-    features_set = set(features)
     assert isinstance(tmin, int | float) and tmin >= 0, "tmin must be numeric and >= 0"
     # Find shared targets between mat and net
-    msk = vnet["target"].isin(features_set)
-    vnet = vnet.loc[msk]
+    if features is not None:
+        features_set = set(features)
+        msk = vnet["target"].isin(features_set)
+        vnet = vnet.loc[msk]
     # Find unique sources with tmin
     sources = vnet["source"].value_counts()
     sources = set(sources[sources >= tmin].index)
diff --git a/tests/mt/test_query_set.py b/tests/mt/test_query_set.py
new file mode 100644
index 0000000..cb99691
--- /dev/null
+++ b/tests/mt/test_query_set.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+import decoupler as dc
+
+
+def test_query_set(
+    net,
+):
+    ft = set(net[net["source"] == "T1"]["target"])
+    cols = {"source", "stat", "pval", "padj"}
+    # A fixed background size and a background derived from the net must both return the full table
+    for n_bg in [20_000, None]:
+        df = dc.mt.query_set(features=ft, net=net, n_bg=n_bg, tmin=0)
+        assert isinstance(df, pd.DataFrame)
+        assert cols.issubset(df.columns)
+        assert df["padj"].is_monotonic_increasing