Merge branch 'main' into source_targets_enh

PauBadiaM · web-flow · commit c31ddc07bb26 · 2025-10-26T21:41:23.000-07:00
diff --git a/src/decoupler/_docs.py b/src/decoupler/_docs.py
@@ -194,6 +194,14 @@
 tval
     Whether to return the t-value (``tval=True``) the coefficient of the fitted model (``tval=False``)."""
 
+_n_bg = """\
+n_bg
+    Number indicating the background size."""
+
+_ha_corr = """\
+ha_corr
+    Haldane-Anscombe correction of odds ratio."""
+
 _params = f"""\
 Parameters
 ----------
@@ -252,4 +260,6 @@
     notest=_notest,
     returns=_returns,
     tval=_tval,
+    n_bg=_n_bg,
+    ha_corr=_ha_corr,
 )
diff --git a/src/decoupler/mt/__init__.py b/src/decoupler/mt/__init__.py
@@ -2,6 +2,7 @@
 from decoupler.mt._consensus import consensus
 from decoupler.mt._decouple import decouple
 from decoupler.mt._methods import _methods, aucell, gsea, gsva, mdt, mlm, ora, udt, ulm, viper, waggr, zscore
+from decoupler.mt._query_set import query_set
 
 
 def show() -> None:
diff --git a/src/decoupler/mt/_ora.py b/src/decoupler/mt/_ora.py
@@ -234,8 +234,8 @@ def _func_ora(
         If ``None``, the top 5% of positive features are selected.
     n_bm
         Number of bottom-ranked features, based on their magnitude, to select as observed features.
-    n_bg
-        Number indicating the background size.
+    %(n_bg)s
+    %(ha_corr)s
 
     %(returns)s
 
diff --git a/src/decoupler/mt/_query_set.py b/src/decoupler/mt/_query_set.py
@@ -0,0 +1,83 @@
+import pandas as pd
+import scipy.stats as sts
+from tqdm.auto import tqdm
+
+from decoupler._docs import docs
+from decoupler._log import _log
+from decoupler.mt._ora import _oddsr, _test1t
+from decoupler.pp.net import prune
+
+
+@docs.dedent
+def query_set(
+    features: list,
+    net: pd.DataFrame,
+    n_bg: int | float | None = 20_000,
+    ha_corr: int | float = 0.5,
+    tmin: int | float = 5,
+    verbose: bool = False,
+) -> pd.DataFrame:
+    """
+    Test overlap between a given feature set against a database of sets.
+
+    Parameters
+    ----------
+    features
+        Set of features, given as any non-string iterable (list, set, tuple, ...).
+    %(net)s
+    %(n_bg)s
+    %(ha_corr)s
+    %(tmin)s
+    %(verbose)s
+
+    Returns
+    -------
+    Dataframe with one row per set (``source``) holding the odds ratio (``stat``), the fisher exact test
+    p-value (``pval``) and its Benjamini-Hochberg adjustment (``padj``), sorted by ``padj`` then ``pval``.
+
+    Example
+    -------
+    .. code-block:: python
+
+        import decoupler as dc
+
+        ct = dc.op.collectri()
+        ft = set(ct[ct["source"] == "SMAD4"]["target"])
+        dc.mt.query_set(features=ft, net=ct)
+    """
+    # Validate
+    assert hasattr(features, "__iter__") and not isinstance(features, str | bytes), (
+        "features must be an iterable collection of items such as a list"
+    )
+    features_set: set = set(features)
+    if n_bg is None:
+        n_bg = 0
+        m = "query_set - not using n_bg, a feature specific background will be used instead"
+        _log(m, level="info", verbose=verbose)
+    assert isinstance(n_bg, int | float) and n_bg >= 0, "n_bg must be numeric and >= 0"
+    # Prune
+    net = prune(features=None, net=net, tmin=tmin, verbose=verbose)
+    # Group targets per source in one pass; dict order matches order of first appearance
+    targets_by_source: dict = {}
+    for source, target in zip(net["source"], net["target"], strict=True):
+        targets_by_source.setdefault(source, set()).add(target)
+    all_targets = set(net["target"])  # universe used when n_bg is 0 (loop invariant)
+    # Test each set against given set
+    df = []
+    for source, targets in tqdm(targets_by_source.items(), disable=not verbose):
+        # 2x2 contingency table: a = shared, b = set only, c = features only, d = rest of background
+        a = len(features_set & targets)
+        b = len(targets) - a
+        c = len(features_set) - a
+        if n_bg == 0:
+            d = len(all_targets - targets - features_set)
+        else:
+            # Clamp so a background smaller than the union cannot yield a negative count
+            d = max(int(n_bg - a - b - c), 0)
+        od = _oddsr(a=a, b=b, c=c, d=d, ha_corr=ha_corr, log=True)
+        pv = _test1t(a=a, b=b, c=c, d=d)
+        df.append([source, od, pv])
+    df = pd.DataFrame(df, columns=["source", "stat", "pval"])
+    df["padj"] = sts.false_discovery_control(df["pval"], method="bh")
+    df = df.sort_values(["padj", "pval"]).reset_index(drop=True)
+    return df
diff --git a/src/decoupler/pp/net.py b/src/decoupler/pp/net.py
@@ -79,7 +79,7 @@ def _validate_net(
 
 @docs.dedent
 def prune(
-    features: np.ndarray,
+    features: np.ndarray | None,
     net: pd.DataFrame,
     tmin: int = 5,
     verbose: bool = False,
@@ -110,11 +110,12 @@ def prune(
     """
     # Validate
     vnet = _validate_net(net, verbose=verbose)
-    features_set = set(features)
     assert isinstance(tmin, int | float) and tmin >= 0, "tmin must be numeric and >= 0"
     # Find shared targets between mat and net
-    msk = vnet["target"].isin(features_set)
-    vnet = vnet.loc[msk]
+    if features is not None:
+        features_set = set(features)
+        msk = vnet["target"].isin(features_set)
+        vnet = vnet.loc[msk]
     # Find unique sources with tmin
     sources = vnet["source"].value_counts()
     sources = set(sources[sources >= tmin].index)
diff --git a/tests/mt/test_query_set.py b/tests/mt/test_query_set.py
@@ -0,0 +1,14 @@
+import pandas as pd
+
+import decoupler as dc
+
+
+def test_query_set(net):
+    ft = set(net[net["source"] == "T1"]["target"])
+    cols = {"source", "stat", "pval", "padj"}
+    # Exercise both the default fixed background and the net-derived one (n_bg=None)
+    for kwargs in ({}, {"n_bg": None}):
+        df = dc.mt.query_set(features=ft, net=net, tmin=0, **kwargs)
+        assert isinstance(df, pd.DataFrame)
+        assert cols.issubset(df.columns)
+        assert df["padj"].is_monotonic_increasing  # rows come back sorted by padj