|
| 1 | +import pandas as pd |
| 2 | +import scipy.stats as sts |
| 3 | +from tqdm.auto import tqdm |
| 4 | + |
| 5 | +from decoupler._docs import docs |
| 6 | +from decoupler._log import _log |
| 7 | +from decoupler.mt._ora import _oddsr, _test1t |
| 8 | +from decoupler.pp.net import prune |
| 9 | + |
| 10 | + |
@docs.dedent
def query_set(
    features: list,
    net: pd.DataFrame,
    n_bg: int | float | None = 20_000,
    ha_corr: int | float = 0.5,
    tmin: int | float = 5,
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Test overlap between a given feature set against a database of sets.

    Parameters
    ----------
    features
        Set of features
    %(net)s
    %(n_bg)s
    %(ha_corr)s
    %(tmin)s
    %(verbose)s

    Returns
    -------
    Dataframe containing the odds ratio and fisher exact test p-values for the overlap of the given
    features across sets in a network. It has one row per source with columns ``source``, ``stat``,
    ``pval`` and ``padj`` (Benjamini-Hochberg adjusted ``pval``), sorted by ``padj`` and then ``pval``.

    Example
    -------
    .. code-block:: python

        import decoupler as dc

        ct = dc.op.collectri()
        fset = set(ct[ct["source"] == "SMAD4"]["target"])
        dc.pp.query_set(features=fset, net=ct)
    """
    # Validate
    assert hasattr(features, "__iter__") and not isinstance(features, str | bytes), (
        "features must be an iterable collection of items such as a list"
    )
    features_set: set = set(features)
    if n_bg is None:
        # 0 is the internal marker for "derive the background from the network itself"
        n_bg = 0
        m = "query_set - not using n_bg, a feature specific background will be used instead"
        _log(m, level="info", verbose=verbose)
    assert isinstance(n_bg, int | float) and n_bg >= 0, "n_bg must be numeric and positive"
    # Prune
    net = prune(features=None, net=net, tmin=tmin, verbose=verbose)
    # The pool of all targets does not change across sources, so build it once
    # instead of once per iteration; it is only needed for the network-derived background
    use_net_bg = n_bg == 0
    all_targets: set = set(net["target"]) if use_net_bg else set()
    # Test each set against given set
    rows = []
    for source in tqdm(net["source"].unique(), disable=not verbose):
        targets = set(net.loc[net["source"] == source, "target"])
        # 2x2 contingency table counts
        a = len(features_set & targets)  # in both the query and the source set
        b = len(targets - features_set)  # only in the source set
        c = len(features_set - targets)  # only in the query
        if use_net_bg:
            # Targets of the network that are in neither the query nor the source set
            d = len(all_targets - targets - features_set)
        else:
            # NOTE(review): d becomes negative when n_bg < a + b + c; confirm that
            # _oddsr/_test1t handle (or callers avoid) such small backgrounds
            d = int(n_bg - a - b - c)
        od = _oddsr(a=a, b=b, c=c, d=d, ha_corr=ha_corr, log=True)
        pv = _test1t(a=a, b=b, c=c, d=d)
        rows.append([source, od, pv])
    df = pd.DataFrame(rows, columns=["source", "stat", "pval"])
    df["padj"] = sts.false_discovery_control(df["pval"], method="bh")
    df = df.sort_values(["padj", "pval"]).reset_index(drop=True)
    return df
0 commit comments