Merge pull request #145 from smuellerd/main

PauBadiaM · web-flow · commit c1a458f0cf01 · 2024-07-24T12:44:59.000+02:00
implement z-score method
diff --git a/decoupler/method_zscore.py b/decoupler/method_zscore.py
@@ -0,0 +1,88 @@
+"""
+Method zscore.
+Code to run the z-score (RoKAI, KSEA) method.
+"""
+
+import numpy as np
+import pandas as pd
+
+from scipy.stats import norm
+
+from .pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
+
+
+def zscore(m, net, flavor='RoKAI', verbose=False):
+    stds = np.std(m, axis=1, ddof=1)
+    if flavor != 'RoKAI':
+        mean_all = np.mean(m, axis=1)
+    else:
+        mean_all = np.zeros(stds.shape)
+    n = np.sqrt(np.count_nonzero(net, axis=0))
+    mean = m.dot(net) / np.sum(np.abs(net), axis=0)
+    es = ((mean - mean_all.reshape(-1, 1)) * n) / stds.reshape(-1, 1)
+    pv = norm.cdf(-np.abs(es))
+    return es, pv
+
+
+def run_zscore(mat, net, source='source', target='target', weight='weight', batch_size=10000, flavor='RoKAI',
+             min_n=5, verbose=False, use_raw=True):
+    """
+    z-score.
+
+    Calculates regulatory activities using a z-score as descibed in KSEA or RoKAI. The z-score calculates the mean of the molecular features of the 
+    known targets for each regulator and adjusts it for the number of identified targets for the regulator, the standard deviation of all molecular 
+    features (RoKAI), as well as the mean of all moleculare features (KSEA).
+
+    Parameters
+    ----------
+    mat : list, DataFrame or AnnData
+        List of [features, matrix], dataframe (samples x features) or an AnnData instance.
+    net : DataFrame
+        Network in long format.
+    source : str
+        Column name in net with source nodes.
+    target : str
+        Column name in net with target nodes.
+    weight : str
+        Column name in net with weights.
+    batch_size : int
+        Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster.
+    min_n : int
+        Minimum of targets per source. If less, sources are removed.
+    verbose : bool
+        Whether to show progress.
+    use_raw : bool
+        Use raw attribute of mat if present.
+
+    Returns
+    -------
+    estimate : DataFrame
+        Z-scores. Stored in `.obsm['zscore_estimate']` if `mat` is AnnData.
+    pvals : DataFrame
+        Obtained p-values. Stored in `.obsm['zscore_pvals']` if `mat` is AnnData.
+    """
+
+    # Extract sparse matrix and array of genes
+    m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)
+
+    # Transform net
+    net = rename_net(net, source=source, target=target, weight=weight)
+    net = filt_min_n(c, net, min_n=min_n)
+    sources, targets, net = get_net_mat(net)
+
+    # Match arrays
+    net = match(c, targets, net)
+
+    if verbose:
+        print('Running zscore on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))
+
+    # Run ULM
+    estimate, pvals = zscore(m, net, flavor=flavor)
+
+    # Transform to df
+    estimate = pd.DataFrame(estimate, index=r, columns=sources)
+    estimate.name = 'zscore_estimate'
+    pvals = pd.DataFrame(pvals, index=r, columns=sources)
+    pvals.name = 'zscore_pvals'
+
+    return return_data(mat=mat, results=(estimate, pvals))
diff --git a/decoupler/tests/test_zscore.py b/decoupler/tests/test_zscore.py
@@ -0,0 +1,49 @@
+import numpy as np
+import pandas as pd
+from anndata import AnnData
+from ..method_zscore import zscore, run_zscore
+
+def test_zscore():
+    m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., 6., 2.], [-8., -7., 1., 1.]], dtype=np.float32)
+    net = np.array([[1., 0.], [1, 0.], [0., -1.], [0., -1.]], dtype=np.float32)
+    act, pvl = zscore(m, net)
+    assert act[0, 0] < 0
+    assert act[1, 0] < 0
+    assert act[2, 0] > 0
+    assert act[3, 0] > 0
+    assert act[4, 0] < 0
+    assert np.all((0. <= pvl) * (pvl <= 1.))
+    
+    act2, pvl2 = zscore(m, net, flavor='KSEA')
+    assert act2[0, 0] < 0
+    assert act2[1, 0] < 0
+    assert act2[2, 0] < 0
+    assert act2[3, 0] < 0
+    assert act2[4, 0] < 0
+    assert np.all((0. <= pvl2) * (pvl2 <= 1.))
+
+def test_run_zscore():
+    m = np.array([[-7., -1., 1., 1.], [-4., -2., 1., 2.], [1., 2., 5., 1.], [1., 1., -6., -8.], [-8., -7., 1., 1.]])
+    r = np.array(['S1', 'S2', 'S3', 'S4', 'S5'])
+    c = np.array(['G1', 'G2', 'G3', 'G4'])
+    df = pd.DataFrame(m, index=r, columns=c)
+    net = pd.DataFrame([['T1', 'G1', 1], ['T1', 'G2', 1], ['T2', 'G3', -1], ['T2', 'G4', -1]],
+                       columns=['source', 'target', 'weight'])
+    res = run_zscore(df, net, verbose=True, use_raw=False, min_n=0)
+    assert res[0].loc['S1', 'T2'] < 0
+    assert res[0].loc['S2', 'T2'] < 0
+    assert res[0].loc['S3', 'T2'] < 0
+    assert res[0].loc['S4', 'T2'] > 0
+    assert res[0].loc['S5', 'T2'] < 0
+    assert res[1].map(lambda x: 0 <= x <= 1).all().all()
+    
+    res2 = run_zscore(df, net, verbose=True, use_raw=False, min_n=0, flavor='KSEA')
+    assert res2[0].loc['S1', 'T2'] > 0
+    assert res2[0].loc['S2', 'T2'] < 0
+    assert res2[0].loc['S3', 'T2'] < 0
+    assert res2[0].loc['S4', 'T2'] > 0
+    assert res2[0].loc['S5', 'T2'] > 0
+    assert res2[1].map(lambda x: 0 <= x <= 1).all().all()
+    
+    adata = AnnData(df.astype(np.float32))
+    run_zscore(adata, net, verbose=True, use_raw=False, min_n=0)