|
| 1 | +from __future__ import print_function |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +import numpy as np |
| 5 | +from scipy.stats import chisquare |
| 6 | +import multiprocessing as mp |
| 7 | +import seaborn as sns |
| 8 | +import matplotlib.pyplot as plt |
| 9 | + |
| 10 | +#Internal functions |
| 11 | +from tfcomb.logging import TFcombLogger, InputError |
| 12 | +from tfcomb.utils import check_columns, check_type, is_symmetric |
| 13 | + |
| 14 | +#-------------------------------------------------------------------------------# |
| 15 | +#---------------------------- Orientation analysis -----------------------------# |
| 16 | +#-------------------------------------------------------------------------------# |
| 17 | + |
| 18 | +def _get_scenario_keys(TF1, TF2, scenario): |
| 19 | + |
| 20 | + scenario_keys = { |
| 21 | + "same" : [(TF1, "+", TF2, "+"), |
| 22 | + (TF2, "+", TF1, "+"), |
| 23 | + (TF1, "-", TF2, "-"), |
| 24 | + (TF2, "-", TF1, "-")], |
| 25 | + "opposite": [(TF1, "+", TF2, "-"), |
| 26 | + (TF1, "-", TF2, "+"), |
| 27 | + (TF2, "+", TF1, "-"), |
| 28 | + (TF2, "-", TF1, "+")], |
| 29 | + "TF1-TF2": [(TF1, "+", TF2, "+"), |
| 30 | + (TF2, "-", TF1, "-")], |
| 31 | + "TF2-TF1": [(TF2, "+", TF1, "+"), |
| 32 | + (TF1, "-", TF2, "-")], |
| 33 | + "convergent": [(TF1, "+", TF2, "-"), |
| 34 | + (TF2, "+", TF1, "-")], |
| 35 | + "divergent": [(TF1, "-", TF2, "+"), |
| 36 | + (TF2, "-", TF1, "+")] |
| 37 | + } |
| 38 | + |
| 39 | + return(scenario_keys[scenario]) |
| 40 | + |
| 41 | +def _get_unique_pairs(pairs): |
| 42 | + |
| 43 | + #Remove TF2-TF1 duplicates |
| 44 | + seen = {} |
| 45 | + for pair in pairs: |
| 46 | + if pair[::-1] not in seen: #the reverse TF2-TF1 pair has not been seen yet |
| 47 | + seen[pair] = "" |
| 48 | + unique = list(seen.keys()) |
| 49 | + |
| 50 | + return(unique) |
| 51 | + |
| 52 | +def orientation(rules, verbosity=1): |
| 53 | + """ |
| 54 | + Perform orientation analysis on the TF pairs in a directional / strand-specific table. The analysis counts different scenarios depending on the input. |
| 55 | + |
| 56 | +
|
| 57 | + If the input matrix is symmetric, the analysis contains two scenarios:: |
| 58 | +
|
| 59 | + 1. Same: ⟝(TF1+)⟶ ⟝(TF2+)⟶ = ⟝(TF2+)⟶ ⟝(TF1+)⟶ = ⟵(TF1-)⟞ ⟵(TF2-)⟞ = ⟵(TF2-)⟞ ⟵(TF1-)⟞ |
| 60 | + 2. Opposite: ⟝(TF1+)⟶ ⟵(TF2-)⟞ = ⟝(TF2+)⟶ ⟵(TF1-)⟞ = ⟵(TF1-)⟞ ⟝(TF2+)⟶ = ⟵(TF2-)⟞ ⟝(TF1+)⟶ |
| 61 | +
|
| 62 | + If the input is directional, the analysis contains four different scenarios:: |
| 63 | +
|
| 64 | + 1. TF1-TF2: ⟝(TF1+)⟶ ⟝(TF2+)⟶ = ⟵(TF2-)⟞ ⟵(TF1-)⟞ |
| 65 | + 2. TF2-TF1: ⟝(TF2+)⟶ ⟝(TF1+)⟶ = ⟵(TF1-)⟞ ⟵(TF2-)⟞ |
| 66 | + 3. convergent: ⟝(TF1+)⟶ ⟵(TF2-)⟞ = ⟝(TF2+)⟶ ⟵(TF1-)⟞ |
| 67 | + 4. divergent: ⟵(TF1-)⟞ ⟝(TF2+)⟶ = ⟵(TF2-)⟞ ⟝(TF1+)⟶ |
| 68 | +
|
| 69 | + Parameters |
| 70 | + ---------- |
| 71 | + rules : pd.DataFrame |
| 72 | + The .rules output of a CombObj analysis. |
| 73 | + verbosity : int |
| 74 | + A value between 0-3 where 0 (only errors), 1 (info), 2 (debug), 3 (spam debug). Default: 1. |
| 75 | +
|
| 76 | + Returns |
| 77 | + -------- |
| 78 | + An OrientationAnalysis object (subclass of pd.DataFrame). The table contains frequencies of pairs related to each scenario. |
| 79 | +
|
| 80 | + The dataframe has the following columns: |
| 81 | + - TF1: name of the first TF in pair |
| 82 | + - TF2: name of the second TF in pair |
| 83 | + - TF1_TF2_count: The total count of TF1-TF2 co-occurring pairs |
| 84 | + - If symmetric: |
| 85 | + - Same |
| 86 | + - Opposite |
| 87 | + - If directional: |
| 88 | + - TF1_TF2 |
| 89 | + - TF2_TF1 |
| 90 | + - convergent |
| 91 | + - divergent |
| 92 | + - std: Standard deviation of scenario frequencies |
| 93 | + - pvalue: A chi-square test to test the hypothesis that the scenarios are equally distributed |
| 94 | +
|
| 95 | + """ |
| 96 | + logger = TFcombLogger(verbosity) |
| 97 | + |
| 98 | + #Check that input is dataframe |
| 99 | + check_type(rules, pd.DataFrame, "rules") |
| 100 | + rules = rules.copy() #ensures that rules-table is not changed |
| 101 | + |
| 102 | + #Split TF names from strands |
| 103 | + try: |
| 104 | + rules[["TF1_name", "TF1_strand"]] = rules["TF1"].str.split("(", expand=True) |
| 105 | + except Exception as e: |
| 106 | + raise InputError("Failed to split TF name from strand. Please ensure that .count_within() was run with '--stranded=True' and/or '--directional=True'.") |
| 107 | + rules["TF1_strand"] = rules["TF1_strand"].str.replace(")", "", regex=False) |
| 108 | + |
| 109 | + rules[["TF2_name", "TF2_strand"]] = rules["TF2"].str.split("(", expand=True) |
| 110 | + rules["TF2_strand"] = rules["TF2_strand"].str.replace(")", "", regex=False) |
| 111 | + |
| 112 | + #Establish if rules are directional |
| 113 | + rules_pivot = rules.pivot(index='TF1', columns='TF2', values='TF1_TF2_count') |
| 114 | + symmetric_bool = is_symmetric(rules_pivot) |
| 115 | + if symmetric_bool == True: |
| 116 | + scenarios = ["same", "opposite"] #2 scenarios |
| 117 | + logger.info("Rules are symmetric - scenarios counted are: {0}".format(scenarios)) |
| 118 | + |
| 119 | + #Remove duplicated pairs from analysis (TF1-TF2 = TF2-TF1) |
| 120 | + rules.set_index(["TF1", "TF2"], inplace=True) |
| 121 | + pairs = rules.index.tolist() |
| 122 | + unique = _get_unique_pairs(pairs) |
| 123 | + rules = rules.loc[unique,:] |
| 124 | + |
| 125 | + else: |
| 126 | + scenarios = ["TF1-TF2", "TF2-TF1", "convergent", "divergent"] #4 scenarios |
| 127 | + logger.info("Rules are directional - scenarios counted are: {0}".format(scenarios)) |
| 128 | + |
| 129 | + #Setup count dictionary |
| 130 | + keys = [tuple(x) for x in rules[["TF1_name", "TF1_strand", "TF2_name", "TF2_strand"]].values] |
| 131 | + counts = rules["TF1_TF2_count"].tolist() |
| 132 | + count_dict = dict(zip(keys, counts)) |
| 133 | + |
| 134 | + #Get all possible TF1-TF2 pairs |
| 135 | + pairs = list(zip(rules["TF1_name"], rules["TF2_name"])) |
| 136 | + pairs = sorted(list(set(pairs))) |
| 137 | + pairs = _get_unique_pairs(pairs) |
| 138 | + |
| 139 | + #Get counts per scenario |
| 140 | + counts = {} |
| 141 | + for (TF1, TF2) in pairs: |
| 142 | + |
| 143 | + counts[(TF1, TF2)] = {} #initialize counts per pair |
| 144 | + |
| 145 | + for scenario in scenarios: |
| 146 | + keys = _get_scenario_keys(TF1, TF2, scenario) |
| 147 | + keys = set(keys) #remove duplicate keys in case of TF1==TF2 |
| 148 | + count = np.sum([count_dict.get(key, 0) for key in keys]) #sum of counts |
| 149 | + |
| 150 | + counts[(TF1, TF2)][scenario] = count |
| 151 | + |
| 152 | + #Create dataframe |
| 153 | + frame = pd.DataFrame().from_dict(counts, orient="index") |
| 154 | + frame.index.names = ["TF1", "TF2"] |
| 155 | + frame.reset_index(inplace=True) |
| 156 | + frame = frame[frame[scenarios].sum(axis=1) > 0] #remove any scenarios with sum of 0 |
| 157 | + |
| 158 | + #Calculate chisquare p-value (are the scenarios normally distributed?) |
| 159 | + unique = frame[scenarios].drop_duplicates() #only calculate chi-square once per seen count combination - speeds up calculation |
| 160 | + mat = unique.to_numpy() |
| 161 | + rows, cols = mat.shape |
| 162 | + pvalues = [0]*rows |
| 163 | + for row in range(rows): |
| 164 | + n = mat[row,:] #counts per scenario |
| 165 | + s, p = chisquare(n) |
| 166 | + pvalues[row] = p |
| 167 | + unique["pvalue"] = pvalues |
| 168 | + |
| 169 | + #Merge unique to frame |
| 170 | + frame = frame.merge(unique, left_on=scenarios, right_on=scenarios, how="left") |
| 171 | + frame.index = frame["TF1"] + "-" + frame["TF2"] |
| 172 | + |
| 173 | + #Normalize counts to sum of 1 |
| 174 | + frame["TF1_TF2_count"] = frame[scenarios].sum(axis=1) |
| 175 | + for scenario in scenarios: |
| 176 | + frame[scenario] = frame[scenario] / frame["TF1_TF2_count"] |
| 177 | + frame[scenario] = frame[scenario].replace(np.inf, 0) |
| 178 | + |
| 179 | + #Calculate standard deviation |
| 180 | + frame["std"] = frame[scenarios].std(axis=1) |
| 181 | + |
| 182 | + #Sort by pvalue/count and reorder colums in frame |
| 183 | + frame.sort_values(["pvalue", "TF1_TF2_count"], ascending=[True, False], inplace=True) |
| 184 | + frame = frame[["TF1", "TF2", "TF1_TF2_count"] + scenarios + ["std", "pvalue"]] |
| 185 | + frame = OrientationAnalysis(frame) |
| 186 | + |
| 187 | + return(frame) |
| 188 | + |
| 189 | + |
| 190 | +class OrientationAnalysis(pd.DataFrame): |
| 191 | + """ Analysis of the orientation of TF co-ocurring pairs """ |
| 192 | + |
| 193 | + @property |
| 194 | + def _constructor(self): |
| 195 | + return OrientationAnalysis |
| 196 | + |
| 197 | + def plot_heatmap(self, yticklabels=False, figsize=(6,6), save=None, **kwargs): |
| 198 | + """ Plot a heatmap of orientation scenarios for the output of the orientation analysis. |
| 199 | + |
| 200 | + Parameters |
| 201 | + ----------- |
| 202 | + yticklabels : bool, optional |
| 203 | + Show yticklabels (TF-pairs) in plot. Default: False. |
| 204 | + figsize : tuple |
| 205 | + The size of the output heatmap. Default: (6,6) |
| 206 | + save : str, optional |
| 207 | + Save the plot to the file given in 'save'. Default: None. |
| 208 | + kwargs : arguments |
| 209 | + Any additional arguments are passed to sns.clustermap. |
| 210 | +
|
| 211 | + Returns |
| 212 | + -------- |
| 213 | + seaborn.matrix.ClusterGrid |
| 214 | + """ |
| 215 | + |
| 216 | + scenarios = [col for col in self.columns if col not in ["TF1", "TF2", "TF1_TF2_count", "std", "pvalue"]] |
| 217 | + |
| 218 | + g = sns.clustermap(self[scenarios], |
| 219 | + col_cluster=False, |
| 220 | + yticklabels=yticklabels, |
| 221 | + cbar_kws={'label': "Fraction"}, |
| 222 | + figsize=figsize, |
| 223 | + **kwargs) |
| 224 | + |
| 225 | + n_pairs = self.shape[0] |
| 226 | + g.ax_heatmap.set_ylabel("TF-TF pairs (n={0})".format(n_pairs)) |
| 227 | + |
| 228 | + if save is not None: |
| 229 | + plt.savefig(save, dpi=600) |
| 230 | + |
| 231 | + return(g) |
0 commit comments