Skip to content

Commit 1de5aef

Browse files
authored
Add files via upload
1 parent eea683a commit 1de5aef

27 files changed

Lines changed: 155767 additions & 0 deletions

tfcomb/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from importlib import import_module

__version__ = "1.1.1"

# Re-export selected classes at the package top level,
# i.e. enable "from tfcomb import CombObj"
global_classes = ["tfcomb.objects.CombObj",
                  "tfcomb.objects.DiffCombObj",
                  "tfcomb.objects.DistObj"]

for dotted_path in global_classes:

    # Split "pkg.module.ClassName" into module path and class name
    module_path, _, class_name = dotted_path.rpartition(".")

    # Import the module and bind the class into this package's namespace
    imported = import_module(module_path)
    globals()[class_name] = getattr(imported, class_name)
722 Bytes
Binary file not shown.
9.42 KB
Binary file not shown.
23.6 KB
Binary file not shown.
3.29 KB
Binary file not shown.
22.6 KB
Binary file not shown.
178 KB
Binary file not shown.
40 KB
Binary file not shown.
91.3 KB
Binary file not shown.

tfcomb/analysis.py

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
from __future__ import print_function
2+
3+
import pandas as pd
4+
import numpy as np
5+
from scipy.stats import chisquare
6+
import multiprocessing as mp
7+
import seaborn as sns
8+
import matplotlib.pyplot as plt
9+
10+
#Internal functions
11+
from tfcomb.logging import TFcombLogger, InputError
12+
from tfcomb.utils import check_columns, check_type, is_symmetric
13+
14+
#-------------------------------------------------------------------------------#
15+
#---------------------------- Orientation analysis -----------------------------#
16+
#-------------------------------------------------------------------------------#
17+
18+
def _get_scenario_keys(TF1, TF2, scenario):
19+
20+
scenario_keys = {
21+
"same" : [(TF1, "+", TF2, "+"),
22+
(TF2, "+", TF1, "+"),
23+
(TF1, "-", TF2, "-"),
24+
(TF2, "-", TF1, "-")],
25+
"opposite": [(TF1, "+", TF2, "-"),
26+
(TF1, "-", TF2, "+"),
27+
(TF2, "+", TF1, "-"),
28+
(TF2, "-", TF1, "+")],
29+
"TF1-TF2": [(TF1, "+", TF2, "+"),
30+
(TF2, "-", TF1, "-")],
31+
"TF2-TF1": [(TF2, "+", TF1, "+"),
32+
(TF1, "-", TF2, "-")],
33+
"convergent": [(TF1, "+", TF2, "-"),
34+
(TF2, "+", TF1, "-")],
35+
"divergent": [(TF1, "-", TF2, "+"),
36+
(TF2, "-", TF1, "+")]
37+
}
38+
39+
return(scenario_keys[scenario])
40+
41+
def _get_unique_pairs(pairs):
42+
43+
#Remove TF2-TF1 duplicates
44+
seen = {}
45+
for pair in pairs:
46+
if pair[::-1] not in seen: #the reverse TF2-TF1 pair has not been seen yet
47+
seen[pair] = ""
48+
unique = list(seen.keys())
49+
50+
return(unique)
51+
52+
def orientation(rules, verbosity=1):
    """
    Perform orientation analysis on the TF pairs in a directional / strand-specific table. The analysis counts different scenarios depending on the input.

    If the input matrix is symmetric, the analysis contains two scenarios::

        1. Same: ⟝(TF1+)⟶ ⟝(TF2+)⟶ = ⟝(TF2+)⟶ ⟝(TF1+)⟶ = ⟵(TF1-)⟞ ⟵(TF2-)⟞ = ⟵(TF2-)⟞ ⟵(TF1-)⟞
        2. Opposite: ⟝(TF1+)⟶ ⟵(TF2-)⟞ = ⟝(TF2+)⟶ ⟵(TF1-)⟞ = ⟵(TF1-)⟞ ⟝(TF2+)⟶ = ⟵(TF2-)⟞ ⟝(TF1+)⟶

    If the input is directional, the analysis contains four different scenarios::

        1. TF1-TF2: ⟝(TF1+)⟶ ⟝(TF2+)⟶ = ⟵(TF2-)⟞ ⟵(TF1-)⟞
        2. TF2-TF1: ⟝(TF2+)⟶ ⟝(TF1+)⟶ = ⟵(TF1-)⟞ ⟵(TF2-)⟞
        3. convergent: ⟝(TF1+)⟶ ⟵(TF2-)⟞ = ⟝(TF2+)⟶ ⟵(TF1-)⟞
        4. divergent: ⟵(TF1-)⟞ ⟝(TF2+)⟶ = ⟵(TF2-)⟞ ⟝(TF1+)⟶

    Parameters
    ----------
    rules : pd.DataFrame
        The .rules output of a CombObj analysis.
    verbosity : int
        A value between 0-3 where 0 (only errors), 1 (info), 2 (debug), 3 (spam debug). Default: 1.

    Returns
    --------
    An OrientationAnalysis object (subclass of pd.DataFrame). The table contains frequencies of pairs related to each scenario.

    The dataframe has the following columns:
        - TF1: name of the first TF in pair
        - TF2: name of the second TF in pair
        - TF1_TF2_count: The total count of TF1-TF2 co-occurring pairs
        - If symmetric:
            - Same
            - Opposite
        - If directional:
            - TF1_TF2
            - TF2_TF1
            - convergent
            - divergent
        - std: Standard deviation of scenario frequencies
        - pvalue: A chi-square test to test the hypothesis that the scenarios are equally distributed

    Raises
    -------
    InputError
        If TF names cannot be split into name/strand, i.e. the rules were not
        created with stranded/directional counting.
    """
    logger = TFcombLogger(verbosity)

    #Check that input is dataframe
    check_type(rules, pd.DataFrame, "rules")
    rules = rules.copy() #ensures that rules-table is not changed

    #Split TF names from strands; names are expected in the form "NAME(+)" / "NAME(-)"
    try:
        rules[["TF1_name", "TF1_strand"]] = rules["TF1"].str.split("(", expand=True)
    except Exception as e:
        raise InputError("Failed to split TF name from strand. Please ensure that .count_within() was run with '--stranded=True' and/or '--directional=True'.")
    rules["TF1_strand"] = rules["TF1_strand"].str.replace(")", "", regex=False)

    # NOTE(review): the TF2 split is not wrapped in try/except like TF1 -
    # assumes TF1/TF2 columns share the same formatting; confirm upstream.
    rules[["TF2_name", "TF2_strand"]] = rules["TF2"].str.split("(", expand=True)
    rules["TF2_strand"] = rules["TF2_strand"].str.replace(")", "", regex=False)

    #Establish if rules are directional by checking whether the TF1 x TF2
    #count matrix is symmetric (is_symmetric is a project helper)
    rules_pivot = rules.pivot(index='TF1', columns='TF2', values='TF1_TF2_count')
    symmetric_bool = is_symmetric(rules_pivot)
    if symmetric_bool == True:
        scenarios = ["same", "opposite"] #2 scenarios
        logger.info("Rules are symmetric - scenarios counted are: {0}".format(scenarios))

        #Remove duplicated pairs from analysis (TF1-TF2 = TF2-TF1)
        rules.set_index(["TF1", "TF2"], inplace=True)
        pairs = rules.index.tolist()
        unique = _get_unique_pairs(pairs)
        rules = rules.loc[unique,:]

    else:
        scenarios = ["TF1-TF2", "TF2-TF1", "convergent", "divergent"] #4 scenarios
        logger.info("Rules are directional - scenarios counted are: {0}".format(scenarios))

    #Setup count dictionary: (TF1_name, TF1_strand, TF2_name, TF2_strand) -> count
    keys = [tuple(x) for x in rules[["TF1_name", "TF1_strand", "TF2_name", "TF2_strand"]].values]
    counts = rules["TF1_TF2_count"].tolist()
    count_dict = dict(zip(keys, counts))

    #Get all possible TF1-TF2 pairs (strand-agnostic, mirrored duplicates removed)
    pairs = list(zip(rules["TF1_name"], rules["TF2_name"]))
    pairs = sorted(list(set(pairs)))
    pairs = _get_unique_pairs(pairs)

    #Get counts per scenario by summing the stranded orderings belonging to it
    counts = {}
    for (TF1, TF2) in pairs:

        counts[(TF1, TF2)] = {} #initialize counts per pair

        for scenario in scenarios:
            keys = _get_scenario_keys(TF1, TF2, scenario)
            keys = set(keys) #remove duplicate keys in case of TF1==TF2
            count = np.sum([count_dict.get(key, 0) for key in keys]) #sum of counts; missing orderings count as 0

            counts[(TF1, TF2)][scenario] = count

    #Create dataframe with one row per pair and one column per scenario
    frame = pd.DataFrame().from_dict(counts, orient="index")
    frame.index.names = ["TF1", "TF2"]
    frame.reset_index(inplace=True)
    frame = frame[frame[scenarios].sum(axis=1) > 0] #remove any pairs with total count of 0

    #Calculate chisquare p-value (are the counts uniformly distributed across scenarios?)
    #Only calculated once per distinct count combination to speed up calculation
    unique = frame[scenarios].drop_duplicates()
    mat = unique.to_numpy()
    rows, cols = mat.shape
    pvalues = [0]*rows
    for row in range(rows):
        n = mat[row,:] #counts per scenario
        s, p = chisquare(n) #default null hypothesis: equal expected frequencies
        pvalues[row] = p
    unique["pvalue"] = pvalues

    #Merge unique p-values back to the full frame via the scenario count columns
    frame = frame.merge(unique, left_on=scenarios, right_on=scenarios, how="left")
    frame.index = frame["TF1"] + "-" + frame["TF2"]

    #Normalize counts to sum of 1 (fractions per scenario)
    frame["TF1_TF2_count"] = frame[scenarios].sum(axis=1)
    for scenario in scenarios:
        frame[scenario] = frame[scenario] / frame["TF1_TF2_count"]
        frame[scenario] = frame[scenario].replace(np.inf, 0)

    #Calculate standard deviation of the scenario fractions per pair
    frame["std"] = frame[scenarios].std(axis=1)

    #Sort by pvalue/count and reorder colums in frame
    frame.sort_values(["pvalue", "TF1_TF2_count"], ascending=[True, False], inplace=True)
    frame = frame[["TF1", "TF2", "TF1_TF2_count"] + scenarios + ["std", "pvalue"]]
    frame = OrientationAnalysis(frame) #wrap in subclass to expose plotting helpers

    return(frame)
class OrientationAnalysis(pd.DataFrame):
    """ Analysis of the orientation of TF co-occurring pairs; a thin pd.DataFrame subclass with plotting helpers. """

    @property
    def _constructor(self):
        # Make pandas operations (slicing, head, etc.) return this subclass
        # instead of a plain DataFrame
        return OrientationAnalysis

    def plot_heatmap(self, yticklabels=False, figsize=(6,6), save=None, **kwargs):
        """ Plot a heatmap of orientation scenarios for the output of the orientation analysis.

        Parameters
        -----------
        yticklabels : bool, optional
            Show yticklabels (TF-pairs) in plot. Default: False.
        figsize : tuple
            The size of the output heatmap. Default: (6,6)
        save : str, optional
            Save the plot to the file given in 'save'. Default: None.
        kwargs : arguments
            Any additional arguments are passed to sns.clustermap.

        Returns
        --------
        seaborn.matrix.ClusterGrid
        """

        # The scenario columns are everything besides the fixed bookkeeping columns
        fixed_columns = {"TF1", "TF2", "TF1_TF2_count", "std", "pvalue"}
        scenario_columns = [col for col in self.columns if col not in fixed_columns]

        grid = sns.clustermap(self[scenario_columns],
                              col_cluster=False,
                              yticklabels=yticklabels,
                              cbar_kws={'label': "Fraction"},
                              figsize=figsize,
                              **kwargs)

        # Label rows with the number of pairs shown
        grid.ax_heatmap.set_ylabel("TF-TF pairs (n={0})".format(self.shape[0]))

        if save is not None:
            plt.savefig(save, dpi=600)

        return grid

0 commit comments

Comments
 (0)