Skip to content

Commit 052a878

Browse files
committed
Upload script
1 parent 074cf77 commit 052a878

File tree

1 file changed

+223
-0
lines changed

1 file changed

+223
-0
lines changed

process_cutandrun_pausingindex.py

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
import os
2+
import numpy as np
3+
import pandas as pd
4+
import deeptools.countReadsPerBin as crpb
5+
import matplotlib
6+
import matplotlib.pyplot as plt
7+
import pysam
8+
import scipy.stats
9+
from statsmodels.stats.multitest import multipletests
10+
from pathlib import Path
11+
12+
13+
14+
working_dir = "/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/analysis/2022-11-09"
15+
Path(working_dir).mkdir(parents=True, exist_ok=True)
16+
windowsize_TSS = 500
17+
chromosomes = ["chr"+str(x) for x in range(1,23)] + ["chrX","chrY"]
18+
19+
# Select downregulated genes (or upregulated, or background)
20+
chromosomes = [str(x) for x in range(1,23)] + ["X","Y"]
21+
#df = pd.read_csv("/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/analysis/final_48kd_ctr_annotation.txt",sep="\t")
22+
df = pd.read_csv("/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/analysis/res_TcRC_60min_kd_ctr.txt",sep=",")
23+
df.columns = ["gene_name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj"]
24+
df = df.loc[~df["pvalue"].isnull(),]
25+
df = df.loc[df["baseMean"]>5,:]
26+
df["padj"] = multipletests(df["pvalue"],alpha=0.05,method="fdr_bh")[1]
27+
28+
df_downregulated = df.loc[ (df["padj"]<0.05) & (df["log2FoldChange"]<0),:]
29+
genes_selected_downregulated = list(df_downregulated["gene_name"])
30+
31+
df_upregulated = df.loc[ (df["padj"]<0.05) & (df["log2FoldChange"]>0),:]
32+
genes_selected_upregulated = list(df_upregulated["gene_name"])
33+
34+
df_background = df.loc[ (df["baseMean"]>5),:] # (df["padj"]>0.20) &
35+
genes_selected_background = list(df_background["gene_name"])
36+
37+
# Find canonical transcripts for the selected genes
38+
#transcripts_ids = set()
39+
canonicalTranscripts2gene_downregulated = {}
40+
canonicalTranscripts2gene_upregulated = {}
41+
canonicalTranscripts2gene_background = {}
42+
df_GRCh38 = pd.read_csv("/omics/groups/OE0219/internal/Etienne/data/reference/RNA/Transcripts_GRCh38.tsv",sep="\t")
43+
for i in range(df_GRCh38.shape[0]):
44+
if df_GRCh38.loc[i,"Ensembl Canonical"] == 1.0:
45+
if df_GRCh38.loc[i,"Gene name"] in genes_selected_downregulated and not df_GRCh38.loc[i,"Gene name"] in canonicalTranscripts2gene_downregulated.values():
46+
canonicalTranscripts2gene_downregulated[df_GRCh38.loc[i,"Transcript stable ID"]] = df_GRCh38.loc[i,"Gene name"]
47+
if df_GRCh38.loc[i,"Gene name"] in genes_selected_upregulated and not df_GRCh38.loc[i,"Gene name"] in canonicalTranscripts2gene_upregulated.values():
48+
canonicalTranscripts2gene_upregulated[df_GRCh38.loc[i,"Transcript stable ID"]] = df_GRCh38.loc[i,"Gene name"]
49+
if df_GRCh38.loc[i,"Gene name"] in genes_selected_background and not df_GRCh38.loc[i,"Gene name"] in canonicalTranscripts2gene_background.values():
50+
canonicalTranscripts2gene_background[df_GRCh38.loc[i,"Transcript stable ID"]] = df_GRCh38.loc[i,"Gene name"]
51+
52+
df_GRCh37 = pd.read_csv("/omics/groups/OE0219/internal/Etienne/data/reference/RNA/Transcripts_GRCh37.tsv",sep="\t")
53+
df_GRCh37.sort_values(by=["Chromosome/scaffold name","Transcript start (bp)"],inplace=True)
54+
df_GRCh37.reset_index(inplace=True)
55+
56+
with open(os.path.join(working_dir,"selected_genes_downregulated.bed"),"w") as outfile:
57+
for i in range(df_GRCh37.shape[0]):
58+
if df_GRCh37.loc[i,"Transcript stable ID"] in canonicalTranscripts2gene_downregulated and df_GRCh37.loc[i,"Chromosome/scaffold name"] in chromosomes:
59+
strand = "+" if df_GRCh37.loc[i,"Strand"] >0 else "-"
60+
tmp = outfile.write("\t".join(["chr"+str(df_GRCh37.loc[i,"Chromosome/scaffold name"]),str(df_GRCh37.loc[i,"Transcript start (bp)"]),str(df_GRCh37.loc[i,"Transcript end (bp)"]),canonicalTranscripts2gene_downregulated[df_GRCh37.loc[i,"Transcript stable ID"]],"0",strand])+"\n")
61+
62+
with open(os.path.join(working_dir,"selected_genes_upregulated.bed"),"w") as outfile:
63+
for i in range(df_GRCh37.shape[0]):
64+
if df_GRCh37.loc[i,"Transcript stable ID"] in canonicalTranscripts2gene_upregulated and df_GRCh37.loc[i,"Chromosome/scaffold name"] in chromosomes:
65+
strand = "+" if df_GRCh37.loc[i,"Strand"] >0 else "-"
66+
tmp = outfile.write("\t".join(["chr"+str(df_GRCh37.loc[i,"Chromosome/scaffold name"]),str(df_GRCh37.loc[i,"Transcript start (bp)"]),str(df_GRCh37.loc[i,"Transcript end (bp)"]),canonicalTranscripts2gene_upregulated[df_GRCh37.loc[i,"Transcript stable ID"]],"0",strand])+"\n")
67+
68+
with open(os.path.join(working_dir,"selected_genes_background.bed"),"w") as outfile:
69+
for i in range(df_GRCh37.shape[0]):
70+
if df_GRCh37.loc[i,"Transcript stable ID"] in canonicalTranscripts2gene_background and df_GRCh37.loc[i,"Chromosome/scaffold name"] in chromosomes:
71+
strand = "+" if df_GRCh37.loc[i,"Strand"] >0 else "-"
72+
tmp = outfile.write("\t".join(["chr"+str(df_GRCh37.loc[i,"Chromosome/scaffold name"]),str(df_GRCh37.loc[i,"Transcript start (bp)"]),str(df_GRCh37.loc[i,"Transcript end (bp)"]),canonicalTranscripts2gene_background[df_GRCh37.loc[i,"Transcript stable ID"]],"0",strand])+"\n")
73+
74+
75+
76+
77+
# Get dataframe of pausing indices transcripts x sample
78+
antibody = "phospho-Rpb1-ser5"
79+
replicates = ["1","2","3"]
80+
pipeline_dir = "/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/pipeline2_RPKM/"
81+
82+
#antibody = "RPB1"
83+
#replicates = ["3","4","5"]
84+
#pipeline_dir = "/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/pipeline_RPKM/"
85+
86+
sample2bam={}
87+
sample2pysam={}
88+
sample2counter={}
89+
for condition in ["KD48h","control"]:
90+
for replicate in replicates:
91+
sample = antibody+"_"+condition+"_R"+replicate
92+
sample2bam[sample] = pipeline_dir+"02_alignment/bowtie2/target/"+condition+"-"+antibody+"_R"+replicate+".target.markdup.bam"
93+
sample2pysam[sample] = pysam.AlignmentFile(sample2bam[sample])
94+
sample2counter[sample] = crpb.CountReadsPerBin([sample2bam[sample]], binLength=100, stepSize=100)
95+
96+
chromosomes = ["chr"+str(x) for x in range(1,23)] + ["chrX","chrY"]
97+
98+
def compute_pausing_indices(genes_file,outfile):
99+
# We define the pausing index as the number of reads aligned to the TSS (+/- 250bp) divided by the number of reads aligned to the gene body.
100+
d={"gene":[]}
101+
for sample in sample2bam:
102+
d[sample] = []
103+
with open(genes_file,"r") as infile:
104+
for line in infile:
105+
linesplit= line.rstrip("\n").split("\t")
106+
chr = linesplit[0]
107+
if not chr in chromosomes: continue
108+
start = int(linesplit[1])
109+
end = int(linesplit[2])
110+
gene = linesplit[3]
111+
strand = linesplit[5]
112+
if strand=="+":
113+
TSS = start
114+
else:
115+
TSS = end
116+
d["gene"].append(gene)
117+
for condition in ["KD48h","control"]:
118+
for replicate in replicates:
119+
sample = antibody+"_"+condition+"_R"+replicate
120+
coverage_TSS = 1+sample2counter[sample].get_coverage_of_region(sample2pysam[sample],chr, [(TSS-windowsize_TSS//2, TSS+windowsize_TSS//2)])[0]
121+
coverage_genebody = 1+sample2counter[sample].get_coverage_of_region(sample2pysam[sample],chr, [(start -windowsize_TSS//2 , end +windowsize_TSS//2)])[0]
122+
ratio = coverage_TSS / coverage_genebody
123+
d[sample].append(ratio)
124+
df_pausing_indices = pd.DataFrame(d)
125+
df_pausing_indices.to_csv(outfile,sep="\t",index=False)
126+
127+
compute_pausing_indices(os.path.join(working_dir,"selected_genes_downregulated.bed"),os.path.join(working_dir,antibody+"_pausing-indices_downregulated.tsv"))
128+
compute_pausing_indices(os.path.join(working_dir,"selected_genes_upregulated.bed"),os.path.join(working_dir,antibody+"_pausing-indices_upregulated.tsv"))
129+
compute_pausing_indices(os.path.join(working_dir,"selected_genes_background.bed"),os.path.join(working_dir,antibody+"_pausing-indices_background.tsv"))
130+
131+
132+
133+
#######################################################
134+
# Compute pvalues for differences in pausing index between the two conditions
135+
136+
for type in ["downregulated","upregulated","background"]:
137+
print(type)
138+
df_pausing_indices = pd.read_csv(os.path.join(working_dir,antibody+"_pausing-indices_"+type+".tsv"),sep="\t",index_col = "gene")
139+
conditions = ["KD48h","control"]
140+
141+
142+
# For each gene, perform a t-test between the pausing index of 2 conditions (requires several replicates per condition).
143+
# Then, combine the pvalues of all the genes into a single pvalue
144+
145+
# Alternatively, for each gene, compute the log ratio of the pausing index between the two conditions, and then perform a t-test for all genes
146+
pvalues = []
147+
log_ratios_conditions=[]
148+
diff_ratios_conditions=[]
149+
150+
for gene in df_pausing_indices.index:
151+
# Collect the pausing indices for this gene, for each of the 2 conditions
152+
pausing_indices={}
153+
for condition in conditions:
154+
pausing_indices[condition]=[]
155+
for replicate in replicates:
156+
pausing_indices[condition].append(df_pausing_indices.loc[gene,antibody+"_"+condition+"_R"+str(replicate)])
157+
# t-test between the two conditions
158+
pvalue = scipy.stats.ttest_ind(pausing_indices[conditions[0]],pausing_indices[conditions[1]])[1] # ,alternative="greater"
159+
if pvalue == pvalue:
160+
pvalues.append(pvalue)
161+
162+
# log ratio between the 2 conditions
163+
log_ratios_conditions.append(np.log(np.mean(pausing_indices[conditions[0]]) / np.mean(pausing_indices[conditions[1]])))
164+
diff_ratios_conditions.append(np.mean(pausing_indices[conditions[0]]) - np.mean(pausing_indices[conditions[1]]))
165+
166+
ratios_condition = {}
167+
for condition in conditions:
168+
ratios_condition[condition] = []
169+
for replicate in replicates:
170+
ratios_replicate = []
171+
for gene in df_pausing_indices.index:
172+
ratios_replicate.append(df_pausing_indices.loc[gene,antibody+"_"+condition+"_R"+str(replicate)])
173+
ratios_condition[condition].append(np.mean(ratios_replicate))
174+
175+
pvalue_samples = scipy.stats.ttest_ind(ratios_condition[conditions[0]],ratios_condition[conditions[1]])[1]
176+
print(ratios_condition)
177+
print("p value:" +str(pvalue_samples))
178+
179+
# use Fisher's method to combine the pvalues.
180+
print(scipy.stats.combine_pvalues(pvalues))
181+
pvalue_combined = scipy.stats.combine_pvalues(pvalues)[1]
182+
print("pvalue combined: "+str(pvalue_combined))
183+
184+
185+
# t-test between the two conditions
186+
pvalue2 = scipy.stats.ttest_1samp(diff_ratios_conditions,0)[1] # alternative="greater"
187+
print("pvalue diff: "+str(pvalue2))
188+
pvalue3 = scipy.stats.ttest_1samp(log_ratios_conditions,0)[1] # alternative="greater"
189+
print("pvalue log ratio: "+str(pvalue3))
190+
191+
matplotlib.rcParams.update({'font.size': 20})
192+
#plt.hist(log_ratios_conditions,bins=np.arange(-1.6,1.6,0.1),density=False)
193+
plt.hist(diff_ratios_conditions,bins=np.arange(-0.2,0.2,0.01),density=False)
194+
plt.xlabel("Difference of pausing index between KD and control")
195+
plt.ylabel("Number of transcripts")
196+
plt.title("pvalue: "+ "{:.2E}".format(pvalue2))
197+
plt.savefig(os.path.join(working_dir,antibody+"_ttest_"+type+".svg"),bbox_inches="tight")
198+
plt.cla()
199+
plt.clf()
200+
plt.close('all')
201+
series_diff_ratio = pd.Series(diff_ratios_conditions,index=df_pausing_indices.index)
202+
series_diff_ratio.to_csv(os.path.join(working_dir,antibody+"_diff_pausing-indices_"+type+".tsv"),sep="\t")
203+
204+
matplotlib.rcParams.update({'font.size': 20})
205+
#plt.hist(log_ratios_conditions,bins=np.arange(-1.6,1.6,0.1),density=False)
206+
plt.hist(log_ratios_conditions,bins=np.arange(-0.2,0.2,0.01),density=False)
207+
plt.xlabel("Log ratio of pausing index between KD and control")
208+
plt.ylabel("Number of transcripts")
209+
plt.title("pvalue: "+ "{:.2E}".format(pvalue3))
210+
plt.savefig(os.path.join(working_dir,antibody+"_ttestLogRatio_"+type+".svg"),bbox_inches="tight")
211+
plt.cla()
212+
plt.clf()
213+
plt.close('all')
214+
215+
216+
plt.hist(pvalues)
217+
plt.xlabel("pvalue")
218+
plt.ylabel("Number of transcripts")
219+
plt.title("Combined pvalue:" + "{:.2E}".format(pvalue_combined))
220+
plt.savefig(os.path.join(working_dir,antibody+"_pvalues_"+type+".svg"),bbox_inches="tight")
221+
plt.cla()
222+
plt.clf()
223+
plt.close('all')

0 commit comments

Comments
 (0)