1+ import os
2+ import numpy as np
3+ import pandas as pd
4+ import deeptools .countReadsPerBin as crpb
5+ import matplotlib
6+ import matplotlib .pyplot as plt
7+ import pysam
8+ import scipy .stats
9+ from statsmodels .stats .multitest import multipletests
10+ from pathlib import Path
11+
12+
13+
14+ working_dir = "/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/analysis/2022-11-09"
15+ Path (working_dir ).mkdir (parents = True , exist_ok = True )
16+ windowsize_TSS = 500
17+ chromosomes = ["chr" + str (x ) for x in range (1 ,23 )] + ["chrX" ,"chrY" ]
18+
19+ # Select downregulated genes (or upregulated, or background)
20+ chromosomes = [str (x ) for x in range (1 ,23 )] + ["X" ,"Y" ]
21+ #df = pd.read_csv("/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/analysis/final_48kd_ctr_annotation.txt",sep="\t")
22+ df = pd .read_csv ("/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/analysis/res_TcRC_60min_kd_ctr.txt" ,sep = "," )
23+ df .columns = ["gene_name" ,"baseMean" ,"log2FoldChange" ,"lfcSE" ,"stat" ,"pvalue" ,"padj" ]
24+ df = df .loc [~ df ["pvalue" ].isnull (),]
25+ df = df .loc [df ["baseMean" ]> 5 ,:]
26+ df ["padj" ] = multipletests (df ["pvalue" ],alpha = 0.05 ,method = "fdr_bh" )[1 ]
27+
28+ df_downregulated = df .loc [ (df ["padj" ]< 0.05 ) & (df ["log2FoldChange" ]< 0 ),:]
29+ genes_selected_downregulated = list (df_downregulated ["gene_name" ])
30+
31+ df_upregulated = df .loc [ (df ["padj" ]< 0.05 ) & (df ["log2FoldChange" ]> 0 ),:]
32+ genes_selected_upregulated = list (df_upregulated ["gene_name" ])
33+
34+ df_background = df .loc [ (df ["baseMean" ]> 5 ),:] # (df["padj"]>0.20) &
35+ genes_selected_background = list (df_background ["gene_name" ])
36+
37+ # Find canonical transcripts for the selected genes
38+ #transcripts_ids = set()
39+ canonicalTranscripts2gene_downregulated = {}
40+ canonicalTranscripts2gene_upregulated = {}
41+ canonicalTranscripts2gene_background = {}
42+ df_GRCh38 = pd .read_csv ("/omics/groups/OE0219/internal/Etienne/data/reference/RNA/Transcripts_GRCh38.tsv" ,sep = "\t " )
43+ for i in range (df_GRCh38 .shape [0 ]):
44+ if df_GRCh38 .loc [i ,"Ensembl Canonical" ] == 1.0 :
45+ if df_GRCh38 .loc [i ,"Gene name" ] in genes_selected_downregulated and not df_GRCh38 .loc [i ,"Gene name" ] in canonicalTranscripts2gene_downregulated .values ():
46+ canonicalTranscripts2gene_downregulated [df_GRCh38 .loc [i ,"Transcript stable ID" ]] = df_GRCh38 .loc [i ,"Gene name" ]
47+ if df_GRCh38 .loc [i ,"Gene name" ] in genes_selected_upregulated and not df_GRCh38 .loc [i ,"Gene name" ] in canonicalTranscripts2gene_upregulated .values ():
48+ canonicalTranscripts2gene_upregulated [df_GRCh38 .loc [i ,"Transcript stable ID" ]] = df_GRCh38 .loc [i ,"Gene name" ]
49+ if df_GRCh38 .loc [i ,"Gene name" ] in genes_selected_background and not df_GRCh38 .loc [i ,"Gene name" ] in canonicalTranscripts2gene_background .values ():
50+ canonicalTranscripts2gene_background [df_GRCh38 .loc [i ,"Transcript stable ID" ]] = df_GRCh38 .loc [i ,"Gene name" ]
51+
52+ df_GRCh37 = pd .read_csv ("/omics/groups/OE0219/internal/Etienne/data/reference/RNA/Transcripts_GRCh37.tsv" ,sep = "\t " )
53+ df_GRCh37 .sort_values (by = ["Chromosome/scaffold name" ,"Transcript start (bp)" ],inplace = True )
54+ df_GRCh37 .reset_index (inplace = True )
55+
56+ with open (os .path .join (working_dir ,"selected_genes_downregulated.bed" ),"w" ) as outfile :
57+ for i in range (df_GRCh37 .shape [0 ]):
58+ if df_GRCh37 .loc [i ,"Transcript stable ID" ] in canonicalTranscripts2gene_downregulated and df_GRCh37 .loc [i ,"Chromosome/scaffold name" ] in chromosomes :
59+ strand = "+" if df_GRCh37 .loc [i ,"Strand" ] > 0 else "-"
60+ tmp = outfile .write ("\t " .join (["chr" + str (df_GRCh37 .loc [i ,"Chromosome/scaffold name" ]),str (df_GRCh37 .loc [i ,"Transcript start (bp)" ]),str (df_GRCh37 .loc [i ,"Transcript end (bp)" ]),canonicalTranscripts2gene_downregulated [df_GRCh37 .loc [i ,"Transcript stable ID" ]],"0" ,strand ])+ "\n " )
61+
62+ with open (os .path .join (working_dir ,"selected_genes_upregulated.bed" ),"w" ) as outfile :
63+ for i in range (df_GRCh37 .shape [0 ]):
64+ if df_GRCh37 .loc [i ,"Transcript stable ID" ] in canonicalTranscripts2gene_upregulated and df_GRCh37 .loc [i ,"Chromosome/scaffold name" ] in chromosomes :
65+ strand = "+" if df_GRCh37 .loc [i ,"Strand" ] > 0 else "-"
66+ tmp = outfile .write ("\t " .join (["chr" + str (df_GRCh37 .loc [i ,"Chromosome/scaffold name" ]),str (df_GRCh37 .loc [i ,"Transcript start (bp)" ]),str (df_GRCh37 .loc [i ,"Transcript end (bp)" ]),canonicalTranscripts2gene_upregulated [df_GRCh37 .loc [i ,"Transcript stable ID" ]],"0" ,strand ])+ "\n " )
67+
68+ with open (os .path .join (working_dir ,"selected_genes_background.bed" ),"w" ) as outfile :
69+ for i in range (df_GRCh37 .shape [0 ]):
70+ if df_GRCh37 .loc [i ,"Transcript stable ID" ] in canonicalTranscripts2gene_background and df_GRCh37 .loc [i ,"Chromosome/scaffold name" ] in chromosomes :
71+ strand = "+" if df_GRCh37 .loc [i ,"Strand" ] > 0 else "-"
72+ tmp = outfile .write ("\t " .join (["chr" + str (df_GRCh37 .loc [i ,"Chromosome/scaffold name" ]),str (df_GRCh37 .loc [i ,"Transcript start (bp)" ]),str (df_GRCh37 .loc [i ,"Transcript end (bp)" ]),canonicalTranscripts2gene_background [df_GRCh37 .loc [i ,"Transcript stable ID" ]],"0" ,strand ])+ "\n " )
73+
74+
75+
76+
77+ # Get dataframe of pausing indices transcripts x sample
78+ antibody = "phospho-Rpb1-ser5"
79+ replicates = ["1" ,"2" ,"3" ]
80+ pipeline_dir = "/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/pipeline2_RPKM/"
81+
82+ #antibody = "RPB1"
83+ #replicates = ["3","4","5"]
84+ #pipeline_dir = "/omics/groups/OE0219/internal/Etienne/Projects/SRSF2/cutandrun/pipeline_RPKM/"
85+
86+ sample2bam = {}
87+ sample2pysam = {}
88+ sample2counter = {}
89+ for condition in ["KD48h" ,"control" ]:
90+ for replicate in replicates :
91+ sample = antibody + "_" + condition + "_R" + replicate
92+ sample2bam [sample ] = pipeline_dir + "02_alignment/bowtie2/target/" + condition + "-" + antibody + "_R" + replicate + ".target.markdup.bam"
93+ sample2pysam [sample ] = pysam .AlignmentFile (sample2bam [sample ])
94+ sample2counter [sample ] = crpb .CountReadsPerBin ([sample2bam [sample ]], binLength = 100 , stepSize = 100 )
95+
96+ chromosomes = ["chr" + str (x ) for x in range (1 ,23 )] + ["chrX" ,"chrY" ]
97+
98+ def compute_pausing_indices (genes_file ,outfile ):
99+ # We define the pausing index as the number of reads aligned to the TSS (+/- 250bp) divided by the number of reads aligned to the gene body.
100+ d = {"gene" :[]}
101+ for sample in sample2bam :
102+ d [sample ] = []
103+ with open (genes_file ,"r" ) as infile :
104+ for line in infile :
105+ linesplit = line .rstrip ("\n " ).split ("\t " )
106+ chr = linesplit [0 ]
107+ if not chr in chromosomes : continue
108+ start = int (linesplit [1 ])
109+ end = int (linesplit [2 ])
110+ gene = linesplit [3 ]
111+ strand = linesplit [5 ]
112+ if strand == "+" :
113+ TSS = start
114+ else :
115+ TSS = end
116+ d ["gene" ].append (gene )
117+ for condition in ["KD48h" ,"control" ]:
118+ for replicate in replicates :
119+ sample = antibody + "_" + condition + "_R" + replicate
120+ coverage_TSS = 1 + sample2counter [sample ].get_coverage_of_region (sample2pysam [sample ],chr , [(TSS - windowsize_TSS // 2 , TSS + windowsize_TSS // 2 )])[0 ]
121+ coverage_genebody = 1 + sample2counter [sample ].get_coverage_of_region (sample2pysam [sample ],chr , [(start - windowsize_TSS // 2 , end + windowsize_TSS // 2 )])[0 ]
122+ ratio = coverage_TSS / coverage_genebody
123+ d [sample ].append (ratio )
124+ df_pausing_indices = pd .DataFrame (d )
125+ df_pausing_indices .to_csv (outfile ,sep = "\t " ,index = False )
126+
127+ compute_pausing_indices (os .path .join (working_dir ,"selected_genes_downregulated.bed" ),os .path .join (working_dir ,antibody + "_pausing-indices_downregulated.tsv" ))
128+ compute_pausing_indices (os .path .join (working_dir ,"selected_genes_upregulated.bed" ),os .path .join (working_dir ,antibody + "_pausing-indices_upregulated.tsv" ))
129+ compute_pausing_indices (os .path .join (working_dir ,"selected_genes_background.bed" ),os .path .join (working_dir ,antibody + "_pausing-indices_background.tsv" ))
130+
131+
132+
133+ #######################################################
134+ # Compute pvalues for differences in pausing index between the two conditions
135+
136+ for type in ["downregulated" ,"upregulated" ,"background" ]:
137+ print (type )
138+ df_pausing_indices = pd .read_csv (os .path .join (working_dir ,antibody + "_pausing-indices_" + type + ".tsv" ),sep = "\t " ,index_col = "gene" )
139+ conditions = ["KD48h" ,"control" ]
140+
141+
142+ # For each gene, perform a t-test between the pausing index of 2 conditions (requires several replicates per condition).
143+ # Then, combine the pvalues of all the genes into a single pvalue
144+
145+ # Alternatively, for each gene, compute the log ratio of the pausing index between the two conditions, and then perform a t-test for all genes
146+ pvalues = []
147+ log_ratios_conditions = []
148+ diff_ratios_conditions = []
149+
150+ for gene in df_pausing_indices .index :
151+ # Collect the pausing indices for this gene, for each of the 2 conditions
152+ pausing_indices = {}
153+ for condition in conditions :
154+ pausing_indices [condition ]= []
155+ for replicate in replicates :
156+ pausing_indices [condition ].append (df_pausing_indices .loc [gene ,antibody + "_" + condition + "_R" + str (replicate )])
157+ # t-test between the two conditions
158+ pvalue = scipy .stats .ttest_ind (pausing_indices [conditions [0 ]],pausing_indices [conditions [1 ]])[1 ] # ,alternative="greater"
159+ if pvalue == pvalue :
160+ pvalues .append (pvalue )
161+
162+ # log ratio between the 2 conditions
163+ log_ratios_conditions .append (np .log (np .mean (pausing_indices [conditions [0 ]]) / np .mean (pausing_indices [conditions [1 ]])))
164+ diff_ratios_conditions .append (np .mean (pausing_indices [conditions [0 ]]) - np .mean (pausing_indices [conditions [1 ]]))
165+
166+ ratios_condition = {}
167+ for condition in conditions :
168+ ratios_condition [condition ] = []
169+ for replicate in replicates :
170+ ratios_replicate = []
171+ for gene in df_pausing_indices .index :
172+ ratios_replicate .append (df_pausing_indices .loc [gene ,antibody + "_" + condition + "_R" + str (replicate )])
173+ ratios_condition [condition ].append (np .mean (ratios_replicate ))
174+
175+ pvalue_samples = scipy .stats .ttest_ind (ratios_condition [conditions [0 ]],ratios_condition [conditions [1 ]])[1 ]
176+ print (ratios_condition )
177+ print ("p value:" + str (pvalue_samples ))
178+
179+ # use Fisher's method to combine the pvalues.
180+ print (scipy .stats .combine_pvalues (pvalues ))
181+ pvalue_combined = scipy .stats .combine_pvalues (pvalues )[1 ]
182+ print ("pvalue combined: " + str (pvalue_combined ))
183+
184+
185+ # t-test between the two conditions
186+ pvalue2 = scipy .stats .ttest_1samp (diff_ratios_conditions ,0 )[1 ] # alternative="greater"
187+ print ("pvalue diff: " + str (pvalue2 ))
188+ pvalue3 = scipy .stats .ttest_1samp (log_ratios_conditions ,0 )[1 ] # alternative="greater"
189+ print ("pvalue log ratio: " + str (pvalue3 ))
190+
191+ matplotlib .rcParams .update ({'font.size' : 20 })
192+ #plt.hist(log_ratios_conditions,bins=np.arange(-1.6,1.6,0.1),density=False)
193+ plt .hist (diff_ratios_conditions ,bins = np .arange (- 0.2 ,0.2 ,0.01 ),density = False )
194+ plt .xlabel ("Difference of pausing index between KD and control" )
195+ plt .ylabel ("Number of transcripts" )
196+ plt .title ("pvalue: " + "{:.2E}" .format (pvalue2 ))
197+ plt .savefig (os .path .join (working_dir ,antibody + "_ttest_" + type + ".svg" ),bbox_inches = "tight" )
198+ plt .cla ()
199+ plt .clf ()
200+ plt .close ('all' )
201+ series_diff_ratio = pd .Series (diff_ratios_conditions ,index = df_pausing_indices .index )
202+ series_diff_ratio .to_csv (os .path .join (working_dir ,antibody + "_diff_pausing-indices_" + type + ".tsv" ),sep = "\t " )
203+
204+ matplotlib .rcParams .update ({'font.size' : 20 })
205+ #plt.hist(log_ratios_conditions,bins=np.arange(-1.6,1.6,0.1),density=False)
206+ plt .hist (log_ratios_conditions ,bins = np .arange (- 0.2 ,0.2 ,0.01 ),density = False )
207+ plt .xlabel ("Log ratio of pausing index between KD and control" )
208+ plt .ylabel ("Number of transcripts" )
209+ plt .title ("pvalue: " + "{:.2E}" .format (pvalue3 ))
210+ plt .savefig (os .path .join (working_dir ,antibody + "_ttestLogRatio_" + type + ".svg" ),bbox_inches = "tight" )
211+ plt .cla ()
212+ plt .clf ()
213+ plt .close ('all' )
214+
215+
216+ plt .hist (pvalues )
217+ plt .xlabel ("pvalue" )
218+ plt .ylabel ("Number of transcripts" )
219+ plt .title ("Combined pvalue:" + "{:.2E}" .format (pvalue_combined ))
220+ plt .savefig (os .path .join (working_dir ,antibody + "_pvalues_" + type + ".svg" ),bbox_inches = "tight" )
221+ plt .cla ()
222+ plt .clf ()
223+ plt .close ('all' )
0 commit comments