Skip to content

Commit 4232565

Browse files
committed
qc: Added SNV catalogue draft #TASK-6766
1 parent 5daa94c commit 4232565

4 files changed

Lines changed: 1197 additions & 50 deletions

File tree

Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
#!/usr/bin/env python3
2+
3+
import sys
4+
import os
5+
import logging
6+
import json
7+
import gzip
8+
import re
9+
10+
import pysam
11+
12+
from utils import create_output_dir, execute_bash_command, bgzip_vcf, get_reverse_complement, generate_results_json
13+
14+
LOGGER = logging.getLogger('variant_qc_logger')
15+
16+
REFERENCE_GENOME_FASTA_FNAME = 'Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz'
17+
18+
19+
class MutationalCatalogueAnalysis:
20+
def __init__(self, vcf_file, resource_dir, config_json, output_dir, sample_id):
21+
"""Create output dir
22+
23+
:param str vcf_file: VCF input file path
24+
:param str resource_dir: Output directory path for resources
25+
:param dict config_json: Configuration JSON
26+
:param str output_dir: Output directory path for this analysis
27+
:param str sample_id: Sample ID
28+
"""
29+
self.vcf_file = vcf_file
30+
self.resource_dir = resource_dir
31+
self.config_json = config_json
32+
self.output_dir = output_dir
33+
self.sample_id = sample_id
34+
35+
# Getting reference genome file path
36+
self.ref_genome_fasta_fpath = os.path.join(resource_dir, REFERENCE_GENOME_FASTA_FNAME)
37+
38+
# Getting variant type from query
39+
self.ms_type = self.get_ms_type()
40+
41+
# Intermediate files
42+
self.snv_vcf_fpath = os.path.join(self.output_dir,
43+
'SNV-' + re.sub('\.gz$', '', os.path.basename(self.vcf_file)) + '.bgz')
44+
self.sv_vcf_fpath = os.path.join(self.output_dir,
45+
'SV-' + re.sub('\.gz$', '', os.path.basename(self.vcf_file)) + '.bgz')
46+
self.snv_genome_context_fpath = os.path.join(self.output_dir,
47+
'OPENCGA_{}_GRCh38_genome_context.csv'.format(self.sample_id))
48+
49+
# CONFIG
50+
# "msId": "",
51+
# "msDescription": "",
52+
# "msQuery": "", # "query": "{\"fileData\":\"" + INDIVIDUALS[i] + ".annot.muts.caveman.vcf.gz:FILTER=PASS;CLPM<=0;ASMD>=140\", \"region\": \"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,X,Y\", \"type\":\"SNV\"}",
53+
#
54+
# "msFitId": "",
55+
# "msFitMethod": "",
56+
# "msFitNBoot": 0,
57+
# "msFitSigVersion": "",
58+
# "msFitOrgan": "",
59+
# "msFitThresholdPerc": {},
60+
# "msFitThresholdPval": {},
61+
# "msFitMaxRareSigs": 0,
62+
# "msFitSignaturesFile": "",
63+
# "msFitRareSignaturesFile": "",
64+
65+
def get_ms_type(self):
66+
ms_type = None
67+
if ('signatures' in self.config_json and self.config_json['signatures']
68+
and 'msQuery' in self.config_json['signatures'] and self.config_json['signatures']['msQuery']
69+
and 'type' in self.config_json['signatures']['msQuery'] and self.config_json['signatures']['msQuery']['type']):
70+
ms_type = self.config_json['signatures']['msQuery']['type']
71+
return ms_type
72+
73+
@staticmethod
74+
def vcf_filter_iterator(vcf_fpath, opencga_query, header=True):
75+
"""Filter a VCF given an opencga query
76+
77+
:param str vcf_fpath: VCF input file path
78+
:param dict opencga_query: Output directory path for resources
79+
:param bool header: Include VCF header if true
80+
:returns: VCF variants
81+
"""
82+
# BGZIPping VCF (pysam requirement)
83+
vcf_fpath = bgzip_vcf(vcf_fpath)
84+
85+
# Translating variant types
86+
type_ = []
87+
if 'type' in opencga_query:
88+
for t in opencga_query['type'].split(','):
89+
if t == 'SNV':
90+
type_ += ['Sub']
91+
if t == 'INDEL':
92+
type_ += ['Del', 'Ins']
93+
if t == 'SV':
94+
type_ += ['BND', 'DELETION', 'BREAKEND', 'DUPLICATION', 'TANDEM_DUPLICATION', 'INVERSION', 'TRANSLOCATION']
95+
if t == 'CNV':
96+
type_ += ['CNV', 'COPY_NUMBER']
97+
98+
# Opening VCF file (BGZIP VCF file)
99+
pysam_vcf_fhand = pysam.VariantFile(vcf_fpath)
100+
101+
# FILTERING
102+
if header:
103+
yield pysam_vcf_fhand.header
104+
for record in pysam_vcf_fhand:
105+
# Getting all VCF types
106+
vcf_types = list(filter(None, [record.info.get('VT'), record.info.get('EXT_SVTYPE'), record.info.get('SVTYPE')]))
107+
if not set(vcf_types).intersection(type_):
108+
continue
109+
110+
yield record
111+
112+
def create_snv_genome_context_file(self):
113+
"""Create a genome context file that contains all SNVs and their flanking bases
114+
115+
e.g.
116+
1:10026:A:C TAA
117+
1:10120:T:A CTA
118+
1:10126:T:A CTA
119+
"""
120+
121+
# TODO Is this already provided and we just have to take it from a folder?
122+
# Create VCF file with ALL SNVs
123+
snv_vcf_fhand = open(self.snv_vcf_fpath[:-4], 'w')
124+
for variant in self.vcf_filter_iterator(vcf_fpath=self.vcf_file, opencga_query={"type": "SNV"}):
125+
snv_vcf_fhand.write(str(variant))
126+
snv_vcf_fhand.close()
127+
self.snv_vcf_fpath = bgzip_vcf(self.snv_vcf_fpath[:-4], delete_original=True) # BGZIPping VCF (pysam req)
128+
129+
# Opening SNV VCF file (BGZIP VCF file)
130+
pysam_snv_vcf_fhand = pysam.VariantFile(self.snv_vcf_fpath)
131+
132+
# Opening reference genome FASTA file
133+
ref_genome = pysam.FastaFile(self.ref_genome_fasta_fpath)
134+
135+
# Creating genome context file to write
136+
snv_genome_context_fhand = open(self.snv_genome_context_fpath, 'w')
137+
138+
# Writing context
139+
flank = 1 # How many bases the variant should be flanked
140+
for record in pysam_snv_vcf_fhand:
141+
# The position in the vcf file is 1-based, but pysam's fetch() expects 0-base coordinate
142+
triplet = ref_genome.fetch(record.chrom, record.pos - 1 - flank, record.pos - 1 + len(record.ref) + flank)
143+
144+
# Writing out contexts as "VARID\tTRIPLET" (1:10026:A:C\tTAA) :
145+
snv_genome_context_fhand.write(
146+
'{}:{}:{}:{}\t{}\n'.format(record.chrom, record.pos, record.ref, record.alts[0], triplet)
147+
)
148+
snv_genome_context_fhand.close()
149+
150+
def create_snv_signature_catalogue(self):
151+
152+
# Getting variant contexts
153+
snv_genome_context_fhand = open(self.snv_genome_context_fpath, 'r')
154+
snv_contexts = {line.split()[0]: line.split()[1] for line in snv_genome_context_fhand}
155+
snv_genome_context_fhand.close()
156+
157+
# TODO Filter SNV VCF to get queried SNVs - Is this already provided and we just have to take it from a folder?
158+
159+
# Opening SNV VCF file (BGZIP VCF file)
160+
pysam_snv_vcf_fhand = pysam.VariantFile(self.snv_vcf_fpath)
161+
162+
# Counting SNV contexts
163+
counts = {}
164+
for record in pysam_snv_vcf_fhand:
165+
var_id = ':'.join(map(str, [record.chrom, record.pos, record.ref, ','.join(list(record.alts))]))
166+
167+
# Creating key "first_flanking_base[REF>ALT]second_flanking_base". e.g. "A[C>A]T"
168+
# Reverse complement contexts whose first flanking base is not "C" or "T"
169+
# Main groups in SNV mutational profiles: C>A, C>G, C>T, T>A, T>C, T>G
170+
if record.ref not in ['C', 'T']:
171+
context = get_reverse_complement(snv_contexts[var_id])
172+
alt = get_reverse_complement(var_id.split(':')[3])
173+
else:
174+
context = snv_contexts[var_id]
175+
alt = var_id.split(':')[3]
176+
context_key = '{}[{}>{}]{}'.format(context[0], context[1], alt, context[2])
177+
counts[context_key] = counts.get(context_key, 0) + 1
178+
179+
# Creating results
180+
results = {'signatures': [{'counts': [{'context': k, 'total': counts[k]} for k in counts]}]}
181+
generate_results_json(results=results, outdir_path=self.output_dir)
182+
183+
def create_sv_clustering(self, sv_vcf_fpath):
184+
"""Executes R script sv_clustering.R to generate clustering for SVs
185+
CMD: Rscript sv_clustering.R ./in.bedpe ./out.bedpe
186+
187+
Input file:
188+
chrom1 start1 end1 chrom2 start2 end2 sample
189+
1 100 100 1 200 200 s1
190+
2 100 100 1 200 200 s1
191+
2 200 200 1 300 300 s1
192+
193+
Output file:
194+
chrom1 start1 end1 chrom2 start2 end2 sample id is.clustered
195+
1 100 100 1 200 200 s1 1 FALSE
196+
2 100 100 1 200 200 s1 2 FALSE
197+
2 200 200 1 300 300 s1 3 FALSE
198+
199+
:param str sv_vcf_fpath: SV VCF input file path
200+
:returns: The created output dir file
201+
"""
202+
203+
# Opening SV VCF file (BGZIP VCF file)
204+
pysam_sv_vcf_fhand = pysam.VariantFile(sv_vcf_fpath)
205+
206+
# Writing input file
207+
in_bedpe_fpath = os.path.join(self.output_dir, 'in.bedpe')
208+
in_bedpe_fhand = open(in_bedpe_fpath, 'w')
209+
in_bedpe_fhand.write('\t'.join(['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2', 'sample']) + '\n')
210+
mate_ids = []
211+
for record in pysam_sv_vcf_fhand:
212+
mate_ids.append(record.info.get('MATEID'))
213+
if record.info.get('VCF_ID') in mate_ids: # Skip mates of already visited SVs
214+
continue
215+
chrom2, pos2 = re.findall('.*[\[\]](.+):(.+)[\[\]].*', record.alts[0])[0]
216+
line = '\t'.join(map(str, [record.chrom, record.pos, record.pos, chrom2, pos2, pos2, self.sample_id]))
217+
in_bedpe_fhand.write(line + '\n')
218+
in_bedpe_fhand.close()
219+
220+
# Executing SV clustering
221+
r_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'sv_clustering.R')
222+
out_bedpe_fpath = os.path.join(self.output_dir, 'out.bedpe')
223+
cmd = 'Rscript {} {} {}'.format(r_script, in_bedpe_fpath, out_bedpe_fpath)
224+
execute_bash_command(cmd)
225+
226+
return out_bedpe_fpath
227+
228+
def create_sv_clustered_context_file(self):
229+
230+
# TODO Is this already provided and we just have to take it from a folder?
231+
# Create VCF file with ALL SVs
232+
sv_vcf_fhand = open(self.sv_vcf_fpath[:-4], 'w')
233+
for variant in self.vcf_filter_iterator(vcf_fpath=self.vcf_file, opencga_query={"type": "SV"}):
234+
sv_vcf_fhand.write(str(variant))
235+
sv_vcf_fhand.close()
236+
self.sv_vcf_fpath = bgzip_vcf(self.sv_vcf_fpath[:-4], delete_original=True) # BGZIPping VCF (pysam req)
237+
238+
# Generating clustering for SVs
239+
out_bedpe_fpath = self.create_sv_clustering(self.sv_vcf_fpath)
240+
241+
# Counting SV contexts
242+
# https://cancer.sanger.ac.uk/signatures/sv/sv1/
243+
# TODO
244+
245+
246+
def run(self):
247+
# Creating mutational signature catalogue
248+
if self.ms_type == 'SNV':
249+
self.create_snv_genome_context_file()
250+
self.create_snv_signature_catalogue()
251+
elif self.ms_type == 'SV':
252+
self.create_sv_clustered_context_file()
253+
# self.create_sv_signature_catalogue()
254+
pass
255+
else:
256+
msg = 'Mutational signature for type "{}" not implemented'.format(self.ms_type)
257+
raise ValueError(msg)
258+
259+
260+
261+

opencga-app/app/analysis/variant-qc/sample_qc/sample_qc.py

Lines changed: 15 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import json
77

88
from utils import create_output_dir, execute_bash_command
9+
from sample_qc.mutational_catalogue import MutationalCatalogueAnalysis
910

1011
LOGGER = logging.getLogger('variant_qc_logger')
1112

@@ -40,23 +41,30 @@ def __init__(self, vcf_file, info_file, bam_file, config, resource_dir, output_p
4041

4142
def run(self):
4243

44+
# TODO check if sample is somatic
45+
4346
# Genome plot
4447
if 'skip' not in self.config_json or 'genomePlot' not in self.config_json['skip']:
4548
self.create_genome_plot()
4649

50+
# Mutational signatures
51+
if 'skip' not in self.config_json or 'mutationalCatalog' not in self.config_json['skip']:
52+
self.create_mutational_catalogue()
4753

4854

49-
# self.bcftools_stats(vcf_file=self.vcf_file)
50-
51-
# missingness()
52-
# heterozygosity ()
53-
# roh()
54-
# upd()
55-
5655
# Return results
5756
# ... # TODO return results
5857
pass
5958

59+
def create_mutational_catalogue(self):
60+
# Create output dir for this analysis
61+
output_dir = create_output_dir([self.output_parent_dir, 'mutational_catalogue'])
62+
63+
# Execute analysis
64+
mca = MutationalCatalogueAnalysis(self.vcf_file, self.resource_dir, self.config_json, output_dir, self.id_)
65+
mca.run()
66+
67+
6068
def get_genome_plot_vcfs(self, output_dir, gp_config_json):
6169

6270
# Creating VCF files for each variant type
@@ -111,39 +119,3 @@ def create_genome_plot(self):
111119
)
112120

113121
return_code, stdout, stderr = execute_bash_command(cmd)
114-
115-
116-
def bcftools_stats(self, vcf_file):
117-
"""
118-
Calculates VCF stats using BCFTools
119-
:param str vcf_file: VCF file to get stats from
120-
:return:
121-
"""
122-
# Creating output dir for bcftools
123-
output_dir = create_output_dir([self.output_parent_dir, 'bcftools'])
124-
125-
# Running bcftools
126-
cmd_bcftools = 'bcftools stats -v ' + vcf_file + ' > ' + os.path.join(output_dir, 'bcftools_stats.txt')
127-
execute_bash_command(cmd=cmd_bcftools)
128-
LOGGER.info("BCFTools stats calculated successfully for {file}".format(file=vcf_file))
129-
130-
# Plot stats using plot-vcfstats
131-
cmd_vcfstats = 'plot-vcfstats -p ' + output_dir + '/bcftools_stats_plots ' + output_dir + '/bcftools_stats.txt'
132-
execute_bash_command(cmd=cmd_vcfstats)
133-
LOGGER.info("Plot stats using plot-vcfstats executed successfully for {file}".format(file=vcf_file))
134-
135-
# TODO: Future implementation for vcf stats plots
136-
#plot_bcftools_stats(file=output_dir + '/bcftools_stats.txt', prefix="stats", output=output_dir)
137-
138-
139-
if __name__ == '__main__':
140-
vcf_file = "/home/mbleda/Downloads/KFSHRC/CGMQ2022-02850.vcf.gz"
141-
info_file = ""
142-
bam_file = ""
143-
config = ""
144-
output_parent_dir = "/tmp/qc_tests/"
145-
sample_ids = []
146-
id_ = ""
147-
se = SampleQCExecutor(vcf_file=vcf_file, info_file=info_file, bam_file=bam_file, config=config,
148-
output_parent_dir=output_parent_dir, sample_ids=sample_ids, id_=id_)
149-
sys.exit(se.run())

0 commit comments

Comments
 (0)