Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
98 commits
Select commit Hold shift + click to select a range
0d07971
Update convert_ismapper_to_vcf.py
vrennie Aug 1, 2024
71ff6ab
Update convert_ismapper_to_vcf.py
vrennie Aug 1, 2024
5d7dbb6
Update convert_ismapper_to_vcf.py
vrennie Aug 1, 2024
b4d2586
Update merge__delly.nf
vrennie Aug 1, 2024
0fa99a1
Update convert_ismapper_to_vcf.py
vrennie Aug 1, 2024
2a02885
Update merge__delly.nf
vrennie Aug 1, 2024
5af42ad
Update convert_ismapper_to_vcf.py
vrennie Aug 1, 2024
e4881bc
Update merge__delly.nf
vrennie Aug 1, 2024
283f85e
Update convert_ismapper_to_vcf.py
vrennie Aug 1, 2024
57959e5
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
09f45df
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
cb608da
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
8fbbf26
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
31c7202
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
aba1998
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
1427c47
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
88c9d93
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
7e8afb7
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
3b1b2ba
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
879c43d
Update convert_ismapper_to_vcf.py
vrennie Aug 3, 2024
14c8ab3
Update convert_ismapper_to_vcf.py
vrennie Aug 6, 2024
f9ae941
Update convert_ismapper_to_vcf.py
vrennie Aug 6, 2024
1ff3924
Update convert_ismapper_to_vcf.py
vrennie Aug 6, 2024
9601a61
Update merge__delly.nf
vrennie Aug 6, 2024
9f6d220
Update merge__delly.nf
vrennie Aug 7, 2024
bbce102
Update merge__delly.nf
vrennie Aug 7, 2024
6ea8e53
Update merge__delly.nf
vrennie Aug 7, 2024
97f2e0a
Update merge__delly.nf
vrennie Aug 8, 2024
51fed9d
Update merge__delly.nf
vrennie Aug 8, 2024
ca61797
Update merge__delly.nf
vrennie Aug 8, 2024
804b5c6
Update merge__delly.nf
vrennie Aug 8, 2024
2cec405
initial docs website template [ci skip]
abhi18av Aug 23, 2024
d3f06e4
update the website config [ci skip
abhi18av Aug 23, 2024
c82b26c
add the baseline script from wilma [ci skip]
abhi18av Sep 12, 2024
9e3ca40
refactor wilma's script to a standard structure [ci skip]
abhi18av Sep 12, 2024
929eb48
reenable ismapper related code [ci skip]
abhi18av Sep 16, 2024
fcaf813
convert the bash script to a python one for better experience [ci skip]
abhi18av Sep 16, 2024
51933bc
iterim commit [ci skip]
abhi18av Sep 25, 2024
3cc4359
update the samplesheet validation script to use functions
abhi18av Oct 2, 2024
bc3ed7c
add the sanitization logic [ci skip]
abhi18av Oct 2, 2024
b3f01e1
refine the sample name extraction
abhi18av Oct 2, 2024
d0e833e
fix the escape characters
abhi18av Oct 2, 2024
d914e9b
fix escapes
abhi18av Oct 2, 2024
91b5dac
temp disable some analysis
abhi18av Oct 2, 2024
b3fc761
reenable the analysis
abhi18av Oct 2, 2024
b15bd68
isolate the bash script to a proper bin/script
abhi18av Oct 2, 2024
55f5603
rename file for the script template
abhi18av Oct 2, 2024
c8c36f7
update the process
abhi18av Oct 2, 2024
4874120
updated the python and bash scripts for eventual merge
abhi18av Oct 2, 2024
c3ee693
update the shell script as per Tim's comment
abhi18av Oct 2, 2024
57620b5
prepend study name
abhi18av Oct 2, 2024
ddde38d
make the colname explicit
abhi18av Oct 2, 2024
87e04f7
temp
abhi18av Oct 2, 2024
2e13608
remove -W since we are using an older version of bcftools
abhi18av Oct 2, 2024
09c1143
push the updated merge script, bcftools native gzip
abhi18av Oct 2, 2024
ea02342
iterate on patching structural variants [ci skip]
abhi18av Oct 7, 2024
c904f01
dev iteration [ci skip]
abhi18av Oct 7, 2024
b934cec
dev iteration [ci skip]
abhi18av Oct 7, 2024
2ab3e7e
dev iteration
abhi18av Oct 7, 2024
11dda17
dev iteration
abhi18av Oct 7, 2024
53114b1
dev iteration
abhi18av Oct 7, 2024
d3fbe40
transition to python script
abhi18av Oct 7, 2024
043fb2e
tweak the filenames for interim output
abhi18av Oct 7, 2024
4f29c67
dev iteration [ci skip]
abhi18av Oct 7, 2024
1ba7528
update container defs
abhi18av Oct 7, 2024
cfa44a5
update container
abhi18av Oct 7, 2024
d4c1cb5
update the parameters of python script
abhi18av Oct 7, 2024
1978997
dev iteration
abhi18av Oct 7, 2024
7480b7e
update the container
abhi18av Oct 7, 2024
62b6a3d
rely upon updated tbprofiler containers for patch script
abhi18av Oct 8, 2024
5762673
temp
abhi18av Oct 8, 2024
67513f2
finalize the patch script
abhi18av Oct 8, 2024
281e0c1
dev iteration [ci skip]
abhi18av Oct 8, 2024
2834f1a
update the patching script
abhi18av Oct 8, 2024
74f4051
revert the changes to magma-env-1
abhi18av Oct 8, 2024
b7a0734
initial docs website template [ci skip]
abhi18av Aug 23, 2024
3fc0d83
update the website config [ci skip
abhi18av Aug 23, 2024
b1cedcb
handle type-casting of fractional values into boolean one [ci skip]
abhi18av Sep 16, 2024
c84dffc
add a license header for all source files
abhi18av Sep 25, 2024
529201b
add licenseheaders for all config files as well
abhi18av Oct 1, 2024
c844bab
handle type-casting of fractional values into boolean one [ci skip]
abhi18av Sep 16, 2024
f3899ea
Update IS_Queries_Mtb.multi.fasta
mdediegofuertes Oct 9, 2024
544cd78
fix indentation
abhi18av Oct 9, 2024
4d0d285
fix commented code
abhi18av Oct 9, 2024
b4b8eaa
move from delly bcf to vcf
abhi18av Oct 11, 2024
a4cef95
forward the patched json output from structural variants
abhi18av Oct 14, 2024
1c8540b
add explicit svmethod for ismapper
abhi18av Oct 21, 2024
a178040
disable the extra view command
abhi18av Oct 21, 2024
e72dc6a
Update patch_tbprofiler_sv_output.py
mdediegofuertes Oct 29, 2024
3517600
Update patch_tbprofiler_sv_output.py
mdediegofuertes Oct 29, 2024
60fea56
Update patch_tbprofiler_sv_output.py
mdediegofuertes Oct 29, 2024
7adce66
Update patch_tbprofiler_sv_output.py
mdediegofuertes Oct 29, 2024
13723d1
Update patch_tbprofiler_sv_output.py
mdediegofuertes Oct 29, 2024
a4c8c93
Update patch_tbprofiler_sv_output.py
mdediegofuertes Oct 31, 2024
1255e17
update changelog for v2.0.0
abhi18av Apr 3, 2025
4b5a939
tweak gitignore [ci skip]
abhi18av Jun 21, 2025
a995988
Merge branch 'master' into develop
abhi18av Jun 21, 2025
8ada9b2
Update sample_stats.py (#236)
vrennie Sep 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,4 @@ containers/**/*yml
samplesheet.csv
magma.sh
.Rproj.user
/docs/_site
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@

7. Updates to the names of parameters for triggering partial/optional workflows

8. Update the MULTIQC report


## v1.1.1

Expand Down
47 changes: 47 additions & 0 deletions bin/bcftools_merge__delly.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash

# Create temporary directory
tmp_dir="interim"

mkdir $tmp_dir

# Extract unique sample prefixes from filenames
prefixes=( $(ls *.vcf.gz | xargs -n 1 basename | cut -d '.' -f 1,2 | sort -u) )





# Process each sample prefix
for prefix in "${prefixes[@]}"; do
# Find related files for the prefix
files=( $(find . -maxdepth 1 -name "${prefix}.*.vcf.gz" ! -name "*.csi" ) )

echo $files

if [[ ${#files[@]} -gt 0 ]]; then
# Concatenate files
concat_file="${tmp_dir}/${prefix}.concat.vcf.gz"
bcftools concat -o "$concat_file" -O v ${files[@]}
#bcftools view "$concat_file" -Oz -o "$concat_file"

# Sort and index the concatenated file
sorted_file="${tmp_dir}/${prefix}.concat.sorted.vcf.gz"
bcftools sort "$concat_file" -Oz -o "$sorted_file"
bcftools index "$sorted_file"

# Add sorted file to merged list
concat_files+=("$sorted_file")
fi
done

# Merge concatenated files (if any)
if [[ ${#concat_files[@]} -gt 0 ]]; then
bcftools merge "${concat_files[@]}" -o joint.delly.vcf
bgzip joint.delly.vcf
bcftools index joint.delly.vcf.gz
fi

# NOTE For now commenting this for debugging.
# Clean up temporary directory
#rm -rf "$tmp_dir"
94 changes: 71 additions & 23 deletions bin/convert_ismapper_to_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,39 @@
import argparse
import csv
import glob
import os
from Bio import SeqIO

# Define the VCF header
vcf_header = """##fileformat=VCFv4.2
##source=ISMapper
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=LowQual,Description="Poor quality and insufficient number of PEs and SRs.">
##ALT=<ID=DEL,Description="Deletion">
##ALT=<ID=DUP,Description="Duplication">
##ALT=<ID=INV,Description="Inversion">
##ALT=<ID=BND,Description="Breakend">
##ALT=<ID=INS,Description="Insertion">
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="PE confidence interval around END">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="PE confidence interval around POS">
##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for POS2 coordinate in case of an inter-chromosomal translocation">
##INFO=<ID=POS2,Number=1,Type=Integer,Description="Genomic position for CHR2 in case of an inter-chromosomal translocation">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant">
##INFO=<ID=PE,Number=1,Type=Integer,Description="Paired-end support of the structural variant">
##INFO=<ID=MAPQ,Number=1,Type=Integer,Description="Median mapping quality of paired-ends">
##INFO=<ID=SRMAPQ,Number=1,Type=Integer,Description="Median mapping quality of split-reads">
##INFO=<ID=SR,Number=1,Type=Integer,Description="Split-read support">
##INFO=<ID=SRQ,Number=1,Type=Float,Description="Split-read consensus alignment quality">
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Split-read consensus sequence">
##INFO=<ID=CE,Number=1,Type=Float,Description="Consensus sequence entropy">
##INFO=<ID=CT,Number=1,Type=String,Description="Paired-end signature induced connection type">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Insertion length for SVTYPE=INS">
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=PRECISE,Number=0,Type=Flag,Description="Precise structural variation">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SVMETHOD,Number=1,Type=String,Description="Type of approach used to detect SV">
##INFO=<ID=INSLEN,Number=1,Type=Integer,Description="Predicted length of the insertion">
##INFO=<ID=HOMLEN,Number=1,Type=Integer,Description="Predicted microhomology length using a max. edit distance of 2">
##INFO=<ID=Orientation,Number=1,Type=String,Description="Orientation of the transposable element">
##INFO=<ID=Gap,Number=1,Type=Integer,Description="Gap between sequences">
##INFO=<ID=Call,Number=1,Type=String,Description="Call information">
Expand All @@ -48,12 +75,21 @@
##INFO=<ID=Right_strand,Number=1,Type=String,Description="Strand of the right gene">
##INFO=<ID=Right_distance,Number=1,Type=Integer,Description="Distance to the right gene">
##INFO=<ID=Gene_interruption,Number=1,Type=String,Description="Gene interruption status">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##INFO=<ID=Insertion,Number=1,Type=String,Description="Details of the insertion">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Log10-scaled genotype likelihoods for RR,RA,AA genotypes">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=FT,Number=1,Type=String,Description="Per-sample genotype filter">
##FORMAT=<ID=RC,Number=1,Type=Integer,Description="Raw high-quality read counts or base counts for the SV">
##FORMAT=<ID=RCL,Number=1,Type=Integer,Description="Raw high-quality read counts or base counts for the left control region">
##FORMAT=<ID=RCR,Number=1,Type=Integer,Description="Raw high-quality read counts or base counts for the right control region">
##FORMAT=<ID=RDCN,Number=1,Type=Integer,Description="Read-depth based copy-number estimate for autosomal sites">
##FORMAT=<ID=DR,Number=1,Type=Integer,Description="# high-quality reference pairs">
##FORMAT=<ID=DV,Number=1,Type=Integer,Description="# high-quality variant pairs">
##FORMAT=<ID=RR,Number=1,Type=Integer,Description="# high-quality reference junction reads">
##FORMAT=<ID=RV,Number=1,Type=Integer,Description="# high-quality variant junction reads">
##reference=NC-000962-3-H37Rv.fa
##contig=<ID=NC-000962-3-H37Rv,length=4411532>
"""

# Function to read the reference genome
Expand All @@ -76,40 +112,34 @@ def read_transposable_elements(query_file):
te_name = record.id
te_info[te_name] = len(record.seq)

# NOTE: Renable in case we need to dump this on disk.
# with open("temp_transposable_elements.csv", mode='w', newline='') as file:
# writer = csv.writer(file)
# writer.writerow(["name", "length"])
# for te_name, te_length in te_info.items():
# writer.writerow([te_name, te_length])
return te_info

# Function to convert ISMapper data to VCF format
def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_info):
def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_info, ismapper_column_name):

is_mapper_txt_file = glob.glob(is_mapper_dir + "*.txt")[0]
sample_name = ismapper_column_name

with open(is_mapper_txt_file, 'r') as infile_is_mapper, open(vcf_file, 'w') as outfile:
# Write the VCF header
outfile.write(vcf_header)
# Write the VCF header with the correct sample name
outfile.write(vcf_header + f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{sample_name}\n")

# Read the ISMapper TSV file
reader_is_mapper = csv.DictReader(infile_is_mapper, delimiter='\t')

for row in reader_is_mapper:
chrom = "NC-000962-3-H37Rv" # Using the specified chromosome
pos = int(row['x']) # Convert position to integer
end = int(row['y']) # Convert end position to integer
region_id = row['region']
ref = reference_sequences[chrom][pos - 1] # Extract the reference allele from the reference sequence
orientation = row['orientation']
# FIXME Hard-code the name of this specific element
te_name = 'IS6110'
te_length = te_info.get(te_name, 'NA')
alt = f"{ref}[<{te_name},{orientation}>:{te_length}[" # Use transposable element, orientation, and its length
qual = '.'
qual = '1000' # Set QUAL to 1000
filter_status = 'PASS'
info = (
f"SVTYPE=BND;"
f"SVTYPE=INS;SVMETHOD=ISMAPPER;"
f"Orientation={orientation};"
f"Gap={row['gap']};"
f"Call={row['call']};"
Expand All @@ -123,13 +153,29 @@ def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_in
f"Right_description={row['right_description']};"
f"Right_strand={row['right_strand']};"
f"Right_distance={row['right_distance']};"
f"Gene_interruption={row['gene_interruption']}"
f"Gene_interruption={row['gene_interruption']};"
f"SVLEN={te_length};"
f"END={end};"
f"Insertion=<{te_name},{orientation}>:{te_length}"
)
format_field = "GT:AD:DP:GQ:PL"
sample_field = "1:10,10:10:99:1800"
format_field = "GT:GL:GQ:FT:RCL:RC:RCR:RDCN:DR:DV:RR:RV"
gl_value = "-665.375,-56.2674,0" # Example values for GL
gq_value = "10000" # Genotype Quality
ft_value = "PASS" # Per-sample genotype filter
rcl_value = "60468" # Raw high-quality read counts or base counts for the left control region
rc_value = "122933" # Raw high-quality read counts or base counts for the SV
rcr_value = "62465" # Raw high-quality read counts or base counts for the right control region
rdcn_value = "2" # Read-depth based copy-number estimate for autosomal sites
dr_value = "0" # High-quality reference pairs
dv_value = "0" # High-quality variant pairs
rr_value = "0" # High-quality reference junction reads
rv_value = "187" # High-quality variant junction reads
gt_value = "1/1" # Genotype for homozygous alternate allele

sample_field = f"{gt_value}:{gl_value}:{gq_value}:{ft_value}:{rcl_value}:{rc_value}:{rcr_value}:{rdcn_value}:{dr_value}:{dv_value}:{rr_value}:{rv_value}"

# Write the VCF entry
vcf_entry = f"{chrom}\t{pos}\t{region_id}\t{ref}\t{alt}\t{qual}\t{filter_status}\t{info}\t{format_field}\t{sample_field}\n"
vcf_entry = f"{chrom}\t{pos}\t{region_id}\t{ref}\t<INS>\t{qual}\t{filter_status}\t{info}\t{format_field}\t{sample_field}\n"
outfile.write(vcf_entry)

if __name__ == '__main__':
Expand All @@ -138,6 +184,7 @@ def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_in
parser.add_argument('--reference_file', required=True, help='Path to the reference genome file.')
parser.add_argument('--query_file', required=True, help='Path to the transposable elements multifasta file.')
parser.add_argument('--output_vcf_file', required=True, help='Path to the output VCF file.')
parser.add_argument('--ismapper_column_name', required=True, help='Explicit name of the ISMAPPER column.')

# Step 3: Parse the command-line arguments
args = parser.parse_args()
Expand All @@ -147,6 +194,7 @@ def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_in
vcf_file = args.output_vcf_file
reference_file = args.reference_file
query_file = args.query_file
ismapper_column_name = args.ismapper_column_name

# Read the reference genome sequence
reference_sequences, _ = read_reference_genome(reference_file)
Expand All @@ -155,4 +203,4 @@ def convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, te_in
query_info = read_transposable_elements(query_file)

# Run the conversion function
convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, query_info)
convert_is_mapper_to_vcf(is_mapper_dir, vcf_file, reference_sequences, query_info, ismapper_column_name)
4 changes: 3 additions & 1 deletion bin/generate_merged_cohort_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import argparse
import pandas as pd


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Summarize the CALL_WF and MINOR_VARIANTS_ANALYSIS_WF analysis results')
parser.add_argument('--relabundance_approved_tsv', default="approved_samples.relabundance.tsv", metavar='relabundance_approved_tsv', type=str, help='File enlisting the approved samples from MINOR_VARIANTS_ANALYSIS_WF')
Expand Down Expand Up @@ -70,6 +71,7 @@
df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'] = df_final_cohort_stats['BREADTH_OF_COVERAGE_THRESHOLD_MET'].fillna(0).astype('Int64')
df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'] = df_final_cohort_stats['RELABUNDANCE_THRESHOLD_MET'].fillna(0).astype('Int64')


# Derive the final threshold using Boolean operations
df_final_cohort_stats['ALL_THRESHOLDS_MET'] = (
df_final_cohort_stats['MAPPED_NTM_FRACTION_16S_THRESHOLD_MET'].apply(lambda x: bool(x) if pd.notna(x) else False) &
Expand All @@ -80,4 +82,4 @@
df_final_cohort_stats['ALL_THRESHOLDS_MET'] = df_final_cohort_stats['ALL_THRESHOLDS_MET'].replace({True: 1, False: 0})

# Write the final dataframe to file
df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
df_final_cohort_stats.to_csv(args['output_file'], sep="\t")
Loading