Skip to content

Commit 2ba6039

Browse files
committed
2025-02-28
1 parent 635e238 commit 2ba6039

File tree

4 files changed

+54
-13
lines changed

4 files changed

+54
-13
lines changed

bin/R/compile-sequencing-statistics.R

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ library(tidyverse)
66
parser <- ArgumentParser(description = "Script to process MTBseq and TBProfiler data")
77

88
# Define arguments
9-
parser$add_argument("--minimum_coverage", required=TRUE, type="integer", help="Minimum coverage threshold")
109
parser$add_argument("--dictionary_path", default=NULL, help="Path to R dictioanry for renaming files")
1110
parser$add_argument("--runID", required=TRUE, help="RunID")
1211

@@ -20,7 +19,6 @@ mtbseq_statistics <- args$mtbseq_statistics
2019
mtbseq_classification <- args$mtbseq_classification
2120
tbprofiler_tbdb <- args$tbprofiler_tbdb
2221
tbprofiler_who <- args$tbprofiler_who
23-
minimum_coverage <- args$minimum_coverage
2422
dictionary_path <- args$dictionary_path
2523
lineage_fractions <- args$lineage_fractions
2624
runID <- args$runID
@@ -73,12 +71,9 @@ resistance_profiles_WHO.df <- dictionary_rename(df = tbprofiler_who,
7371
"/dict/resistance_profiles_WHO.csv"))
7472

7573
# Create the list of genomes for pairwise analysis
76-
7774
pairwise_analysis.df <- full.df.final |>
78-
filter(`Unambiguous Coverage median` >= minimum_coverage) |>
79-
filter(infection_type == "Clonal") |> #only clonal genomes
80-
filter(sub_lineage != "NA") |> # no unclassified genomes
81-
select(SampleID=FullID,main_lineage,sub_lineage)
75+
select(SampleID=FullID,main_lineage,sub_lineage) |>
76+
filter(main_lineage != "NA" & !str_detect(main_lineage, ";") & !str_detect(sub_lineage, ";"))
8277

8378
# export all the ouputs (broken!)
8479
write.csv2(sequencing_summary.df,
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
Col,Name,Description,Minimum value
2+
X1,Date,The date of MTBseq execution,NA
3+
X2,SampleID,Sample ID (XX-XX),NA
4+
X3,LibraryID,Libeary ID (_XX),NA
5+
X4,FullID, Complete SampleID_LibraryID,NA
6+
X5,Total Reads,The total amount of sequenced reads,500000
7+
X6,Mapped Reads,Number of reads mapped to the reference genome,NA
8+
X7,% Mapped Reads,The percentage of reads mapped to the reference genome,NA
9+
X8,Genome Size,The size of the reference genome,NA
10+
X9,Genome GC,The GC content of the reference genome,NA
11+
X10,(Any) Total Bases,Number of nt the reference genome covered by reads,NA
12+
X11,% (Any) Total Bases,Percentage of the reference genome covered by reads,NA
13+
X12,(Any) GC-Content,GC content of the reference genome covered by reads,NA
14+
X13,(Any) Coverage mean,Mean coverage depth,NA
15+
X14,(Any) Coverage medianm,Median coverage depth,NA
16+
X15,(Unambiguous) Total Bases,Number of nt of the reference genome covered unambiguously,NA
17+
X16,% (Unambiguous) Total Bases,Percentage of the reference genome covered unambiguously,0.95
18+
X17,(Unambiguous) GC-Content,GC content of the reference genome covered unambiguously,NA
19+
X18,(Unambiguous) Coverage mean,Mean coverage depth of unambiguously covered positions,NA
20+
X19,(Unambiguous) Coverage median,Median coverage depth of unambiguously covered positions,50
21+
X20,SNPs,Number of detected SNPs,NA
22+
X21,Deletions,Number of detected deletions,NA
23+
X22,Insertions,Number of detected insertions,NA
24+
X23,Uncovered,Positions of the reference genome not covered by a read,NA
25+
X24,Substitutions (Including Stop Codons),Number of substitutions within genes,NA

modules/local/filtering/compile-sequencing-stats/main.nf

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
process COMPILE_SEQUENCING_STATS {
22

3+
/*
4+
In this module theWhoverall sequencing statistics for the BBDD are
5+
calculated
6+
TODO: Fix the Rscript compile-sequencing-statistics.R as it is generating
7+
and incorrect pairwise_analysis.list.csv
8+
*/
9+
310
conda params.r_stats_env
411

512
publishDir "${params.outdir}/bbdd/results/", mode: 'copy'
@@ -34,8 +41,9 @@ process COMPILE_SEQUENCING_STATS {
3441
3542
# Generate summary statistics and create the sampleID,lineage df for
3643
## creating into a channel TODO: need to fix this script in generating the output for tuplec creation
44+
# the production of the pairwise_analysis.list.csv doest work
3745
Rscript ${params.r_script_dir}/compile-sequencing-statistics.R \\
38-
--minimum_coverage ${params.mtbseq_min_cov} \\
46+
--minimum_coverage ${params.mtbseq_min_depth} \\
3947
--runID ${runID} \\
4048
--dictionary_path ${params.r_script_dir}
4149
@@ -47,10 +55,16 @@ process COMPILE_SEQUENCING_STATS {
4755
mv tmp.${runID}.sequencing_summary.csv ${runID}.sequencing_summary.csv
4856
4957
# Create the file to go to the tuple seperation
50-
awk -F "\t" '{ if ( \$14 > ${params.mtbseq_min_cov} ) print \$4 }' Mapping_and_Variant_Statistics.tab | sort | uniq > min.qual.genomes
58+
Rscript -e 'library(tidyverse)
59+
df <- read_delim("Mapping_and_Variant_Statistics.tab", delim = "\t", col_names = FALSE) |>
60+
distinct() |> filter(X5 >= ${params.mtbseq_min_reads} & X16 >= ${params.mtbseq_min_cov} & X19 >= ${params.mtbseq_min_depth}) |>
61+
select(X4) |> distinct()
62+
write.csv(df, "min.qual.genomes", quote = FALSE, row.names = FALSE)
63+
'
5164
sed 's/\t/,/g' tbdb-tbprofiler.txt | cut -d ',' -f1,2,3 > tmp.pairwise_analysis.list.csv
52-
grep -f min.qual.genomes tmp.pairwise_analysis.list.csv > pairwise_analysis.list.csv
65+
# remove any ';' which is used in the mixed lienages
66+
grep -f min.qual.genomes tmp.pairwise_analysis.list.csv | grep -v ';' > pairwise_analysis.list.csv
5367
touch pairwise_analysis.list.csv
5468
5569
"""
56-
}
70+
}

nextflow.config

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ params {
5252
"lineage4.7", "lineage4.8", "lineage4.9" ]
5353

5454
// MTBseq default values
55-
mtbseq_min_cov = 50 // minimum coverage as calculated by MTBseq
55+
mtbseq_min_cov = 0.95 // minimum breadth coverage as calculated by MTBseq for pairwise analysis
56+
mtbseq_min_depth = 50 // minimum depth coverage as calculated by MTBseq for pairwise analysis
57+
mtbseq_min_reads = 500000
5658
mtbseq_minbqual = 20
5759
mtbseq_mincovf = 4
5860
mtbseq_mincovr = 4
@@ -62,6 +64,11 @@ params {
6264
mtbseq_window = 10
6365
mtbseq_snp_distance = ["5", "10", "15"] // a tuple of all the distances to compared
6466

67+
// ONT parameters (WIP)
68+
ont_mtbseq_min_cov = 0.95 // minimum breadth coverage as calculated by MTBseq for pairwise analysis
69+
ont_mtbseq_min_depth = 30 // minimum depth coverage as calculated by MTBseq for pairwise analysis
70+
ont_mtbseq_min_reads = 10000
71+
6572
// IQ-Tree bootstraps
6673
iqtree_bootstraps = 1000
6774
iqtree_model = 'GTR+G4'
@@ -266,7 +273,7 @@ profiles {
266273
}
267274

268275
withName: MTBSEQ_LINEAGE_JOINT_AMEND {
269-
cpus = { Math.min(6 * task.attempt, 28) }
276+
cpus = { Math.min(8 * task.attempt, 28) }
270277
memory = { 16.GB * task.attempt }
271278
time = 24.hour
272279
errorStrategy = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }

0 commit comments

Comments
 (0)