2025-02-28

pjhesbest · pjhesbest · commit 2ba603929bf7 · 2025-02-28T11:12:10.000+01:00
diff --git a/bin/R/compile-sequencing-statistics.R b/bin/R/compile-sequencing-statistics.R
@@ -6,7 +6,6 @@ library(tidyverse)
     parser <- ArgumentParser(description = "Script to process MTBseq and TBProfiler data")
 
 # Define arguments
-parser$add_argument("--minimum_coverage", required=TRUE, type="integer", help="Minimum coverage threshold")
 parser$add_argument("--dictionary_path", default=NULL, help="Path to R dictioanry for renaming files")
 parser$add_argument("--runID", required=TRUE, help="RunID")
 
@@ -20,7 +19,6 @@ mtbseq_statistics     <- args$mtbseq_statistics
 mtbseq_classification <- args$mtbseq_classification
 tbprofiler_tbdb       <- args$tbprofiler_tbdb
 tbprofiler_who        <- args$tbprofiler_who
-minimum_coverage      <- args$minimum_coverage
 dictionary_path       <- args$dictionary_path
 lineage_fractions     <- args$lineage_fractions
 runID                 <- args$runID
@@ -73,12 +71,9 @@ resistance_profiles_WHO.df <- dictionary_rename(df = tbprofiler_who,
                         "/dict/resistance_profiles_WHO.csv"))
 
 # Create the list of genomes for pairwise analysis
-
 pairwise_analysis.df <- full.df.final |>
-                filter(`Unambiguous Coverage median` >= minimum_coverage) |>
-                filter(infection_type == "Clonal") |> #only clonal genomes
-                filter(sub_lineage != "NA") |> # no unclassified genomes
-                select(SampleID=FullID,main_lineage,sub_lineage)
+        select(SampleID=FullID,main_lineage,sub_lineage) |>
+        filter(main_lineage != "NA" & !str_detect(main_lineage, ";") & !str_detect(sub_lineage, ";"))
 
 # export all the ouputs (broken!)
 write.csv2(sequencing_summary.df,
diff --git a/bin/R/dict/mtbseq_statistict_main.dict.csv b/bin/R/dict/mtbseq_statistict_main.dict.csv
@@ -0,0 +1,25 @@
+Col,Name,Description,Minimum value
+X1,Date,The date of MTBseq execution,NA
+X2,SampleID,Sample ID (XX-XX),NA
+X3,LibraryID,Library ID (_XX),NA
+X4,FullID, Complete SampleID_LibraryID,NA
+X5,Total Reads,The total amount of sequenced reads,500000
+X6,Mapped Reads,Number of reads mapped to the reference genome,NA
+X7,% Mapped Reads,The percentage of reads mapped to the reference genome,NA
+X8,Genome Size,The size of the reference genome,NA
+X9,Genome GC,The GC content of the reference genome,NA
+X10,(Any) Total Bases,Number of nt the reference genome covered by reads,NA
+X11,% (Any) Total Bases,Percentage of the reference genome covered by reads,NA
+X12,(Any) GC-Content,GC content of the reference genome covered by reads,NA
+X13,(Any) Coverage mean,Mean coverage depth,NA
+X14,(Any) Coverage medianm,Median coverage depth,NA
+X15,(Unambiguous) Total Bases,Number of nt of the reference genome covered unambiguously,NA
+X16,% (Unambiguous) Total Bases,Percentage of the reference genome covered unambiguously,0.95
+X17,(Unambiguous) GC-Content,GC content of the reference genome covered unambiguously,NA
+X18,(Unambiguous) Coverage mean,Mean coverage depth of unambiguously covered positions,NA
+X19,(Unambiguous) Coverage median,Median coverage depth of unambiguously covered positions,50
+X20,SNPs,Number of detected SNPs,NA
+X21,Deletions,Number of detected deletions,NA
+X22,Insertions,Number of detected insertions,NA
+X23,Uncovered,Positions of the reference genome not covered by a read,NA
+X24,Substitutions (Including Stop Codons),Number of substitutions within genes,NA
diff --git a/modules/local/filtering/compile-sequencing-stats/main.nf b/modules/local/filtering/compile-sequencing-stats/main.nf
@@ -1,5 +1,12 @@
 process COMPILE_SEQUENCING_STATS {
 
+/*
+    In this module the overall sequencing statistics for the BBDD are
+        calculated
+        TODO: Fix the Rscript compile-sequencing-statistics.R as it is generating
+            an incorrect pairwise_analysis.list.csv
+*/
+
     conda params.r_stats_env
 
     publishDir "${params.outdir}/bbdd/results/", mode: 'copy'
@@ -34,8 +41,9 @@ process COMPILE_SEQUENCING_STATS {
 
     # Generate summary statistics and create the sampleID,lineage df for
     ## creating into a channel TODO: need to fix this script in generating the output for tuplec creation
+        # the production of the pairwise_analysis.list.csv does not work
         Rscript ${params.r_script_dir}/compile-sequencing-statistics.R \\
-                    --minimum_coverage ${params.mtbseq_min_cov} \\
+                    --minimum_coverage ${params.mtbseq_min_depth} \\
                     --runID ${runID} \\
                     --dictionary_path ${params.r_script_dir}
 
@@ -47,10 +55,16 @@ process COMPILE_SEQUENCING_STATS {
         mv tmp.${runID}.sequencing_summary.csv ${runID}.sequencing_summary.csv
 
     # Create the file to go to the tuple seperation
-        awk -F "\t" '{ if ( \$14 > ${params.mtbseq_min_cov} ) print \$4 }' Mapping_and_Variant_Statistics.tab | sort | uniq > min.qual.genomes
+    Rscript -e 'library(tidyverse)
+                df <- read_delim("Mapping_and_Variant_Statistics.tab", delim = "\t", col_names = FALSE) |> 
+                    distinct() |> filter(X5 >= ${params.mtbseq_min_reads} & X16 >= ${params.mtbseq_min_cov} & X19 >= ${params.mtbseq_min_depth}) |>
+                    select(X4) |> distinct()
+                write.csv(df, "min.qual.genomes", quote = FALSE, row.names = FALSE)
+                '
         sed 's/\t/,/g' tbdb-tbprofiler.txt | cut -d ',' -f1,2,3 > tmp.pairwise_analysis.list.csv
-        grep -f min.qual.genomes tmp.pairwise_analysis.list.csv > pairwise_analysis.list.csv
+            # remove any rows containing ';', which is used in the mixed lineages
+        grep -f min.qual.genomes tmp.pairwise_analysis.list.csv | grep -v ';' > pairwise_analysis.list.csv
         touch pairwise_analysis.list.csv
 
     """
-}    
+}
diff --git a/nextflow.config b/nextflow.config
@@ -52,7 +52,9 @@ params {
                                 "lineage4.7", "lineage4.8", "lineage4.9" ]
 
     // MTBseq default values
-        mtbseq_min_cov          = 50 // minimum coverage as calculated by MTBseq
+        mtbseq_min_cov          = 0.95 // minimum breadth coverage as calculated by MTBseq for pairwise analysis
+        mtbseq_min_depth        = 50 // minimum depth coverage as calculated by MTBseq for pairwise analysis
+        mtbseq_min_reads        = 500000 
         mtbseq_minbqual         = 20
         mtbseq_mincovf          = 4
         mtbseq_mincovr          = 4
@@ -62,6 +64,11 @@ params {
         mtbseq_window           = 10
         mtbseq_snp_distance     = ["5", "10", "15"] // a tuple of all the distances to compared
 
+        // ONT parameters (WIP)
+        ont_mtbseq_min_cov      = 0.95 // minimum breadth coverage as calculated by MTBseq for pairwise analysis
+        ont_mtbseq_min_depth    = 30 // minimum depth coverage as calculated by MTBseq for pairwise analysis
+        ont_mtbseq_min_reads    = 10000 
+
     // IQ-Tree bootstraps
         iqtree_bootstraps       = 1000
         iqtree_model            = 'GTR+G4'
@@ -266,7 +273,7 @@ profiles {
             }
 
             withName: MTBSEQ_LINEAGE_JOINT_AMEND {
-                cpus                        = { Math.min(6 * task.attempt, 28) }
+                cpus                        = { Math.min(8 * task.attempt, 28) }
                 memory                      = { 16.GB * task.attempt }
                 time                        = 24.hour
                 errorStrategy               = { task.exitStatus in 137..140 ? 'retry' : 'terminate' }