Merge pull request #4 from phesketh-igtp/development

phesketh-igtp · web-flow · commit 3a4dd165f413 · 2025-05-08T07:55:01.000+02:00
Development
diff --git a/README.md b/README.md
@@ -131,6 +131,5 @@ qsub -S /bin/bash -cwd -V -N nf-main \
         /path/to/RutiSeq-nf/main.nf \
         --samplesheet /path/to/RutiSeq-nf/test/samples.hpc.csv \
         --outdir /path/to/RutiSeq-nf/RutiSeq-test \
-        -profile igtp,conda_on 
-# this specifies that the job should be submitted to the IGTP HPC using conda
+        -profile SGE,conda_on
 ```
diff --git a/bin/R/create_pairwise_analysis_tuple.R b/bin/R/create_pairwise_analysis_tuple.R
@@ -43,7 +43,6 @@ filtered_meta <- meta %>%
 filtered_lineages_forward <- filtered_meta |> 
     filter(SampleID %in% run_ids$SampleID) |> 
     count(lineage) |> 
-    filter(n > 4) |> # rm lineage with less than 4 genome (min for MTBSeq)
     select(lineage) |> 
     distinct()  # Use distinct() instead of unique() for dplyr consistency
 
diff --git a/envs/conda/r-phylogeny_env.yml b/envs/conda/r-phylogeny_env.yml
@@ -3,6 +3,7 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
+  - conda-forge::r-base=4.4.2
   - bioconda::bioconductor-ggtree=3.14.0
   - bioconda::bioconductor-treeio=1.30.0
   - conda-forge::r-ape=5.8_1
diff --git a/main.nf b/main.nf
@@ -3,7 +3,6 @@
 nextflow.enable.dsl = 2
 
 include { FILE_CHECK }                  from './modules/local/file-checks/main.nf'
-include { TBPROFILER_DB_UPDATE }        from './modules/local/tbprofiler/db-update/main.nf'
 include { TAXONKIT_DB_UPDATE }          from './modules/local/taxonkit/db-update/main.nf'
 //include { NEGATIVE_CTRL_WF }            from './workflows/negative_ctrl_wf.nf'
 include { SINGLE_WF }                   from './workflows/single_wf.nf'
@@ -168,12 +167,11 @@ workflow {
 
         /*
         ······································································································
-            UPDATING THE DATABASE (TBPROFILER_DB_UPDATE)
+            UPDATING THE DATABASE
                 - The TBProfiler database is updated with the latest version of the database
         ······································································································
         */
 
-            TBPROFILER_DB_UPDATE( params.runID )
             TAXONKIT_DB_UPDATE( params.runID )
 
         /*
@@ -188,7 +186,6 @@ workflow {
 /*
             NEGATIVE_CTRL_WF( params.runID,
                                 controls_ch, 
-                                TBPROFILER_DB_UPDATE.out.tbprofiler_update_db,
                                 TAXONKIT_DB_UPDATE.out.taxonkit_update_db
                             )
 */
@@ -249,7 +246,7 @@ workflow {
         ······································································································
         */
 
-            SINGLE_WF( params.runID, comp_samples_ch, TBPROFILER_DB_UPDATE.out.tbprofiler_update_db )
+            SINGLE_WF( params.runID, comp_samples_ch )
                     
                 // DEBUG: Demonstrate the content of the channel
                 ///     SINGLE_WF.out.single_updated_samples_ch.view { sample -> "Sample: $sampleID" }
@@ -288,10 +285,12 @@ workflow {
                 tbdb_out_ch         =   tbdb_out_files.collect()
                 who_out_ch          =   who_out_files.collect()
 
-            PAIRWISE_WF( params.runID, mtbseq_stats_ch,
-                        mtbseq_class_ch, tbdb_out_ch,
-                        who_out_ch, sampleID_list,
-                        TBPROFILER_DB_UPDATE.out.tbprofiler_update_db
+            PAIRWISE_WF( params.runID, 
+                        mtbseq_stats_ch,
+                        mtbseq_class_ch, 
+                        tbdb_out_ch,
+                        who_out_ch, 
+                        sampleID_list
                         )
 
         /*
diff --git a/modules/local/pairwise/concatenate-cluster-file/main.nf b/modules/local/pairwise/concatenate-cluster-file/main.nf
@@ -20,7 +20,7 @@ process CONCATENATE_CLUSTERS {
         path(clusters)
 
     output:
-        path("unprocessed_clusters.tsv"),          emit: bbdd_clusters
+        path("unprocessed_clusters.tsv"),          emit: pairwise_clusters
 
     script:
 
@@ -29,7 +29,7 @@ process CONCATENATE_CLUSTERS {
             echo "lineage\tdistance\tgenomes\tgroup" > unprocessed_clusters.tsv
 
         # Concatenate all files
-            for file in ${params.outDir}/bbdd/mtbseq/pairwise/*/Groups/*clusters.tsv; do 
+            for file in ${params.outDir}/bbdd/mtbseq/pairwise/*/Groups/*_d*.clusters.tsv; do 
                 cat \$file >> unprocessed_clusters.tsv
             done
 
diff --git a/modules/local/summary/generate-nexus/main.nf b/modules/local/summary/generate-nexus/main.nf
@@ -59,72 +59,4 @@ process GENERATE_NEXUS {
 
         """
 
-}
-
-/*
-        # create the output and temporary directories
-            mkdir -p nexus/ fasta/ positions/
-
-        # create the list of genomes within the cluster
-            grep "${clusterID}" ${pairwise_clusters} \\
-                | cut -f1 > ${clusterID}.genomes.list
-
-        #·················································································#
-
-        # create cluster directory and split up fasta file in cluster fastas
-            while IFS=";" read -r genome; do
-                seqkit grep -w 0 -n -p \${genome} ${snp_fasta} >> ${clusterID}.fasta
-            done < ${clusterID}.genomes.list
-
-        # run snp-sites on the fastas
-            snp-sites ${clusterID}.fasta > ${clusterID}.snpsites.fasta
-            snp-sites ${clusterID}.fasta -v | cut -f2 \\
-                | sed '1,4d' > positions/${clusterID}_positions.tab
-
-        #·················································································#
-
-        # H37Rv variance positions 
-            for i in `cat positions/${clusterID}_positions.tab`; do 
-                sed -n \$((i+2))'p' ${snp_tab} | cut -f3
-            done > ${clusterID}_tmp_refseq
-
-                # convert column into fasta
-                paste -s -d "" ${clusterID}_tmp_refseq \\
-                    | sed '1i >H37Rv' > ${clusterID}_H37Rv.fasta
-
-        #·················································································#
-
-        # Get genomic positions
-            while read -r position; do
-                sed -n \$((position+2))'p' ${snp_tab} | cut -f 1; 
-            done < positions/${clusterID}_positions.tab > positions/${clusterID}_genomic_positions.tab
-
-        #·················································································#
-
-        # Valencian ancestor (MTB_anc) variance positions
-            cp ${params.mtbc_ancestor_path} ${lineage}.tmp.MTB_anc.pos.gz
-            gunzip ${lineage}.tmp.MTB_anc.pos.gz
-
-            for i in `cat positions/${clusterID}_genomic_positions.tab`; do 
-                sed -n \${i}'p' ${lineage}.tmp.MTB_anc.pos | cut -f3
-            done > ${clusterID}_tmp_MTB_anc
-
-            # convert the column in fasta
-                paste -s -d "" ${clusterID}_tmp_MTB_anc \\
-                    | sed '1i >MTB_anc' > ${clusterID}_MTB_anc.fasta
-
-        # remove the large tab file
-            rm -rf ${lineage}.tmp.MTB_anc.pos.gz
-
-        #·················································································#
-
-        # Create final FASTA file
-            cat ${clusterID}.snpsites.fasta \\
-                ${clusterID}_H37Rv.fasta \\
-                ${clusterID}_MTB_anc.fasta \\
-                > fasta/${clusterID}_refseq.fasta
-
-        # convert to nexus for visualisation
-            seqret -osformat2 nexus -sequence fasta/${clusterID}_refseq.fasta \\
-                -outseq nexus/${clusterID}_refseq.nex
-*/
+}
diff --git a/modules/local/summary/process-clusters/main.nf b/modules/local/summary/process-clusters/main.nf
@@ -12,7 +12,7 @@ process PROCESS_CLUSTERS {
 */
 
     conda params.r_stats_env
-    
+
     publishDir "${params.outDir}/results/${runID}/clusters/", mode: 'copy'
 
     input:
@@ -25,12 +25,18 @@ process PROCESS_CLUSTERS {
         path("${runID}_processed_clusters.tsv")
 
     script:
-    """
-    Rscript ${params.r_script_dir}/process_clusters.R #\
-            #--clusters ${pairwise_clusters} \
-            #--summary ${analysis_summary}
+        """
+
+        ${pairwise_clusters} 
+
+        Rscript ${params.r_script_dir}/process_clusters.R #\
+                #--clusters ${pairwise_clusters} \
+                #--summary ${analysis_summary}
+
+            cp processed_clusters.tsv ${runID}_processed_clusters.tsv
 
-    cp processed_clusters.tsv ${runID}_processed_clusters.tsv
-    """
+        cp processed_clusters.tsv ${params.outDir}/results/
+        cp ${pairwise_clusters} ${params.outDir}/results/
+        """
 
 }
diff --git a/modules/local/tbprofiler/compile.tbdb/main.nf b/modules/local/tbprofiler/compile.tbdb/main.nf
@@ -33,7 +33,6 @@ process TBPROFILER_COMPILE_TBDB {
     input:
         val(runID)
         path(tbprofiler_results)
-        path(tbprofiler_update_db)
 
     output:
         path("tbdb-tbprofiler.txt"),                 emit: tbdb_results
diff --git a/modules/local/tbprofiler/compile.who/main.nf b/modules/local/tbprofiler/compile.who/main.nf
@@ -29,7 +29,6 @@ process TBPROFILER_COMPILE_WHO {
     input:
         val runID
         path (tbprofiler_who_results)
-        path(tbprofiler_update_db)
 
     output:
         path("who-tbprofiler.txt"),         emit: who_results
diff --git a/modules/local/tbprofiler/profile.tbdb/main.nf b/modules/local/tbprofiler/profile.tbdb/main.nf
@@ -29,7 +29,6 @@ process TBPROFILER_PROFILE_TBDB {
             path(mtbc_forward), path(mtbc_reverse), path(mtbseq_class), 
             path(mtbseq_stats), path(mtbseq_pos), path(mtbseq_vars), 
             path(tbdb_out), path(who_out), path(mtbseq_vcf)
-        path(tbprofiler_update_handover)
 
     output:
         path("bam/tbdb-${sampleID}.bam")
diff --git a/modules/local/tbprofiler/profile.who/main.nf b/modules/local/tbprofiler/profile.who/main.nf
@@ -28,7 +28,6 @@ process TBPROFILER_PROFILE_WHO {
         tuple val(sampleID), path(mtbc_forward), path(mtbc_reverse), path(mtbseq_class), 
                 path(mtbseq_stats), path(mtbseq_pos), path(mtbseq_vars), 
                 path(tbdb_out), path(who_out), path(mtbseq_vcf)
-        path(tbprofiler_update_handover)
 
     output:
         path("results/who-${sampleID}.results.json")
diff --git a/nextflow.config b/nextflow.config
@@ -131,7 +131,7 @@ profiles {
             singularity.cacheDir    = "${params.envsDir}/nf-singularity/"
     }
 
-    igtp {
+    SGE {
         process {
             executor            = 'sge'
             penv                = 'smp'
diff --git a/png/g12.pdf b/png/g12.pdf
diff --git a/png/g12.png b/png/g12.png
diff --git a/png/image1-09.png b/png/image1-09.png
diff --git a/png/mjn-icon.svg b/png/mjn-icon.svg
diff --git a/submit-nf.sh b/submit-nf.sh
@@ -1,5 +1,16 @@
 #!/bin/bash
+
+## Thank you to @dmdmckeow for the original script and inspiration
+## @author: Dean A McKeown and Poppy J Hesketh-Best
+## @version: v1.0.0
+## @description: This script is used to submit a Nextflow pipeline to 
+##               an HPC cluster using qsub. It sets up the environment, 
+##               runs the pipeline, and handles job cancellation.
+## @changelog:
+##   v1.0.0-2024-11-01 : Initial version
+
 green='\033[32m';red='\033[31m';cyan='\033[36m';purple='\033[35m';nocolor='\033[m'
+
 ### 
 
 eval "$(conda shell.bash hook)"
@@ -10,7 +21,7 @@ set -u          # exit immediately if using undefined variables
 set -o pipefail # ensure bash pipelines return non-zero status if any of their command fails
 
 # Setup trap function to be run when canceling the pipeline job. It will propagate the SIGTERM signal
-# to Nextlflow so that all jobs launched by the pipeline will be cancelled too.
+# to Nextflow so that all jobs launched by the pipeline will be cancelled too.
 _term() {
         echo "Caught SIGTERM signal!"
         kill -s SIGTERM $pid
@@ -23,15 +34,10 @@ trap _term TERM
 export NXF_JVM_ARGS="-Xms2g -Xmx5g"
 
 # Run the pipeline. The command uses the arguments passed to this script, e.g:
-#
-# $ qsub -S /bin/bash -cwd -V -N nf-main -o qsub-nf.out -l mem_free=6G submit-nf.sh main.nf --samplesheet test/samples.hpc.csv --outdir RutiSeq -profile igtp,singularity_on
-# Convenience::
-# $ rm -rf qsub-nf.out .nextflow* nf-main.e6272*; qsub -S /bin/bash -cwd -V -N nf-main -o qsub-nf.out -l mem_free=6G submit-nf.sh main.nf --samplesheet test/samples.hpc.csv --outdir RutiSeq -profile igtp,singularity_on; sleep 2s; tail -f qsub-nf.out
-/imppc/labs/emlab/phesketh/bin/nextflow run "$@" -ansi-log false & pid=$!
 
-echo -e "Running:       
-                nextflow run "$@" -ansi-log false & pid=$!
-"
+nextflow run "$@" -ansi-log false & pid=$!
+
+echo -e "Running: nextflow run "$@" -ansi-log false & pid=$!\n"
 
 echo -e "${red}$(date +'%d/%m/%Y %H:%M:%S')${nocolor}	qsub -S /bin/bash -cwd -V -N nf-main -o qsub-nf.out -l mem_free=6G submit-nf.sh "$@"" >> submit-nf.log
 
@@ -40,7 +46,4 @@ echo "Waiting for ${pid}"
 wait $pid
 
 # Return 0 exit-status if everything went well
-exit 0
-
-### SUBMIT TO HPC:
-### qsub -S /bin/bash -cwd -V -N nf-main -o qsub-nf.out -l mem_free=6G submit-nf.sh /imppc/labs/emlab/share/GitHub/RutiSeq-nf/main.nf -profile igtp,conda_on --samplesheet sample-sheet.csv --runID 'test-03' --outdir /imppc/labs/emlab/phesketh/projects/RutiSeq-nf/test --workDir /imppc/labs/emlab/phesketh/projects/RutiSeq-nf/work; sleep 2s; tail -f qsub-nf.out
+exit 0
diff --git a/workflows/pairwise_wf.nf b/workflows/pairwise_wf.nf
@@ -17,7 +17,6 @@ workflow PAIRWISE_WF {
         tbdb_out_ch
         who_out_ch
         sampleID_list
-        tbprofiler_update_db
 
     main:
 
@@ -28,8 +27,8 @@ workflow PAIRWISE_WF {
         def no_col  = '\u001B[0m'
 
         // Compile TB-Profiler results
-            TBPROFILER_COMPILE_TBDB( runID, tbdb_out_ch, tbprofiler_update_db )
-            TBPROFILER_COMPILE_WHO( runID, who_out_ch, tbprofiler_update_db )
+            TBPROFILER_COMPILE_TBDB( runID, tbdb_out_ch )
+            TBPROFILER_COMPILE_WHO( runID, who_out_ch )
 
         // Compile stats and classifications from MTBSeq
             MTBSEQ_STATS_COMPILE( mtbseq_stats_ch, mtbseq_class_ch )
@@ -103,7 +102,7 @@ workflow PAIRWISE_WF {
                 //nexus_creation_ch.view()
 
     emit:
-        pairwise_clusters     = CONCATENATE_CLUSTERS.out.bbdd_clusters
+        pairwise_clusters     = CONCATENATE_CLUSTERS.out.pairwise_clusters
         analysis_summary      = COMPILE_SEQUENCING_STATS.out.analysis_summary
         who_resistance        = COMPILE_SEQUENCING_STATS.out.who_resistance
         tbdb_resistance       = COMPILE_SEQUENCING_STATS.out.tbdb_resistance
diff --git a/workflows/single_wf.nf b/workflows/single_wf.nf
@@ -2,7 +2,6 @@ include { MTBC_READ_QC }              from '../modules/local/pre-wf-check/mtbc-r
 include { TBPROFILER_PROFILE_TBDB }   from '../modules/local/tbprofiler/profile.tbdb/main.nf'
 include { TBPROFILER_PROFILE_WHO }    from '../modules/local/tbprofiler/profile.who/main.nf'
 include { MTBSEQ_SINGLE }             from '../modules/local/mtbseq/single/main.nf'
-//include { MTBSEQ_ONT_SINGLE }         from '../modules/local/mtbseq-ont/single/main.nf'
 include { SNP_PROFILING_SINGLE }      from '../modules/local/snp-barcoding/single.profiling/main.nf'
 include { SNP_ANNOTATING_SINGLE }     from '../modules/local/snp-barcoding/single.annotating/main.nf'
 include { POST_SINGLE_BBDD_CLEANUP }  from '../modules/local/post-wf-cleaup/single-bbdd-cleanup/main.nf'
@@ -16,7 +15,6 @@ workflow SINGLE_WF {
     take:
         runID
         comp_samples_ch
-        tbprofiler_update_db
 
     main:
 
@@ -58,9 +56,9 @@ workflow SINGLE_WF {
             MTBC_READ_QC( branched_channel.with_reads )
 
         // Run TBPROFILER_PROFILE_TBDB after MTBC_READ_QC is done
-            TBPROFILER_PROFILE_TBDB( MTBC_READ_QC.out.updated_sample_ch1, tbprofiler_update_db )
+            TBPROFILER_PROFILE_TBDB( MTBC_READ_QC.out.updated_sample_ch1 )
 
-            TBPROFILER_PROFILE_WHO( TBPROFILER_PROFILE_TBDB.out.updated_sample_ch2, tbprofiler_update_db )
+            TBPROFILER_PROFILE_WHO( TBPROFILER_PROFILE_TBDB.out.updated_sample_ch2 )
 
         // Run MTBSEQ_SINGLE
             MTBSEQ_SINGLE( TBPROFILER_PROFILE_WHO.out.updated_sample_ch3 )