2025-02-27

pjhesbest · pjhesbest · commit 635e238a84d3 · 2025-02-27T12:54:48.000+01:00
diff --git a/bin/R/create_pairwise_analysis_tuple.R b/bin/R/create_pairwise_analysis_tuple.R
@@ -14,11 +14,11 @@ colnames(sub_lineages) <- "selected_sub_lineage"
 run_ids <- readr::read_delim("run_sample_ids.txt", col_names = FALSE, delim = ",")
 colnames(run_ids) <- "SampleID"
 
-meta <- readr::read_delim("pairwise_analysis.list.csv", col_names = TRUE, delim = ",") |> 
-    distinct() |>
-    filter(!is.na(main_lineage) & !str_detect(main_lineage, ",")) |>
-    filter(!str_detect(sample, "CN-"))  # This line filters out any sample that contains 'CN-'
+meta <- readr::read_delim("pairwise_analysis.list.csv", col_names = FALSE, delim = ",") |> 
+    distinct() 
 colnames(meta) <- c("SampleID", "main_lineage", "sub_lineage")
+meta <- meta |> filter(!is.na(main_lineage) & !str_detect(main_lineage, ";")) |>
+    filter(!str_detect(SampleID, "CN-"))  # This line filters out any sample that contains 'CN-'
 
 # Filter out the lineages at sub_lineage level
 filtered_meta <- meta %>%
diff --git a/bin/shell/concatenate-variable-pylogeny-ancestors.sh b/bin/shell/concatenate-variable-pylogeny-ancestors.sh
@@ -19,9 +19,9 @@ mkdir -p Phylogeny/
             /^>/ {next}  # Skip header lines
             {
                 # Process sequence lines
-                for (i = 1; i <= length(\$0); i++) {
+                for (i = 1; i <= length($0); i++) {
                     position++
-                    print position "\t" substr(\$0, i, 1) >> "'"${lineage}.tmp.fasta_positions.tab"'"
+                    print position "\t" substr($0, i, 1) >> "'"${lineage}.tmp.fasta_positions.tab"'"
                 }
             }' "${lineage}.tmp.fasta"
                 
@@ -30,22 +30,22 @@ mkdir -p Phylogeny/
 
 # 3. obtain the reference positions (H37Rv) for the cluster positions
     for i in `cat ${lineage}.tmp.fasta_positions`; do 
-        sed -n \$((i+2))'p' ${tab} | cut -f3
+        sed -n $((i+2))'p' ${tab} | cut -f3
     done > ${lineage}.tmp_refseq
         
 # 4. convert column into fasta
     paste -s -d "" ${lineage}.tmp_refseq | sed '1i >H37Rv' > Phylogeny/${lineage}.ref-H37Rv.fasta
 
 # 5. get the genomic positions of the SNPs
     while read -r position; do
-        sed -n \$((position+2))'p' ${tab} | cut -f 1; 
+        sed -n $((position+2))'p' ${tab} | cut -f 1; 
     done < ${lineage}.tmp.fasta_positions > Phylogeny/${lineage}_genomic_positions.tab
 
     cp ${mtbc_ancestor_path} ${lineage}.tmp.MTB_anc.pos.gz; gunzip ${lineage}.tmp.MTB_anc.pos.gz
 
 # 6. Get the same SNPs for the 'ancestor' genomes
     for i in `cat Phylogeny/${lineage}_genomic_positions.tab`; do 
-        sed -n \${i}'p' ${lineage}.tmp.MTB_anc.pos | cut -f3 # doesnt need to +2 as the tsv file has no header
+        sed -n ${i}'p' ${lineage}.tmp.MTB_anc.pos | cut -f3 # no +2 offset needed here, as the TSV file has no header
     done > ${lineage}.tmp.MTB_anc
 
 # 7. convert the column in fasta
diff --git a/main.nf b/main.nf
@@ -267,7 +267,8 @@ workflow {
                             who_out_ch,
                             sampleID_list
                         )
-
+                        
+            PAIRWISE_WF.out.pairwise_clusters.view()
         /*
         ······································································································
             SUMMARY WORKFLOW (SUMMARU_WF):
@@ -277,15 +278,15 @@ workflow {
                 - Generate MJN files for visualisation in PopArt
         ······································································································
         */
-
+/*
             SUMMARY_WF( params.runID,
                         PAIRWISE_WF.out.pairwise_clusters,
                         PAIRWISE_WF.out.analysis_summary,
                         PAIRWISE_WF.out.who_resistance,
                         PAIRWISE_WF.out.tbdb_resistance,
                         PAIRWISE_WF.out.phylogeny_plotting_ch
                     )
-
+*/
         /*
         ······································································································
             BARCODING ANALYSIS (BARCODING_WF)
diff --git a/modules/local/filtering/prepare_pairwise_channels/main.nf b/modules/local/filtering/prepare_pairwise_channels/main.nf
@@ -26,7 +26,7 @@ process PREPARE_PAIRWISE_CHANNELS {
         # Run the script to generate pairwise analysis tuples
             Rscript ${params.r_script_dir}/create_pairwise_analysis_tuple.R \\
                 1>>.command.out \\
-                2>>.command.err || true # i think this helps
+                2>>.command.err || true # || true keeps the task going even if the R script exits non-zero (unclear whether this is still needed)
 
         # remove headers
         sed '/^lineage,SampleID/d' final.lineage_samples_tuple.csv | sort > tmp.final.lineage_samples_tuple.csv
diff --git a/modules/local/phylogeny/concatenated_snp_phylogeny/main.nf b/modules/local/phylogeny/concatenated_snp_phylogeny/main.nf
@@ -0,0 +1,47 @@
+process CONCATENATED_VARIABLE_REGION_PHYLOGENY {
+
+    tag "${runID}: ${lineage}"
+
+    conda params.phylogeny_env
+
+    publishDir "${params.outdir}/bbdd/mtbseq/pairwise/${lineage}/", mode: 'copy'
+
+    input:
+        val(runID)
+        tuple val(lineage), 
+                path(fasta), 
+                path(tab)
+
+    output:
+        path("Phylogeny/*")
+        
+        tuple val(lineage), path("Phylogeny/${lineage}_ML.contree"),
+                            path("Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.aln.fasta"), emit: phylogeny_plotting_ch
+
+    script:
+
+    def additional_args = task.ext.additional_args ?: '' // defined in the nextflow.config file
+
+    """
+    # Create the fasta files for the phylogeny
+        bash ${params.script_dir}/shell/concatenate-variable-pylogeny-ancestors.sh \\
+                ${fasta} ${lineage} \\
+                ${tab} ${params.mtbc_ancestor_path}
+
+        # Perform alignment of sequences 
+            mafft --auto --thread ${params.cpus} \\
+                    Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.fasta \\
+                    > Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.aln.fasta
+
+        # Perform phylogeny
+            iqtree -s Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.aln.fasta \\
+                    -m ${params.iqtree_model} \\
+                    -T AUTO \\
+                    -ntmax ${params.cpus} \\
+                    -B ${params.iqtree_bootstraps} \\
+                    --prefix ${lineage}_ML
+
+            mv ${lineage}_ML.* Phylogeny/
+    """
+
+}
diff --git a/modules/local/phylogeny/concatenated_snp_phylogeny/shell b/modules/local/phylogeny/concatenated_snp_phylogeny/shell
@@ -1,53 +1,3 @@
-process CONCATENATED_VARIABLE_REGION_PHYLOGENY {
-
-    tag "${runID}: ${lineage}"
-
-    conda params.phylogeny_env
-
-    publishDir "${params.outdir}/bbdd/mtbseq/pairwise/${lineage}/", mode: 'copy'
-
-    input:
-        val(runID)
-        tuple val(lineage), 
-                path(fasta), 
-                path(tab)
-
-    output:
-        path("Phylogeny/*")
-        
-        tuple val(lineage), path("Phylogeny/${lineage}_ML.contree"),
-                            path("Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.aln.fasta"), emit: phylogeny_plotting_ch
-
-    script:
-
-    def additional_args = task.ext.additional_args ?: '' // defined in the nextflow.config file
-
-    """
-    # Create the fasta files for the phylogeny
-        bash ${params.script_dir}/shell/concatenate-variable-pylogeny-ancestors.sh \\
-                ${fasta} \\
-                ${lineage} \\
-                ${tab} \\
-                ${params.mtbc_ancestor_path}
-
-        # Perform alignment of sequences 
-            mafft --auto --thread ${params.cpus} \\
-                    Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.fasta \\
-                    > Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.aln.fasta
-
-        # Perform phylogeny
-            iqtree -s Phylogeny/${lineage}.ref-H37Rv_MTBc-anc.aln.fasta \\
-                    -m ${params.iqtree_model} \\
-                    -T AUTO \\
-                    -ntmax ${params.cpus} \\
-                    -B ${params.iqtree_bootstraps} \\
-                    --prefix ${lineage}_ML
-
-            mv ${lineage}_ML.* Phylogeny/
-    """
-
-}
-
 /*
         mkdir -p Phylogeny/
 
diff --git a/workflows/pairwise_wf.nf b/workflows/pairwise_wf.nf
@@ -5,7 +5,7 @@ include { COMPILE_SEQUENCING_STATS }                from '../modules/local/filte
 include { PREPARE_PAIRWISE_CHANNELS }               from '../modules/local/filtering/prepare_pairwise_channels/main.nf'
 include { MTBSEQ_LINEAGE_JOINT_AMEND }              from '../modules/local/mtbseq/lineage_joint-amend/main.nf'
 include { MTBSEQ_LINEAGE_GROUP }                    from '../modules/local/mtbseq/lineage_group/main.nf'
-include { CONCATENATED_VARIABLE_REGION_PHYLOGENY }  from '../modules/local/phylogeny/concatenated_snp_phylogeny-nf'
+include { CONCATENATED_VARIABLE_REGION_PHYLOGENY }  from '../modules/local/phylogeny/concatenated_snp_phylogeny/main.nf'
 include { CONCATENATE_CLUSTERS }                    from '../modules/local/pairwise/concatenate-cluster-file/main.nf'
 
 workflow PAIRWISE_WF {