From cb5f1398f090e2c7abe0e3eef76e793a7e8bc1d0 Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Tue, 3 Jun 2025 10:16:12 +1000 Subject: [PATCH 1/7] genesROI.R: script to write csv of genes in region --- Rtools/malDrugR/genesROI.R | 73 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 Rtools/malDrugR/genesROI.R diff --git a/Rtools/malDrugR/genesROI.R b/Rtools/malDrugR/genesROI.R new file mode 100644 index 0000000..79e94a2 --- /dev/null +++ b/Rtools/malDrugR/genesROI.R @@ -0,0 +1,73 @@ +suppressPackageStartupMessages(library(stringr)) +suppressPackageStartupMessages(library(readr)) +suppressPackageStartupMessages(library(genomeIntervals)) ## Used for readGff3 +suppressPackageStartupMessages(library(GenomicRanges)) +suppressPackageStartupMessages(library(argparser)) + +#### Read command-line arguments #### +argp <- arg_parser(paste( + "Use a reference genomic-features file (.gff)", + "to find genes in a region of interest" +)) +argp <- add_argument(argp, "--refpath", + help = "file path of reference" +) +argp <- add_argument(argp, "--refstrain", + help = "strain name for reference" +) +argp <- add_argument(argp, "--region", + type = "character", + help = "Regions are specified as: seqname[:STARTPOS[-ENDPOS]] e.g. 08:409282-429234" +) + +argv <- parse_args(argp) + +refDir <- argv$refpath +pfCurrRefs <- data.frame( + strain = c("3D7", "Dd2", "Supp"), + version = c("52", "57", "52") +) +ref <- filter(pfCurrRefs, strain == argv$refstrain) +## Not looking for regions of interest in supplementary sequences yet + +# ---------- Read genomic features -------------------------------------------- +file.gff <- file.path( + refDir, + paste0( + "PlasmoDB-", ref$version, "_Pfalciparum", ref$strain, + ".gff" + ) +) + +pf_features <- tryCatch( + readGff3(file.gff, quiet = TRUE), + error = function(e) { + stop(paste(e, "Filepath:", file.gff)) + } +) +# ---------- Find features in region of interest ------------------------------ +chrname <- str_remove(argv$region, ":.+") +gstart <- str_remove(argv$region, ".+:") |> + str_remove("-.+") +gend <- str_remove(argv$region, ".+:") |> + str_remove(".+-") +roigr <- GRanges( + seqnames = levels(seqnames(pf_features)) |> + str_subset(paste0("Pf", ref$strain, "_", chrname)), + ranges = IRanges(start = gstart, end = gend) +) +featROIidx <- findOverlaps(GRanges(pf_features), roigr, + type = "within", select = "last" +) +featROI <- pf_features[!is.na(featROIidx)] +# ---------- Filter to genes and write to csv ------------------------------ +geneROI <- featROI[ + annotation(featROI)$type == "protein_coding_gene", +] + +data.frame(ID = getGffAttribute(geneROI, "ID"), + GeneName = getGffAttribute(geneROI, "Name"), + Description = getGffAttribute(geneROI, "description") |> + str_replace_all("\\+", " ") +) |> + write_csv(paste0(argv$samplegroup, "genes_chr", chrname, "_ROI.csv")) From ad381613a962a00fd769b83f0ab40c4dd6df1971 Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Wed, 2 Jul 2025 14:53:49 +1000 Subject: [PATCH 2/7] add genesROI process (1st draft) --- Rtools/malDrugR/genesROI.R | 3 +++ main.nf | 17 +++++++++++++---- modules/stvariant.nf | 22 ++++++++++++++++++++++ nextflow.config | 4 +++- nextflow_schema.json | 4 ++++ 5 files changed, 45 insertions(+), 5 deletions(-) diff --git a/Rtools/malDrugR/genesROI.R b/Rtools/malDrugR/genesROI.R index 79e94a2..5189dc8 100644 --- a/Rtools/malDrugR/genesROI.R +++ b/Rtools/malDrugR/genesROI.R @@ -9,6 +9,9 @@ argp <- arg_parser(paste( "Use a reference genomic-features file (.gff)", "to find genes in a region of interest" )) +argp <- add_argument(argp, "--samplegroup", + help = "Group name of related samples" +) argp <- add_argument(argp, "--refpath", help = "file path of reference" ) diff --git a/main.nf b/main.nf index 486db19..40c398b 100644 --- a/main.nf +++ b/main.nf @@ -26,7 +26,8 @@ include { Bcf; FilterBcfVcf; FilterGridssSV; RPlotFull; - RPlotROI + RPlotROI; + genesROI } from './modules/stvariant.nf' @@ -140,7 +141,8 @@ workflow { .join(bamlist_ch) .combine(bam_ch.bamnodup,by:0) .groupTuple(by:[0,1,2,3]) - .combine(parent_ch.map{row->tuple(row[1],row[0])},by:1).set{bcf_input_ch} // Emits val(parentId),val(groupId),path(ref), path(bamlist), path(bams),path(parentbams) + .combine(parent_ch.map{row->tuple(row[1],row[0])},by:1).set{bcf_input_ch} + // Emits val(parentId),val(groupId),path(ref), path(bamlist), path(bams),path(parentbams) bcf_ch=Bcf(bcf_input_ch) //----------------------Gridss------------------------------------- input_ch.map{row -> tuple(row[0],row[4],row[5]+".fasta")} @@ -186,7 +188,8 @@ workflow { Input to CopyNum Analysis is empty. """) } - .set{copynum_input_ch}//Emits val(groupId),val(parentId),path(refpath),val(bsref),path(mergedparent), path(bamlistcontent), path(bams), val(dummy) + .set{copynum_input_ch} + //Emits val(groupId),val(parentId),path(refpath),val(bsref),path(mergedparent), path(bamlistcontent), path(bams), val(dummy) copynum_ch=RCopyNum(copynum_input_ch .combine(Channel.fromList( [params.bin_CNroi, params.bin_CNfull] )) @@ -223,7 +226,7 @@ workflow { .combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/gridss_majorityfilt.R")) .combine(Channel.fromPath("${projectDir}/Rtools/gridss_assets/")) ) - //----------------------Plot----------------------------------------- + //----------------------Plot CopyNums----------------------------------------- // Input Channel Emits val(groupId),val(parentId),path(rds) input_ch.map{row -> tuple(row[0],row[4])} .unique() @@ -231,6 +234,12 @@ workflow { RPlotFull(plot_ch.combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/copynumPlotsFull.R"))) RPlotROI(plot_ch.combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/copynumPlotsROI.R"))) + //----------------------List genes in a region of interest, if specified------ + // Input Channel emits val(groupId) + input_ch.map{row -> row[0]}.unique().set{genesROI_input_ch} + + if (params.genesRegion != "") + genesROI(genesROI_input_ch.combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/genesROI.R"))) //------------------------------------------------------------------- } diff --git a/modules/stvariant.nf b/modules/stvariant.nf index e7fcb10..3e565f7 100644 --- a/modules/stvariant.nf +++ b/modules/stvariant.nf @@ -227,5 +227,27 @@ process RPlotROI { --endROI ${params.end_CNroi} """ } +process genesROI { + label 'Rfilter' + tag "${groupId}" + + publishDir "${params.outdir}/variants/copynumPlots", mode: 'copy' + + input: + tuple val(groupId),path(refpath),val(prefix), + val(geneRegion), path(script) + + output: + tuple val(groupId), path("*.csv") + + script: + """ + Rscript --vanilla ${projectDir}/Rtools/malDrugR/genesROI.R \ + --samplegroup ${groupId} \ + --refpath ${refpath} \ + --refstrain ${prefix} \ + --region ${params.geneRegion} + """ +} diff --git a/nextflow.config b/nextflow.config index c0588dd..2bd9ef8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,8 +12,10 @@ params { start_CNroi = 100 end_CNroi = 600 - BCFqualcrit = 50 + BCFqualcrit = 50 critsamplecount = 1 + + geneRegion = "" //Defaults ref3D7_path = "" diff --git a/nextflow_schema.json b/nextflow_schema.json index fcf6887..05fde74 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -189,6 +189,10 @@ "end_CNroi": { "type": "integer", "default": 435 + }, + "genesRegion": { + "type": "string", + "default: "" } }, "required": [ From ac89857e1bada9ceb847947af9cbbd0b244e2988 Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Thu, 3 Jul 2025 09:47:23 +1000 Subject: [PATCH 3/7] genesROI: drop samplegroup and fix gene/genes errors --- Rtools/malDrugR/genesROI.R | 5 +---- main.nf | 6 ++---- modules/stvariant.nf | 8 +++----- nextflow.config | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/Rtools/malDrugR/genesROI.R b/Rtools/malDrugR/genesROI.R index 5189dc8..1374112 100644 --- a/Rtools/malDrugR/genesROI.R +++ b/Rtools/malDrugR/genesROI.R @@ -9,9 +9,6 @@ argp <- arg_parser(paste( "Use a reference genomic-features file (.gff)", "to find genes in a region of interest" )) -argp <- add_argument(argp, "--samplegroup", - help = "Group name of related samples" -) argp <- add_argument(argp, "--refpath", help = "file path of reference" ) @@ -73,4 +70,4 @@ data.frame(ID = getGffAttribute(geneROI, "ID"), Description = getGffAttribute(geneROI, "description") |> str_replace_all("\\+", " ") ) |> - write_csv(paste0(argv$samplegroup, "genes_chr", chrname, "_ROI.csv")) + write_csv(paste0("genes_chr", chrname, "_ROI.csv")) diff --git a/main.nf b/main.nf index 40c398b..1745e6f 100644 --- a/main.nf +++ b/main.nf @@ -234,15 +234,13 @@ workflow { RPlotFull(plot_ch.combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/copynumPlotsFull.R"))) RPlotROI(plot_ch.combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/copynumPlotsROI.R"))) + //----------------------List genes in a region of interest, if specified------ // Input Channel emits val(groupId) - input_ch.map{row -> row[0]}.unique().set{genesROI_input_ch} + input_ch.map{row -> tuple(row[6],row[3])}.unique().set{genesROI_input_ch} if (params.genesRegion != "") genesROI(genesROI_input_ch.combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/genesROI.R"))) //------------------------------------------------------------------- } - - - diff --git a/modules/stvariant.nf b/modules/stvariant.nf index 3e565f7..fd4df52 100644 --- a/modules/stvariant.nf +++ b/modules/stvariant.nf @@ -234,19 +234,17 @@ process genesROI { publishDir "${params.outdir}/variants/copynumPlots", mode: 'copy' input: - tuple val(groupId),path(refpath),val(prefix), - val(geneRegion), path(script) + tuple path(refpath),val(prefix) output: - tuple val(groupId), path("*.csv") + path("*.csv") script: """ Rscript --vanilla ${projectDir}/Rtools/malDrugR/genesROI.R \ - --samplegroup ${groupId} \ --refpath ${refpath} \ --refstrain ${prefix} \ - --region ${params.geneRegion} + --region ${params.genesRegion} """ } diff --git a/nextflow.config b/nextflow.config index 2bd9ef8..ff38b90 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,7 +15,7 @@ params { BCFqualcrit = 50 critsamplecount = 1 - geneRegion = "" + genesRegion = "" //Defaults ref3D7_path = "" From e4c28356fcd2c48745d6c11109e8ce9b74b2e184 Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Thu, 3 Jul 2025 16:46:25 +1000 Subject: [PATCH 4/7] stvariant: bug fix. Don't input full bamlist to RCopyNum --- Rtools/malDrugR/copynumQDNAseq.R | 2 +- modules/stvariant.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Rtools/malDrugR/copynumQDNAseq.R b/Rtools/malDrugR/copynumQDNAseq.R index ddfc831..ef3fedb 100644 --- a/Rtools/malDrugR/copynumQDNAseq.R +++ b/Rtools/malDrugR/copynumQDNAseq.R @@ -44,7 +44,7 @@ groupId <- argv$samplegroup parentID <- argv$parentId refDir <- argv$refDir groupbamL <- strsplit(argv$bams, " ")[[1]] -groupbamL <- groupbamL[!grepl(parentID, groupbamL)] +groupbamL <- groupbamL[!grepl(parentID, groupbamL)] # doesn't work for merged parents sampleL <- sub("_nodup\\.bam$", "", groupbamL) library(argv$bsref, diff --git a/modules/stvariant.nf b/modules/stvariant.nf index fd4df52..052189e 100644 --- a/modules/stvariant.nf +++ b/modules/stvariant.nf @@ -118,7 +118,7 @@ process RCopyNum { Rscript --vanilla ${script} \ --samplegroup ${groupId} \ --parentId ${parentId} \ - --bams "${bamfilenames}" \ + --bams "${bams}" \ --bin_in_kbases ${bins} \ --refDir ${refpath}\ --bsref ${bsref} From 5e72901970a663e3a81a4c5419c573ce2551504d Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Thu, 3 Jul 2025 17:07:30 +1000 Subject: [PATCH 5/7] remove bamfilenames from input to process RCopyNum --- main.nf | 9 ++------- modules/stvariant.nf | 3 +-- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/main.nf b/main.nf index 486db19..a46d008 100644 --- a/main.nf +++ b/main.nf @@ -174,19 +174,14 @@ workflow { .unique() .combine(merged_ch,by:0) .map{row -> if (row[0]) tuple(row[1],row[0],row[2],row[3],row[4])} - .join(bamlist_ch.map{gid,filepath -> - def fileLines = filepath.readLines() - def fileContents = fileLines.join(' ') - return tuple(gid, fileContents) - }) .combine(bam_ch.bamnodup,by:0) - .groupTuple(by:[0,1,2,3,4,5]) + .groupTuple(by:[0,1,2,3,4]) .ifEmpty { error(""" Input to CopyNum Analysis is empty. """) } - .set{copynum_input_ch}//Emits val(groupId),val(parentId),path(refpath),val(bsref),path(mergedparent), path(bamlistcontent), path(bams), val(dummy) + .set{copynum_input_ch}//Emits val(groupId),val(parentId),path(refpath),val(bsref),path(mergedparent), path(bams), val(dummy) copynum_ch=RCopyNum(copynum_input_ch .combine(Channel.fromList( [params.bin_CNroi, params.bin_CNfull] )) diff --git a/modules/stvariant.nf b/modules/stvariant.nf index e7fcb10..6dc788a 100644 --- a/modules/stvariant.nf +++ b/modules/stvariant.nf @@ -106,7 +106,6 @@ process RCopyNum { tuple val(groupId),val(parentId),path(refpath),val(bsref), path(mergedparent), - val(bamfilenames), path(bams), val(bins),path(script) @@ -118,7 +117,7 @@ process RCopyNum { Rscript --vanilla ${script} \ --samplegroup ${groupId} \ --parentId ${parentId} \ - --bams "${bamfilenames}" \ + --bams "${bams}" \ --bin_in_kbases ${bins} \ --refDir ${refpath}\ --bsref ${bsref} From 8d064ab01dc2a24ca4084102892794e1e5c2c8e7 Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Wed, 9 Jul 2025 15:49:58 +1000 Subject: [PATCH 6/7] genesROI: add script to input tuple --- main.nf | 2 +- modules/stvariant.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 1745e6f..503a1f8 100644 --- a/main.nf +++ b/main.nf @@ -236,7 +236,7 @@ workflow { RPlotROI(plot_ch.combine(Channel.fromPath("${projectDir}/Rtools/malDrugR/copynumPlotsROI.R"))) //----------------------List genes in a region of interest, if specified------ - // Input Channel emits val(groupId) + // Input Channel emits path(refpath), val(ref_prefix) input_ch.map{row -> tuple(row[6],row[3])}.unique().set{genesROI_input_ch} if (params.genesRegion != "") diff --git a/modules/stvariant.nf b/modules/stvariant.nf index 052189e..0bff05f 100644 --- a/modules/stvariant.nf +++ b/modules/stvariant.nf @@ -234,7 +234,7 @@ process genesROI { publishDir "${params.outdir}/variants/copynumPlots", mode: 'copy' input: - tuple path(refpath),val(prefix) + tuple path(refpath),val(prefix),path(script) output: path("*.csv") @@ -244,7 +244,7 @@ process genesROI { Rscript --vanilla ${projectDir}/Rtools/malDrugR/genesROI.R \ --refpath ${refpath} \ --refstrain ${prefix} \ - --region ${params.genesRegion} + --region "${params.genesRegion}" """ } From 86f2971f7c620524d067e61f249a193882f924d7 Mon Sep 17 00:00:00 2001 From: Jocelyn Penington Date: Wed, 9 Jul 2025 15:50:40 +1000 Subject: [PATCH 7/7] genesROI.R: specify dplyr::filter --- Rtools/malDrugR/genesROI.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Rtools/malDrugR/genesROI.R b/Rtools/malDrugR/genesROI.R index 1374112..e9170c2 100644 --- a/Rtools/malDrugR/genesROI.R +++ b/Rtools/malDrugR/genesROI.R @@ -27,7 +27,7 @@ pfCurrRefs <- data.frame( strain = c("3D7", "Dd2", "Supp"), version = c("52", "57", "52") ) -ref <- filter(pfCurrRefs, strain == argv$refstrain) +ref <- dplyr::filter(pfCurrRefs, strain == argv$refstrain) ## Not looking for regions of interest in supplementary sequences yet # ---------- Read genomic features -------------------------------------------- @@ -48,9 +48,9 @@ pf_features <- tryCatch( # ---------- Find features in region of interest ------------------------------ chrname <- str_remove(argv$region, ":.+") gstart <- str_remove(argv$region, ".+:") |> - str_remove("-.+") + str_remove("-.+") |> as.numeric() gend <- str_remove(argv$region, ".+:") |> - str_remove(".+-") + str_remove(".+-") |> as.numeric() roigr <- GRanges( seqnames = levels(seqnames(pf_features)) |> str_subset(paste0("Pf", ref$strain, "_", chrname)),