diff --git a/conf/test_genotyping.config b/conf/test_genotyping.config index 6313eb23..222510be 100644 --- a/conf/test_genotyping.config +++ b/conf/test_genotyping.config @@ -27,4 +27,7 @@ params { // Genotyping genotyping = true + single_clone_representative = true + // Skip clonal analysis + skip_clonal_analysis = true } diff --git a/modules/local/enchantr/bayesian_genotype_inference.nf b/modules/local/enchantr/bayesian_genotype_inference.nf index fb8ad57f..eea65b06 100644 --- a/modules/local/enchantr/bayesian_genotype_inference.nf +++ b/modules/local/enchantr/bayesian_genotype_inference.nf @@ -27,12 +27,10 @@ process BAYESIAN_GENOTYPE_INFERENCE { container "docker.io/immcantation/airrflow:genotyping" input: - tuple val(meta), path(tabs) // meta, sequence tsv in AIRR format - path reference_fasta - path repertoires_samplesheet + tuple val(meta), path(tabs), path(reference_fasta) // meta, sequence tsv in AIRR format output: - path("*/*/db_genotype"), emit: reference // reference folder + tuple val(meta), path("*_report/references/*/db_genotype"), emit: reference // reference folder path("*/*_command_log.txt"), emit: logs //process logs path "*_report" path "versions.yml", emit: versions @@ -40,19 +38,14 @@ process BAYESIAN_GENOTYPE_INFERENCE { script: def args = task.ext.args ? asString(task.ext.args) : '' - def input = "" - if (repertoires_samplesheet) { - input = repertoires_samplesheet - } else { - input = tabs.join(',') - } + def input = tabs.join(',') """ Rscript -e "enchantr::enchantr_report('tigger_bayesian_genotype', \\ report_params=list('input'='${input}', \\ 'imgt_db'='${reference_fasta}', \\ 'species'='auto', \\ 'genotypeby'='${params.genotypeby}', \\ - 'force'=FALSE, \\ + 'single_clone_representative'='${params.single_clone_representative}', \\ 'outdir'=getwd(), \\ 'log'='${meta.id}_bayesian_genotype_inference_command_log' ${args}))" diff --git a/modules/local/enchantr/novel_allele_inference.nf b/modules/local/enchantr/novel_allele_inference.nf index 2d5b2d9a..926fe5b5 100644 --- a/modules/local/enchantr/novel_allele_inference.nf +++ b/modules/local/enchantr/novel_allele_inference.nf @@ -27,12 +27,10 @@ process NOVEL_ALLELE_INFERENCE { container "docker.io/immcantation/airrflow:genotyping" input: - tuple val(meta), path(tabs) // meta, sequence tsv in AIRR format - path reference_fasta - path repertoires_samplesheet + tuple val(meta), path(tabs), path(reference_fasta) // meta, sequence tsv in AIRR format, reference fasta output: - path("*/*/db_novel"), emit: reference // reference folder + tuple val(meta), path("*_report/db_novel"), emit: reference // reference folder path("*/*_command_log.txt"), emit: logs //process logs path "*_report", optional: true, emit: report path "versions.yml", emit: versions @@ -40,18 +38,12 @@ process NOVEL_ALLELE_INFERENCE { script: def args = task.ext.args ? asString(task.ext.args) : '' - def input = "" - if (repertoires_samplesheet) { - input = repertoires_samplesheet - } else { - input = tabs.join(',') - } + def input = tabs.join(',') """ Rscript -e "enchantr::enchantr_report('novel_allele_inference', \\ report_params=list('input'='${input}', \\ 'imgt_db'='${reference_fasta}', \\ 'species'='auto', \\ - 'force'=FALSE, \\ 'outdir'=getwd(), \\ 'nproc'=${task.cpus}, \\ 'log'='${meta.id}_novel_allele_inference_command_log' ${args}))" diff --git a/modules/local/enchantr/reassign_alleles.nf b/modules/local/enchantr/reassign_alleles.nf index e17e9acb..ec4d07a9 100644 --- a/modules/local/enchantr/reassign_alleles.nf +++ b/modules/local/enchantr/reassign_alleles.nf @@ -27,15 +27,14 @@ process REASSIGN_ALLELES { container "docker.io/immcantation/airrflow:genotyping" input: - tuple val(meta), path(tabs) // meta, sequence tsv in AIRR format - path reference_fasta - path repertoires_samplesheet + tuple val(meta), path(tabs), path(reference_fasta) // meta, sequence tsv in AIRR format, reference fasta val segments // which segments to reassign alleles to + val outputby // which field to use for output //TODO: did we want to handle all segments at once? Then this val channel would not be needed. - + // *After novel alleles we just need to change the V, it's a time waste to go over all segments. + //TODO: Check if we need the outputby parameter. Right now this is the same as the genotypeby parameter. output: - path("*/*/db_genotype"), emit: reference // reference folder - path("*/*_reassigned.tsv"), emit: repertoires // reassigned repertoire + tuple val(meta), path("*/*/*reassign-pass.tsv"), emit: tab // reassigned repertoire path("*/*_command_log.txt"), emit: logs //process logs path "*_report" path "versions.yml", emit: versions @@ -44,20 +43,15 @@ process REASSIGN_ALLELES { script: def args = task.ext.args ? asString(task.ext.args) : '' def segs = segments.join(",") - def input = "" - if (repertoires_samplesheet) { - input = repertoires_samplesheet - } else { - input = tabs.join(',') - } + def input = tabs.join(',') + """ Rscript -e "enchantr::enchantr_report('reassign_alleles', \\ report_params=list('input'='${input}', \\ 'imgt_db'='${reference_fasta}', \\ 'species'='auto', \\ - 'outputby'='${params.outputby}', \\ + 'outputby'='${outputby}', \\ 'segments'='${segs}', \\ - 'force'=FALSE, \\ 'outdir'=getwd(), \\ 'log'='${meta.id}_reassign_alleles_command_log' ${args}))" diff --git a/nextflow.config b/nextflow.config index a67338b0..4d4520e5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -97,7 +97,7 @@ params { remove_chimeric = false detect_contamination = false collapseby = 'sample_id' - + // ----------------------- // clonal analysis options // ----------------------- @@ -121,8 +121,10 @@ params { // ----------------------- genotyping = false genotypeby = 'subject_id' + reassignby = 'sample_id' novel_allele_inference = true - + genotype_clone_threshold = '0.2' + single_clone_representative = true // ----------------------- // translate embed options // ----------------------- diff --git a/nextflow_schema.json b/nextflow_schema.json index 364fbff3..a1748858 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -553,6 +553,42 @@ "help_text": "By default, the pipeline will define clones for each of the samples, as two sequences having the same V-gene assignment, C-gene assignment, J-gene assignment, and junction length. Additionally, the similarity of the CDR3 sequences will be assessed by Hamming distances. \n\nA distance threshold for determining if two sequences come from the same clone or not is automatically determined by the process find threshold. Alternatively, a hamming distance threshold can be manually set by setting the `--clonal_threshold` parameter.", "fa_icon": "fab fa-pagelines" }, + "genotyping_and_novel_alleles_options": { + "title": "Genotyping and Novel Alleles options", + "type": "object", + "description": "Options for genotyping and novel allele inference.", + "default": "", + "properties": { + "genotyping": { + "type": "boolean", + "description": "Perform TIgGER genotype inference.", + "fa_icon": "fas fa-dna" + }, + "genotypeby": { + "type": "string", + "default": "subject_id", + "description": "Name of the field used to group data files to infer genotype.", + "fa_icon": "fab fa-pagelines" + }, + "genotype_clone_threshold": { + "type": "string", + "default": "0.2", + "description": "Threshold for determining if two sequences come from the same clone or not.", + "fa_icon": "fas fa-dna" + }, + "single_clone_representative": { + "type": "boolean", + "description": "Perform TIgGER novel allele inference.", + "fa_icon": "fas fa-dna" + }, + "novel_allele_inference": { + "type": "boolean", + "description": "Perform TIgGER novel allele inference.", + "fa_icon": "fas fa-dna" + } + }, + "fa_icon": "fas fa-dna" + }, "translation_and_embedding_options": { "title": "Translation and embedding options", "type": "object", @@ -847,6 +883,9 @@ { "$ref": "#/$defs/clonal_analysis_options" }, + { + "$ref": "#/$defs/genotyping_and_novel_alleles_options" + }, { "$ref": "#/$defs/translation_and_embedding_options" }, diff --git a/subworkflows/local/novel_alleles_and_genotyping.nf b/subworkflows/local/novel_alleles_and_genotyping.nf index 19e92596..85c7b094 100644 --- a/subworkflows/local/novel_alleles_and_genotyping.nf +++ b/subworkflows/local/novel_alleles_and_genotyping.nf @@ -1,68 +1,112 @@ include { NOVEL_ALLELE_INFERENCE } from '../../modules/local/enchantr/novel_allele_inference' include { BAYESIAN_GENOTYPE_INFERENCE } from '../../modules/local/enchantr/bayesian_genotype_inference' include { REASSIGN_ALLELES as REASSIGN_ALLELES_NOVEL; REASSIGN_ALLELES as REASSIGN_ALLELES_GENOTYPE} from '../../modules/local/enchantr/reassign_alleles' - +include { CLONAL_ANALYSIS } from './clonal_analysis.nf' +include { CLONAL_ASSIGNMENT as CLONAL_ASSIGNMENT_COMPUTE } from '../../modules/local/enchantr/clonal_assignment' workflow NOVEL_ALLELES_AND_GENOTYPING { take: ch_repertoire ch_reference_fasta ch_validated_samplesheet + ch_logo main: ch_versions = Channel.empty() ch_logs = Channel.empty() - + def outputby = params.genotypeby=="sample_id" ? "id" : params.genotypeby //TODO: we need to change this so we can handle the cases of inferring based on naive and reassigning all // merge all repertoires by genotypeby metadata field - ch_repertoire.map{ it -> [ it[0]."${params.genotypeby}", - it[0].id, - it[0].subject_id, - it[0].species, - it[0].single_cell, - it[0].locus, - it[1] ] } + ch_repertoire + .combine(ch_reference_fasta) + .map{ it -> + def meta = it[0] + def rep = it[1] + def ref = it[2] + def genotypeby = params.genotypeby=="sample_id" ? "id" : params.genotypeby + [ meta."${genotypeby}", + meta.id, + meta.subject_id, + meta.species, + meta.single_cell, + meta.locus, + rep, + ref ] } .groupTuple() .map{ get_meta_tabs(it) } .set{ ch_grouped_repertoires } - //TODO: conditional on params.novel_allele_inference // infer novel alleles - NOVEL_ALLELE_INFERENCE ( - ch_grouped_repertoires, - ch_reference_fasta, - ch_validated_samplesheet.collect() - ) - - // reassign novel alleles (we can skip this step if no novel alleles were inferred) - - REASSIGN_ALLELES_NOVEL ( - ch_grouped_repertoires, - NOVEL_ALLELE_INFERENCE.out.reference, - ch_validated_samplesheet.collect(), - "segments" //TODO: update this to pass actual segments. - ) - - - // infer genotype (gets the reference from novel alleles inference in any case) - + if (params.novel_allele_inference) { + NOVEL_ALLELE_INFERENCE ( + ch_grouped_repertoires + ) + + // reassign novel alleles (we can skip this step if no novel alleles were inferred) + ch_grouped_repertoires + .join(NOVEL_ALLELE_INFERENCE.out.reference) + .map { it -> + def meta = it[0] + def reps = it[1] + def new_ref = it[3] + [ meta, reps, new_ref ] + } + .set{ ch_for_genotyping } + + REASSIGN_ALLELES_NOVEL ( + ch_for_genotyping, + ["v"], + outputby + ) + + REASSIGN_ALLELES_NOVEL.out.tab + .join(NOVEL_ALLELE_INFERENCE.out.reference) + .set{ ch_for_genotyping } + + + } else { + ch_for_genotyping = ch_grouped_repertoires + } + + if (params.single_clone_representative) { + // TODO: Check if we need the cloneby parameter, or here it can be the same as genotypeby. + // create separate channels for repertoire and reference based on the genotypeby metadata field + ch_for_genotyping + .map{ it -> [it[0], it[1]] } + .set{ ch_for_genotyping_rep } + ch_for_genotyping + .map{ it -> it[2] } + .set{ ch_for_genotyping_ref } + CLONAL_ASSIGNMENT_COMPUTE( + ch_for_genotyping_rep, + [params.genotype_clone_threshold], + ch_for_genotyping_ref, + [] + ) + CLONAL_ASSIGNMENT_COMPUTE.out.tab + .join(ch_for_genotyping + .map{ it -> [it[0], it[2]] }) + .set{ ch_for_genotyping } + } + + // infer genotype BAYESIAN_GENOTYPE_INFERENCE ( - REASSIGN_ALLELES_NOVEL.out.repertoires, - NOVEL_ALLELE_INFERENCE.out.reference, - ch_validated_samplesheet.collect() + ch_for_genotyping ) + + ch_grouped_repertoires + .map{ it -> [it[0], it[1]] } + .join(BAYESIAN_GENOTYPE_INFERENCE.out.reference) + .set{ ch_for_reassign } - // reassign genotypes (gets the reference from genotype inference in any case) - + // reassign genotypes REASSIGN_ALLELES_GENOTYPE ( - REASSIGN_ALLELES_NOVEL.out.repertoires, - BAYESIAN_GENOTYPE_INFERENCE.out.reference, - ch_validated_samplesheet.collect(), - "segments" //TODO: update this to pass actual segments. + ch_for_reassign, + ["auto"], + outputby ) - emit: - repertoire = ch_repertoire + repertoire = REASSIGN_ALLELES_GENOTYPE.out.tab versions = ch_versions logs = ch_logs } @@ -79,7 +123,6 @@ def get_meta_tabs(arr) { def array = [] - array = [ meta, arr[6].flatten() ] - + array = [ meta, arr[6].flatten(), arr[7][0] ] return array } diff --git a/workflows/airrflow.nf b/workflows/airrflow.nf index 30abfb0b..037d0f55 100644 --- a/workflows/airrflow.nf +++ b/workflows/airrflow.nf @@ -247,19 +247,21 @@ workflow AIRRFLOW { ch_repertoires_after_qc = ch_bulk_filtered .mix(SINGLE_CELL_QC_AND_FILTERING.out.repertoires) - // Novel allele inference and genotyping + // TODO: for now clonal analysis and genotyping are independent, + // but once genotyping is implemented the personalized reference should be used for clonal analysis + // when genotyping is performed. + + // Novel alleles and genotype inference if (params.genotyping) { NOVEL_ALLELES_AND_GENOTYPING( ch_repertoires_after_qc, VDJ_ANNOTATION.out.reference_fasta.collect(), - ch_validated_samplesheet.collect() + ch_validated_samplesheet.collect(), + ch_report_logo_img.collect().ifEmpty([]) ) + ch_versions = ch_versions.mix( NOVEL_ALLELES_AND_GENOTYPING.out.versions ) } - // TODO: for now clonal analysis and genotyping are independent, - // but once genotyping is implemented the personalized reference should be used for clonal analysis - // when genotyping is performed. - // Clonal analysis if (!params.skip_clonal_analysis) { CLONAL_ANALYSIS(