nf-core
diff --git a/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎assets/rank_model_genmod_gicam.ini‎
Lines changed: 121 additions & 0 deletions b/‎assets/rank_model_genmod_gicam.ini‎
Lines changed: 121 additions & 0 deletions
diff --git a/‎conf/modules/rank_variants.config‎
Lines changed: 12 additions & 0 deletions b/‎conf/modules/rank_variants.config‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docs/output.md‎
Lines changed: 10 additions & 0 deletions b/‎docs/output.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/usage.md‎
Lines changed: 2 additions & 1 deletion b/‎docs/usage.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎main.nf‎
Lines changed: 6 additions & 0 deletions b/‎main.nf‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎modules/local/gicam/main.nf‎
Lines changed: 33 additions & 0 deletions b/‎modules/local/gicam/main.nf‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎modules/local/gicam/meta.yml‎
Lines changed: 78 additions & 0 deletions b/‎modules/local/gicam/meta.yml‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎modules/local/gicam/tests/main.nf.test‎
Lines changed: 28 additions & 0 deletions b/‎modules/local/gicam/tests/main.nf.test‎
Lines changed: 28 additions & 0 deletions
@@ -32,6 +32,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Parameters `save_all_mapped_as_cram` and `save_noalt_mapped_as_cram` to replace `save_mapped_as_cram`, allowing independent control over publishing unfiltered and alt-filtered alignment files as CRAM [#807](https://github.com/nf-core/raredisease/pull/807)
 - Parameter `run_vcfanno_db_sanity_check` to check vcfanno database files for zero records and remove the corresponding annotation blocks from the TOML config before running vcfanno [#821](https://github.com/nf-core/raredisease/pull/821)
 - Added `--skip_split_multiallelics` parameter to allow users to skip the `bcftools norm --multiallelics -both` step in SNV calling (DeepVariant and Sentieon), which can cause indel quality degradation in single-interval runs [#823](https://github.com/nf-core/raredisease/pull/823)
+- SNV scoring with the MIVMIR and GICAM machine learning models, enabled with the `--rank_with_mivmir_gicam` parameter [#812](https://github.com/nf-core/raredisease/pull/812)
 
 ### `Changed`
 
 
@@ -175,7 +175,7 @@ For more details about the output files and reports, please refer to the
 
 nf-core/raredisease was written in a collaboration between the Clinical Genomics nodes in Sweden, with major contributions from [Ramprasad Neethiraj](https://github.com/ramprasadn), [Anders Jemt](https://github.com/jemten), [Lucia Pena Perez](https://github.com/Lucpen), and [Mei Wu](https://github.com/projectoriented) at Clinical Genomics Stockholm.
 
-Additional contributors were [Sima Rahimi](https://github.com/sima-r), [Gwenna Breton](https://github.com/Gwennid) and [Emma Västerviga](https://github.com/EmmaCAndersson) (Clinical Genomics Gothenburg); [Halfdan Rydbeck](https://github.com/hrydbeck) and [Lauri Mesilaakso](https://github.com/ljmesi) (Clinical Genomics Linköping); [Subazini Thankaswamy Kosalai](https://github.com/sysbiocoder) (Clinical Genomics Örebro); [Annick Renevey](https://github.com/rannick), [Peter Pruisscher](https://github.com/peterpru) and [Eva Caceres](https://github.com/fevac) (Clinical Genomics Stockholm); [Ryan Kennedy](https://github.com/ryanjameskennedy) (Clinical Genomics Lund); [Anders Sune Pedersen](https://github.com/asp8200) (Danish National Genome Center) and [Lucas Taniguti](https://github.com/lmtani).
+Additional contributors were [Sima Rahimi](https://github.com/sima-r), [Gwenna Breton](https://github.com/Gwennid) and [Emma Västerviga](https://github.com/EmmaCAndersson) (Clinical Genomics Gothenburg); [Halfdan Rydbeck](https://github.com/hrydbeck) and [Lauri Mesilaakso](https://github.com/ljmesi) (Clinical Genomics Linköping); [Subazini Thankaswamy Kosalai](https://github.com/sysbiocoder) (Clinical Genomics Örebro); [Annick Renevey](https://github.com/rannick), [Peter Pruisscher](https://github.com/peterpru), [Eva Caceres](https://github.com/fevac) and [Tor Björgen](https://github.com/torbjorgen) (Clinical Genomics Stockholm); [Ryan Kennedy](https://github.com/ryanjameskennedy) (Clinical Genomics Lund); [Anders Sune Pedersen](https://github.com/asp8200) (Danish National Genome Center) and [Lucas Taniguti](https://github.com/lmtani).
 
 We thank the nf-core community for their extensive assistance in the development of this pipeline.
 
 
@@ -0,0 +1,121 @@
+[Version]
+  version = 1
+  name = rank_model_for_gicam
+
+[Categories]
+
+ [[inheritance_models]]
+   category_aggregation = min
+
+ [[variant_call_quality_filter]]
+   category_aggregation = sum
+
+[model_score]
+  category = variant_call_quality_filter
+  data_type = integer
+  description = Inheritance model score
+  field = INFO
+  info_key = ModelScore
+  record_rule = min
+  separators = ',',':',
+
+  [[not_reported]]
+    score = 0
+
+  [[low_qual]]
+    score = -5
+    lower = 0
+    upper = 10
+
+  [[medium_qual]]
+    score = -2
+    lower = 10
+    upper = 20
+
+  [[high_qual]]
+    score = 0
+    lower = 20
+    upper = 300
+
+[genetic_models]
+  category = inheritance_models
+  data_type = string
+  description = Inheritance models followed for the variant
+  field = INFO
+  info_key = GeneticModels
+  record_rule = max
+  separators = ',', ':', '|',
+
+ [[ad]]
+   priority = 1
+   score = 1
+   string = 'AD'
+
+ [[ad_dn]]
+   score = 1
+   priority = 1
+   string = 'AD_dn'
+
+ [[ar]]
+   score = 1
+   priority = 1
+   string = 'AR_hom'
+
+ [[ar_dn]]
+   score = 1
+   priority = 1
+   string = 'AR_hom_dn'
+
+ [[ar_comp]]
+   score = 1
+   priority = 1
+   string = 'AR_comp'
+
+ [[ar_comp_dn]]
+   score = 1
+   priority = 1
+   string = 'AR_comp_dn'
+
+ [[xr]]
+   score = 1
+   priority = 1
+   string = 'XR'
+
+ [[xr_dn]]
+   score = 1
+   priority = 1
+   string = 'XR_dn'
+
+ [[xd]]
+   score = 1
+   priority = 1
+   string = 'XD'
+
+ [[xd_dn]]
+   score = 1
+   priority = 1
+   string = 'XD_dn'
+
+ [[not_reported]]
+   score = -12
+
+[filter]
+  category = variant_call_quality_filter
+  data_type = string
+  description = The filters for the variant
+  field = FILTER
+  record_rule = min
+  separators = ';',
+
+  [[not_reported]]
+    score = 0
+
+  [[pass]]
+    score = 3
+    priority = 1
+    string = 'PASS'
+
+  [[dot]]
+    score = 3
+    priority = 2
+    string = '.'
@@ -83,6 +83,18 @@ process {
         ext.prefix = { "${meta.id}_snv_ranked_${meta.set}" }
     }
 
+    withName: '.*RANK_VARIANTS_SNV:TABIX_BGZIPTABIX_GICAM' {
+        ext.prefix = { "${meta.id}_snv_ranked_gicam_${meta.set}" }
+    }
+
+    withName: '.*RANK_VARIANTS_SNV:TABIX_BGZIPTABIX_GENMOD_GICAM' {
+        ext.prefix = { "${meta.id}_snv_ranked_${meta.set}" }
+    }
+
+    withName: '.*RANK_VARIANTS_SNV:BCFTOOLS_MERGE_GENMOD_GICAM' {
+        ext.args   = { "--columns MivmirScore,MivmirExplanation,GicamScore" }
+        ext.prefix = { "${meta.id}_snv_ranked_${meta.set}" }
+    }
 }
 
 //
 
@@ -69,6 +69,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
     - [Filtering and ranking](#filtering-and-ranking)
       - [Filter_vep](#filter_vep)
       - [GENMOD](#genmod)
+      - [MIVMIR, GICAM](#mivmir-gicam)
     - [Mobile element analysis](#mobile-element-analysis)
       - [Calling mobile elements](#calling-mobile-elements)
       - [Annotating mobile elements](#annotating-mobile-elements)
@@ -527,6 +528,15 @@ We recommend using vcfanno to annotate SNVs with precomputed CADD scores (files
 
 </details>
 
+#### MIVMIR, GICAM
+
+[MIVMIR](../modules/local/mivmir/meta.yml) and [GICAM](../modules/local/gicam/meta.yml) are two machine learning models used to
+infer a pathogenicity score for SNVs. In essence, MIVMIR infers SNV pathogenicity and GICAM improves precision for
+duo, trio, etc. analyses. MIVMIR and GICAM can be enabled by setting the `--rank_with_mivmir_gicam` feature flag, which
+adds the annotations `INFO/MivmirScore`, `INFO/MivmirExplanation` and `INFO/GicamScore`.
+Only `<case_id>_snv_ranked_<research|clinical>.vcf.gz` contains the above annotations.
+Refer to each module's `meta.yml` documentation for more information on required inputs and output formats.
+
 ### Mobile element analysis
 
 #### Calling mobile elements
 
@@ -290,7 +290,7 @@ The mandatory and optional parameters for each category are tabulated below.
 | vcfanno_toml<sup>3</sup>             | vep_filters/vep_filters_scout_fmt<sup>10</sup> |
 | vep_cache_version                    | cadd_resources<sup>11</sup>                    |
 | vep_cache<sup>4</sup>                | run_vcfanno_db_sanity_check<sup>12</sup>       |
-| gnomad_af<sup>5</sup>                |                                                |
+| gnomad_af<sup>5</sup>                | rank_with_mivmir_gicam<sup>13</sup>            |
 | score_config_snv<sup>6</sup>         |                                                |
 | variant_consequences_snv<sup>7</sup> |                                                |
 | vep_plugin_files<sup>8</sup>         |                                                |
@@ -310,6 +310,7 @@ no header and the following columns: `CHROM POS REF_ALLELE,ALT_ALLELE AF`. Sampl
 <sup>10</sup> This file contains a list of candidate genes (with [HGNC](https://www.genenames.org/) IDs) that is used to split the variants into candidate variants and research variants. Research variants contain all the variants, while candidate variants are a subset of research variants and are associated with candidate genes. Sample file [here](https://github.com/nf-core/test-datasets/blob/raredisease/reference/hgnc.txt). Not required if `--skip_subworkflows generate_clinical_set` is set. To skip this splitting entirely, add `generate_clinical_set` to `--skip_subworkflows`.<br />
 <sup>11</sup>Path to a folder containing cadd annotations. Equivalent of the data/annotations/ folder described [here](https://github.com/kircherlab/CADD-scripts/#manual-installation), and it is used to calculate CADD scores for small indels. <br />
 <sup>12</sup>When set to `true`, each vcfanno database file listed in `vcfanno_resources` is checked for records (non-header lines). Any database with zero records is removed from the vcfanno TOML config before annotation runs. Default: `false`.<br />
+<sup>13</sup> Enables scoring of SNVs and indels using the MIVMIR and GICAM machine learning models.
 
 :::note
 We use CADD only to annotate small indels. To annotate SNVs with precomputed CADD scores, pass the file containing CADD scores as a resource to vcfanno instead. Files containing the precomputed CADD scores for SNVs can be downloaded from [here](https://cadd.gs.washington.edu/download) (download files listed under the description: "All possible SNVs of GRCh3<7/8>/hg3<7/8>")
 
@@ -106,6 +106,7 @@ workflow NFCORE_RAREDISEASE {
     val_readcount_intervals
     val_reduced_penetrance
     val_rtg_truthvcfs
+    val_rank_with_mivmir_gicam
     val_run_mt_for_wes
     val_run_rtgvcfeval
     val_run_vcfanno_db_sanity_check
@@ -217,6 +218,8 @@ workflow NFCORE_RAREDISEASE {
     ch_score_config_mt          = channelFromPath(val_score_config_mt, true)
     ch_score_config_snv         = channelFromPath(val_score_config_snv, true)
     ch_score_config_sv          = channelFromPath(val_score_config_sv, true)
+    // ch_genmod_gicam_score_config is integral to GICAM inference; it cannot be changed without retraining gicam
+    ch_score_config_genmod_gicam = channel.fromPath("$projectDir/assets/rank_model_genmod_gicam.ini", checkIfExists: true).collect()
     ch_vcf2cytosure_blacklist   = channelFromPath(val_vcf2cytosure_blacklist, true)
     ch_vcfanno_lua              = channelFromPath(val_vcfanno_lua, true)
     ch_vcfanno_toml             = channelFromPath(val_vcfanno_toml, true)
@@ -434,6 +437,7 @@ workflow NFCORE_RAREDISEASE {
         ch_score_config_mt,
         ch_score_config_snv,
         ch_score_config_sv,
+        ch_score_config_genmod_gicam,
         ch_sdf,
         ch_sentieon_pcr_indel_model,
         ch_subdepth,
@@ -511,6 +515,7 @@ workflow NFCORE_RAREDISEASE {
         val_mt_subsample_rd,
         val_mt_subsample_seed,
         val_platform,
+        val_rank_with_mivmir_gicam,
         val_run_mt_for_wes,
         val_run_rtgvcfeval,
         val_run_vcfanno_db_sanity_check,
@@ -624,6 +629,7 @@ workflow {
         params.readcount_intervals,
         params.reduced_penetrance,
         params.rtg_truthvcfs,
+        params.rank_with_mivmir_gicam,
         params.run_mt_for_wes,
         params.run_rtgvcfeval,
         params.run_vcfanno_db_sanity_check,
 
@@ -0,0 +1,33 @@
+process GICAM_INFER {
+    // GICAM inference on a MIVMIR- and GENMOD-annotated VCF: https://github.com/Clinical-Genomics/rdds/tree/master/src/rdds/gicam
+
+    tag "${meta.id}"
+    label 'process_high'
+    // The image tag below must stay in sync with the version string emitted under `output:`
+    container "docker.io/clinicalgenomics/rdds_mivmir:v1.12.0-rc6"
+    // Scratch directory in the task work dir, bound to /rdds/tmp for Singularity/Apptainer; -p / -f keep setup and cleanup idempotent
+    beforeScript "mkdir -p ${task.workDir}/rdds-tmp"
+    afterScript "rm -rf ${task.workDir}/rdds-tmp"
+    containerOptions {[
+        // Apptainer reports its own engine name but accepts the same --bind option as Singularity
+        ['singularity', 'apptainer'].contains(workflow.containerEngine) ? "--bind ${task.workDir}/rdds-tmp:/rdds/tmp" : "",
+        workflow.containerEngine == 'docker' ? "--tmpfs /rdds/tmp" : ""
+    ].minus("").join(" ")}
+
+    input:
+    tuple val(meta), path(input_vcf)
+
+    output:
+    tuple val(meta), path('*-predictions.vcf'), emit: vcf
+    tuple val("${task.process}"), val('gicam'), val('v1.12.0-rc6'), topic: versions, emit: versions_gicam
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    . /opt/pyenv/bin/activate
+    export PYTHONPATH=/rdds/src
+    python3 -m rdds.gicam infer-vcf --cpu_cores ${task.cpus} ${input_vcf}
+    """
+}
@@ -0,0 +1,78 @@
+name: gicam
+description: "Machine learning tool to improve precision for duo, trio, ... analysis"
+keywords:
+  - score
+  - ranking
+  - gicam
+tools:
+  - gicam:
+      description: Model for SNV ranking in conjunction with MIVMIR.
+        This model improves precision for duo, trio, ... analysis situations by reducing MIVMIR scores for variants that
+        do not follow the appropriate GENMOD genetic inheritance model.
+        Applied to MIVMIR scores as a post-processing step.
+
+        VCF key inputs to the model are
+        - MivmirScore, (0, 1)
+        - RankScoreNormalized, (0, 1) as produced by GENMOD using rank_model_genmod_gicam.ini custom scoring config
+
+        The tool adds one key to the VCF
+        - GicamScore (0, 1) where 1.0 inferred pathogenic.
+
+        !NOTE! GICAM is optimized for the GENMOD scoring config present in this directory, that
+        generates RankScoreNormalized. Changing the GENMOD scoring config will break inference
+        (unless GICAM is first retrained on the new config).
+      homepage: https://github.com/Clinical-Genomics/rdds
+      documentation: https://github.com/Clinical-Genomics/rdds/tree/master/src/rdds/gicam
+      doi: ""
+      licence: ["MIT"]
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. [ id:'test', single_end:false ]
+    - input_vcf:
+        type: file
+        description: vcf file
+        pattern: "*.{vcf}"
+        ontologies: []
+output:
+  vcf:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test', single_end:false ]
+      - "*-predictions.vcf":
+          type: file
+          description: Scored output VCF file
+          pattern: "*.{vcf}"
+          ontologies: []
+  versions_gicam:
+    - - "${task.process}":
+          type: string
+          description: The process the versions were collected from
+      - "gicam":
+          type: string
+          description: The tool name
+      - "version":
+          type: string
+          description: Tool version
+
+topics:
+  versions:
+    - - "${task.process}":
+          type: string
+          description: The process the versions were collected from
+      - "gicam":
+          type: string
+          description: The tool name
+      - "version":
+          type: string
+          description: Tool version
+
+authors:
+  - "@torbjorgen"
+maintainers:
+  - "@torbjorgen"
@@ -0,0 +1,28 @@
+nextflow_process {
+    // Runs GICAM_INFER on a pre-annotated VCF and snapshots every output channel
+    name "Test Process GICAM_INFER"
+    script "modules/local/gicam/main.nf"
+    process "GICAM_INFER"
+
+    test("Test GICAM inference on annotated VCF") {
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    // VCF as annotated by rank_variants subworkflow (including mivmir and genmod custom inheritance config), prior to gicam inference call
+                    file(params.pipelines_testdata_base_path + 'testdata/justhusky_snv_gicam.vcf', checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            // assertAll evaluates both checks so a snapshot mismatch is still reported when the process fails
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+}