Merge pull request #71 from mskcc/dev

nikhil · web-flow · commit 56a14111edfc · 2025-02-13T10:05:19.000-05:00
Release 1.2
diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Download lint results
-        uses: dawidd6/action-download-artifact@80620a5d27ce0ae443b965134db88467fc607b43 # v7
+        uses: dawidd6/action-download-artifact@20319c5641d495c8a52e688b7dc5fada6c3a9fbc # v8
         with:
           workflow: linting.yml
           workflow_conclusion: completed
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -19,7 +19,7 @@ lint:
   multiqc_config:
     - report_comment
   nextflow_config: false
-nf_core_version: 3.1.2
+nf_core_version: 3.2.0
 repository_type: pipeline
 template:
   author: Nikhil Kumar
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -7,7 +7,7 @@ repos:
           - prettier@3.2.5
 
   - repo: https://github.com/editorconfig-checker/editorconfig-checker.python
-    rev: "3.0.3"
+    rev: "3.1.2"
     hooks:
       - id: editorconfig-checker
         alias: ec
diff --git a/README.md b/README.md
@@ -36,7 +36,6 @@ sample,maf,facets_hisens_cncf,hla_file
 tumor_normal,temp_test_somatic_unfiltered.maf,facets_hisens.cncf.txt,winners.hla.txt
 tumor_normal2,temp_test_somatic_unfiltered.maf,facets_hisens.cncf.txt,winners.hla.txt
 ```
--->
 
 Now, you can run the pipeline using:
 
diff --git a/conf/modules.config b/conf/modules.config
@@ -28,4 +28,8 @@ process {
     withName: 'PHYLOWGS_WRITERESULTS' {
         ext.args = '--max-multiprimary 1.0'
     }
+
+    withName: 'NEOANTIGENUTILS_NEOANTIGENINPUT' {
+        ext.args = "--kD_cutoff ${params.kd_cutoff}"
+    }
 }
diff --git a/conf/prod.config b/conf/prod.config
@@ -36,6 +36,7 @@ params {
     phylo_burnin_samples = 1000
     phylo_mcmc_samples = 2500
     phylo_num_chains = 15
+    kd_cutoff = 500
 
     iedbfasta = 'https://raw.githubusercontent.com/mskcc/NeoantigenEditing/refs/heads/main/data/iedb.fasta'
     cds = 'https://github.com/mskcc-omics-workflows/test-datasets/raw/neoantigen/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz'
diff --git a/conf/test.config b/conf/test.config
@@ -32,6 +32,7 @@ params {
     phylo_mcmc_samples = 2
     phylo_num_chains = 2
     netmhc3 = true
+    kd_cutoff = 500
 
     iedbfasta = 'https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/neoantigenEditing/data/iedb.fasta'
     cds = 'https://github.com/mskcc-omics-workflows/test-datasets/raw/neoantigen/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz'
diff --git a/docs/output.md b/docs/output.md
@@ -2,7 +2,7 @@
 
 ## Introduction
 
-This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.
+This document describes the output produced by the neoantigen pipeline.
 
 The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
 
diff --git a/modules.json b/modules.json
@@ -37,7 +37,7 @@
                     },
                     "neoantigenutils/neoantigeninput": {
                         "branch": "develop",
-                        "git_sha": "003587a171d6cfa80bc894950d212add9f206f88",
+                        "git_sha": "ba014f40a3aaccd6a78db44f62d697b77a790eb8",
                         "installed_by": ["modules"]
                     },
                     "netmhc3": {
@@ -107,7 +107,7 @@
                 "nf-core": {
                     "multiqc": {
                         "branch": "master",
-                        "git_sha": "f80914f78fb7fa1c00b14cfeb29575ee12240d9c",
+                        "git_sha": "81880787133db07d9b4c1febd152c090eb8325dc",
                         "installed_by": ["modules"]
                     }
                 }
diff --git a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
@@ -8,7 +8,7 @@
 from Bio.pairwise2 import format_alignment
 import numpy as np
 
-VERSION = 1.8
+VERSION = 1.9
 
 
 def main(args):
@@ -340,7 +340,7 @@ def find_first_difference_index(str1, str2):
     WTdict = {}
     SVWTdict = {}
     for index_WT, row_WT in neoantigen_WT_in.iterrows():
-        noposID = ""
+        no_positon_ID = ""
         id = ""
         wtsvid = ""
         row_WT_identity = trim_id(row_WT["Identity"])
@@ -358,7 +358,7 @@ def find_first_difference_index(str1, str2):
                 + "_"
                 + str(row_WT["pos"])
             )
-            noposID = (
+            no_positon_ID = (
                 IDsplit[0]
                 + "_"
                 + IDsplit[1][0:7]
@@ -372,16 +372,16 @@ def find_first_difference_index(str1, str2):
                 "peptide": row_WT["peptide"],
             }
             id = wtsvid
-            if noposID not in WTdict:
-                WTdict[noposID] = {
+            if no_positon_ID not in WTdict:
+                WTdict[no_positon_ID] = {
                     "peptides": {
                         row_WT["peptide"]: id
                     },  # This is a dict so we can match the peptide with the actual ID later
                     "affinity": row_WT["affinity"],
                 }
 
             else:
-                WTdict[noposID]["peptides"][row_WT["peptide"]] = id
+                WTdict[no_positon_ID]["peptides"][row_WT["peptide"]] = id
 
         else:
             id = (
@@ -394,7 +394,7 @@ def find_first_difference_index(str1, str2):
                 + str(row_WT["pos"])
             )
 
-            noposID = (
+            no_positon_ID = (
                 row_WT_identity[:-2]
                 + "_"
                 + str(len(row_WT["peptide"]))
@@ -404,16 +404,16 @@ def find_first_difference_index(str1, str2):
             WTdict[id] = {"affinity": row_WT["affinity"], "peptide": row_WT["peptide"]}
 
             # This is used as last resort for the matching.  We will preferentially find the peptide matching in length as well as POS. Worst case we will default to the WT pos 0
-            if noposID not in WTdict:
-                WTdict[noposID] = {
+            if no_positon_ID not in WTdict:
+                WTdict[no_positon_ID] = {
                     "peptides": {
                         row_WT["peptide"]: id
                     },  # This is a dict so we can match the peptide with the ID later
                     "affinity": row_WT["affinity"],
                 }
 
             else:
-                WTdict[noposID]["peptides"][row_WT["peptide"]] = id
+                WTdict[no_positon_ID]["peptides"][row_WT["peptide"]] = id
 
     def find_most_similar_string(target, strings):
         max_score = -1
@@ -457,9 +457,10 @@ def find_most_similar_string(target, strings):
         row_MUT_identity = trim_id(row_mut["Identity"])
         IDsplit = row_MUT_identity.split("_")
         SV = False
-        if row_mut["affinity"] < 500:
+        if row_mut["affinity"] < float(args.kD_cutoff):
             peplen = len(row_mut["peptide"])
             matchfound = False
+            frameshift= False
             if IDsplit[1][0] == "S" and IDsplit[1][1] != "p":
                 # If it is a silent mutation.  Silent mutations can either be S or SY. These include intron mutations.  Splices can be Sp
                 continue
@@ -476,7 +477,7 @@ def find_most_similar_string(target, strings):
                     + "_"
                     + str(row_mut["pos"])
                 )
-                noposID = (
+                no_positon_ID = (
                     IDsplit[0]
                     + "_"
                     + IDsplit[1][0:8]
@@ -500,46 +501,40 @@ def find_most_similar_string(target, strings):
                     + "_"
                     + str(row_mut["pos"])
                 )
-                noposID = (
+                no_positon_ID = (
                     row_MUT_identity[:-2]
                     + "_"
                     + str(peplen)
                     + "_"
                     + row_mut["MHC"].split("-")[1].replace(":", "").replace("*", "")
                 )
             if (
-                WTid in WTdict
-                and ("M" == IDsplit[1][0] and "Sp" not in row_MUT_identity)
+                 ("M" == IDsplit[1][0] and "Sp" not in row_MUT_identity)
                 or SV == False
             ):
                 # match
-                matchfound = True
-                best_pepmatch = WTdict[WTid]["peptide"]
-                frameshift = False
-
-            else:
                 if (
-                    "-" in row_MUT_identity
-                    or "+" in row_MUT_identity
-                    and WTid in WTdict
-                    or SV == False
-                ):
-                    # Means there is a frame shift and we don't need to do a analysis of 5' end and 3' end as 3' end is no longer recognizeable/comparable to the WT sequence at all
-                    # We can just move the windows along together. There will likely be little to no match with the WT peptides.
+                    (WTid in WTdict)
+                    and IDsplit[1][0] != "I"
+                    ):
+                    # This block takes care of missense mutations caused by polymorphisms
                     matchfound = True
                     best_pepmatch = WTdict[WTid]["peptide"]
-                    frameshift = False
+                    
                 else:
-                    # Here we take care of frameshifted peptides
-                    frameshift = True
+                    # Here we take care of INDELS and everything else
+
+                    if ("-" in IDsplit[1] or "+" in IDsplit[1]):
+                        frameshift = True
+
                     (
                         best_pepmatch,
                         best_pepmatch2,
                         first_AA_same,
                         first_AA_same_score,
                         match_score,
                     ) = find_most_similar_string(
-                        row_mut["peptide"], list(WTdict[noposID]["peptides"].keys())
+                        row_mut["peptide"], list(WTdict[no_positon_ID]["peptides"].keys())
                     )
                     if (
                         best_pepmatch == row_mut["peptide"]
@@ -556,14 +551,14 @@ def find_most_similar_string(target, strings):
                         best_pepmatch[-1] != row_mut["peptide"][-1]
                         and best_pepmatch2[-1] == row_mut["peptide"][-1]
                     ):
-                        # We should preferentially match the first AA if we can.  I have found that the pairwise alignment isnt always the best at this.
+                        # We should preferentially match the first AA if we can.  Sometimes the pairwise alignment isn't the best at this, so we do a little check here.
                         # It will also do this when the last AA of the best match doesnt match but the last A of the second best match does
                         best_pepmatch = best_pepmatch2
 
-                    WTid = WTdict[noposID]["peptides"][best_pepmatch]
+                    WTid = WTdict[no_positon_ID]["peptides"][best_pepmatch]
                     matchfound = True
 
-            if matchfound == True:
+            if matchfound == True and best_pepmatch != row_mut["peptide"]:
                 mut_pos = (
                     find_first_difference_index(
                         row_mut["peptide"], best_pepmatch  # WTdict[WTid]["peptide"]
@@ -934,6 +929,10 @@ def parse_args():
         "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
     )
 
+    parser.add_argument(
+        "--kD_cutoff", default=500, help="Cutoff value for the kD, default is 500",
+    )
+
     return parser.parse_args()
 
 
@@ -948,6 +947,7 @@ def parse_args():
     print("Cohort:", args.cohort)
     print("HLA Genes File:", args.HLA_genes)
     print("netMHCpan Files:", args.netMHCpan_MUT_input, args.netMHCpan_WT_input)
+    print("kD Cutoff Value:", args.kD_cutoff)
     if args.patient_data_file:
         print("patient_data_file File:", args.patient_data_file)
 
diff --git a/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test.snap b/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test.snap
@@ -29,10 +29,10 @@
             }
         ],
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "24.04.4"
+            "nf-test": "0.8.4",
+            "nextflow": "24.04.2"
         },
-        "timestamp": "2024-11-29T15:37:10.311539"
+        "timestamp": "2025-02-05T14:38:57.427385617"
     },
     "neoantigenutils_neoantigeninput - bedpe,json,tsv": {
         "content": [
@@ -43,7 +43,7 @@
                             "id": "test",
                             "single_end": false
                         },
-                        "test_patient_test_.json:md5,401207e1ed3fb2708291a8eeff5efcd7"
+                        "test_patient_test_.json:md5,7fdb25ccc6ed41f53aa03507d982ea05"
                     ]
                 ],
                 "1": [
@@ -55,7 +55,7 @@
                             "id": "test",
                             "single_end": false
                         },
-                        "test_patient_test_.json:md5,401207e1ed3fb2708291a8eeff5efcd7"
+                        "test_patient_test_.json:md5,7fdb25ccc6ed41f53aa03507d982ea05"
                     ]
                 ],
                 "versions": [
@@ -64,10 +64,10 @@
             }
         ],
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "24.04.4"
+            "nf-test": "0.8.4",
+            "nextflow": "24.04.2"
         },
-        "timestamp": "2024-11-29T15:36:59.107384"
+        "timestamp": "2025-02-05T14:38:35.2939696"
     },
     "neoantigenutils_neoantigeninput - json,tsv": {
         "content": [
@@ -78,7 +78,7 @@
                             "id": "test",
                             "single_end": false
                         },
-                        "test_patient_test_.json:md5,f9db7a487cbd4aad167819d885a9a9e3"
+                        "test_patient_test_.json:md5,a81ee3977fa393850e0b7b36321d1143"
                     ]
                 ],
                 "1": [
@@ -90,7 +90,7 @@
                             "id": "test",
                             "single_end": false
                         },
-                        "test_patient_test_.json:md5,f9db7a487cbd4aad167819d885a9a9e3"
+                        "test_patient_test_.json:md5,a81ee3977fa393850e0b7b36321d1143"
                     ]
                 ],
                 "versions": [
@@ -99,9 +99,9 @@
             }
         ],
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "24.04.4"
+            "nf-test": "0.8.4",
+            "nextflow": "24.04.2"
         },
-        "timestamp": "2024-11-29T15:37:06.847468"
+        "timestamp": "2025-02-05T14:38:48.778108545"
     }
 }
diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml
diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf
diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap
diff --git a/nextflow.config b/nextflow.config
diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json

Original file line number	Diff line number	Diff line change
`@@ -28,4 +28,8 @@ process {`
`28`	`28`	`withName: 'PHYLOWGS_WRITERESULTS' {`
`29`	`29`	`ext.args = '--max-multiprimary 1.0'`
`30`	`30`	`}`
	`31`	`+`
	`32`	`+ withName: 'NEOANTIGENUTILS_NEOANTIGENINPUT' {`
	`33`	`+ ext.args = "--kD_cutoff ${params.kd_cutoff}"`
	`34`	`+ }`
`31`	`35`	`}`