From d21a7207d61fb8ba5601a5ea361f3c4e72bf5f43 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Thu, 12 Jun 2025 18:08:21 -0400 Subject: [PATCH 01/45] Changes for metamorpheus #1 --- .github/workflows/benchmark.yml | 4 +- benchmark/benchmark_Metamorpheus.R | 76 ++++++++++++++++++++ benchmark/config.slurm | 2 +- benchmark/metamorpheus_Process.R | 41 +++++++++++ benchmark/scriptController.json | 15 ++++ metamorpheus_code.R | 112 +++++++++++++++++++++++++++++ 6 files changed, 247 insertions(+), 3 deletions(-) create mode 100644 benchmark/benchmark_Metamorpheus.R create mode 100644 benchmark/metamorpheus_Process.R create mode 100644 metamorpheus_code.R diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index f8d9dcf0..73a9ce73 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -3,8 +3,8 @@ name: Run Simple R Script on HPC via Slurm on: push: branches: - # - feature/multiple-scripts - - devel + - feature/metamorpheus-scripts + # - devel jobs: Benchmarking-pipeline: diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R new file mode 100644 index 00000000..356a0b6f --- /dev/null +++ b/benchmark/benchmark_Metamorpheus.R @@ -0,0 +1,76 @@ +library(MSstatsConvert) +library(MSstats) +library(tidyverse) + +source("metamorpheus_Process.R") + +config <- fromJSON("scriptController.json", simplifyVector = FALSE) + +dataset_config <- config$datasets[["DDA-Solivais2024-Metamorpheus_NoMBR_LFQ"]] +dataset_config <- as.list(dataset_config) + +cat("Processing Dataset:", dataset_config$name, "\n") +cat("Dataset File Path:", dataset_config$file, "\n") + +filePath = dataset_config$parent + "/" + dataset_config$data +annotPath = dataset_config$parent + "/" + +input = data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) +annot = data.table::fread(file.path(annotPath, "annotation.csv")) + +input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell +input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys + +protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) +protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) + +input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) + +output = MetamorpheusToMSstatsFormat(input, annot) + +data_process_tasks <- list( + list( + label = "Data process with Normalized Data", + result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20) + ), + list( + label = "Data process with Normalization and MBImpute False", + result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) + ), + list( + label = "Data process without Normalization", + result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) + ), + list( + label = "Data process without Normalization with MBImpute False", + result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) + ), + list( + label = "Data process without Normalization and Imputation On for all features", + result = function() dataProcess(msstats_format, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE) + ), + list( + label = "Data process without Normalization and Imputation On for top3 features", + result = function() dataProcess(msstats_format, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE) + ) +) + +start_time <- Sys.time() + +num_cores <- detectCores() - 1 + +summarized_results <- mclapply(data_process_tasks, function(task) { + list(label = task$label, summarized = task$result()) +}, mc.cores = num_cores) + + +results_list <- mclapply(summarized_results, function(res) { + calculate_Metrics(res$summarized, protein_mappings, res$label) +}, mc.cores = num_cores) + + +final_results <- do.call(rbind, results_list) +end_time <- Sys.time() +total_time <- end_time - start_time +print(final_results) +print(paste("Total Execution Time:", total_time)) \ No newline at end of file diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 28332e1a..054b7f17 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -27,7 +27,7 @@ remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_ remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \ install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')" -R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R") +R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R", "benchmark_Metamorpheus.R") for script in "${R_SCRIPTS[@]}"; do echo "Executing script: $script" >> job_output.txt diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R new file mode 100644 index 00000000..b1efae31 --- /dev/null +++ b/benchmark/metamorpheus_Process.R @@ -0,0 +1,41 @@ +calculate_Metrics(QuantData, Label, protein_mappings){ + + # dataProcessPlots(QuantData, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) + + comparison <- matrix(c(-1,0,0,0,1, # 3x + -1,0,0,1,0, # 2.5x + -1,0,1,0,0, # 2x + -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x + + row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") + groups = levels(QuantData$ProteinLevelData$GROUP) + colnames(comparison) <- groups[order(as.numeric(groups))] + model <- groupComparison(contrast.matrix=comparison, data=QuantData, + use_log_file = FALSE) + + + + ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") + model$ComparisonResult = model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) + + + e_group = model$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) + + ecoli = e_group %>% filter(ecoli == TRUE) + + # hist(ecoli$log2FC) + + ecoli = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) + human = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) + FDR = nrow(human) / (nrow(ecoli) + nrow(human)) + + cat(label, FDR, "\n") + + results <- data.frame( + Label = label, + FDR = fdr + ) + + return(results) + +} \ No newline at end of file diff --git a/benchmark/scriptController.json b/benchmark/scriptController.json index 9b6cbc06..d1a10a78 100644 --- a/benchmark/scriptController.json +++ b/benchmark/scriptController.json @@ -31,6 +31,21 @@ "type": "significant" } } + }, + "DDA-Solivais2024-Metamorpheus_NoMBR_LFQ": { + "name": "DDA-Solivais2024-Metamorpheus_NoMBR_LFQ", + "parent": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current", + "data":"FlashLFQ_NoNormalization_NoPIP", + "samples": { + "Human": { + "pattern": "_HUMAN$", + "type": "insignificant" + }, + "Ecoli": { + "pattern": "_ECOLI$", + "type": "significant" + } + } } } } \ No newline at end of file diff --git a/metamorpheus_code.R b/metamorpheus_code.R new file mode 100644 index 00000000..fa0c12c1 --- /dev/null +++ b/metamorpheus_code.R @@ -0,0 +1,112 @@ +library(MSstatsConvert) +library(MSstats) +library(tidyverse) + + +# No MBR +input_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedPeaks.tsv") + + + +annot_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") + + +input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell +input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys +protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedProteins.tsv") +protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) +input_no_mbr = input_no_mbr %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) + + +# input_no_mbr$`Protein Group` = ifelse( +# input_no_mbr$`Protein Group` %in% ecoli$`Protein Groups`, +# paste(input_no_mbr$`Protein Group`, "|ECOLI", sep = ""), +# paste(input_no_mbr$`Protein Group`, "|HUMAN", sep = "")) +# write.csv(input_no_mbr, "QuantifiedPeaks.csv", row.names = FALSE) + + +output_no_mbr = MetamorpheusToMSstatsFormat(input_no_mbr, annot_no_mbr) +QuantData_no_mbr = dataProcess(output_no_mbr, normalization = FALSE) + +dataProcessPlots(QuantData_no_mbr, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) + +comparison <- matrix(c(-1,0,0,0,1, # 3x + -1,0,0,1,0, # 2.5x + -1,0,1,0,0, # 2x + -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x + + +row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") +groups = levels(QuantData_no_mbr$ProteinLevelData$GROUP) +colnames(comparison) <- groups[order(as.numeric(groups))] +model_no_mbr <- groupComparison(contrast.matrix=comparison, data=QuantData_no_mbr, + use_log_file = FALSE) + +library(tidyverse) +ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") +model_no_mbr$ComparisonResult = model_no_mbr$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) + +e_group_no_mbr = model_no_mbr$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) +ecoli_no_mbr = e_group_no_mbr %>% filter(ecoli == TRUE) +hist(ecoli_no_mbr$log2FC) + +ecoli_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) +human_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) +FDR_no_mbr = nrow(human_no_mbr) / (nrow(ecoli_no_mbr) + nrow(human_no_mbr)) + +cat("FDR no MBR", FDR_no_mbr, "\n") + +# With MBR +library(MSstatsConvert) +library(MSstats) +library(tidyverse) +input = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedPeaks.tsv") +annot = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") + +input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell +input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys + +protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedProteins.tsv") +protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) +input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) + +# input$`Protein Group` = ifelse( +# input$`Protein Group` %in% ecoli$`Protein Groups`, +# paste(input$`Protein Group`, "|ECOLI", sep = ""), +# paste(input$`Protein Group`, "|HUMAN", sep = "")) +# write.csv(input, "QuantifiedPeaks-MBR.csv", row.names = FALSE) + + +output = MetamorpheusToMSstatsFormat(input, annot) +QuantData = dataProcess(output, normalization = FALSE) + +dataProcessPlots(QuantData, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) + +comparison <- matrix(c(-1,0,0,0,1, # 3x + -1,0,0,1,0, # 2.5x + -1,0,1,0,0, # 2x + -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x +row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") +groups = levels(QuantData$ProteinLevelData$GROUP) +colnames(comparison) <- groups[order(as.numeric(groups))] +model <- groupComparison(contrast.matrix=comparison, data=QuantData, + use_log_file = FALSE) + +library(tidyverse) +ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") +model$ComparisonResult = model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) + +e_group = model$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) +ecoli = e_group %>% filter(ecoli == TRUE) +hist(ecoli$log2FC) + +ecoli = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) +human = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) +FDR = nrow(human) / (nrow(ecoli) + nrow(human)) + + +cat("FDR MBR", FDR, "\n") +# FDR no MBR seems to be lower than that of FDR with MBR (except for E-A label), but it's not by a wide margin. +# When normalization was enabled, FDR spiked to 38% without MBR and 58% with MBR. +# When we set adj.pvalue to 0.01, FDR without MBR does better, but not by much. +# Less proteins detected as significant with MBR disabled. \ No newline at end of file From eaee20e470f0d2208baca05244477d11a0e72022 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Thu, 12 Jun 2025 20:31:54 -0400 Subject: [PATCH 02/45] Added changes for metamorpheus no mbr file --- benchmark/benchmark_Metamorpheus.R | 22 +++++++++++++--------- benchmark/metamorpheus_Process.R | 4 ++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 356a0b6f..41c27e59 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -1,6 +1,7 @@ library(MSstatsConvert) library(MSstats) library(tidyverse) +library(parallel) source("metamorpheus_Process.R") @@ -10,14 +11,17 @@ dataset_config <- config$datasets[["DDA-Solivais2024-Metamorpheus_NoMBR_LFQ"]] dataset_config <- as.list(dataset_config) cat("Processing Dataset:", dataset_config$name, "\n") -cat("Dataset File Path:", dataset_config$file, "\n") -filePath = dataset_config$parent + "/" + dataset_config$data -annotPath = dataset_config$parent + "/" +filePath <- file.path(dataset_config$parent, dataset_config$data) +annotPath <- dataset_config$parent input = data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) annot = data.table::fread(file.path(annotPath, "annotation.csv")) + +cat("Dataset File Path:", filePath, "\n") +cat("Dataset File Path:", annotPath, "\n") + input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys @@ -31,27 +35,27 @@ output = MetamorpheusToMSstatsFormat(input, annot) data_process_tasks <- list( list( label = "Data process with Normalized Data", - result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20) + result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20) ), list( label = "Data process with Normalization and MBImpute False", - result = function() dataProcess(msstats_format, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) + result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) ), list( label = "Data process without Normalization", - result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) + result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) ), list( label = "Data process without Normalization with MBImpute False", - result = function() dataProcess(msstats_format, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) + result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) ), list( label = "Data process without Normalization and Imputation On for all features", - result = function() dataProcess(msstats_format, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE) + result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE) ), list( label = "Data process without Normalization and Imputation On for top3 features", - result = function() dataProcess(msstats_format, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE) + result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE) ) ) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index b1efae31..9954d3eb 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -1,4 +1,4 @@ -calculate_Metrics(QuantData, Label, protein_mappings){ +calculate_Metrics <- function(QuantData, protein_mappings, label){ # dataProcessPlots(QuantData, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) @@ -33,7 +33,7 @@ calculate_Metrics(QuantData, Label, protein_mappings){ results <- data.frame( Label = label, - FDR = fdr + FDR = FDR ) return(results) From b09fae246ab9748b920f7bbf46a609660281494a Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 13 Jun 2025 15:28:12 -0400 Subject: [PATCH 03/45] Explorer Migration Changes #1 --- .github/workflows/benchmark.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 73a9ce73..c3c1c917 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -21,24 +21,24 @@ jobs: mkdir -p ~/.ssh touch ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa - echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa - ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1 + echo "${{ secrets.SSH_PRIVATE_KEY_EXPLORER }}" > ~/.ssh/id_rsa + ssh-keyscan -H login.explorer.northeastern.edu >> ~/.ssh/known_hosts || exit 1 - name: Transfer Files to HPC run: | - scp -O benchmark/benchmark_Dowell2021-HEqe408_LFQ.R benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R benchmark/scriptController.json benchmark/calculateMetrics.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1 + scp -O benchmark/benchmark_Dowell2021-HEqe408_LFQ.R benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R benchmark/scriptController.json benchmark/calculateMetrics.R benchmark/config.slurm raina.ans@login.explorer.northeastern.edu:/work/VitekLab/Projects/Benchmarking || exit 1 - name: Submit Slurm Job and Capture Job ID id: submit_job run: | - ssh raina.ans@login-00.discovery.neu.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt + ssh raina.ans@login.explorer.northeastern.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt) echo "Slurm Job ID is $slurm_job_id" echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV - name: Monitor Slurm Job run: | - ssh raina.ans@login-00.discovery.neu.edu " + ssh raina.ans@login.explorer.northeastern.edu " while squeue -j ${{ env.slurm_job_id }} | grep -q ${{ env.slurm_job_id }}; do echo 'Job Id : ${{ env.slurm_job_id }} is still running...' sleep 10 @@ -48,8 +48,8 @@ jobs: - name: Fetch Output run: | - scp -O raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt - scp -O raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt + scp -O raina.ans@login.explorer.northeastern.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt + scp -O raina.ans@login.explorer.northeastern.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt - name: Upload Output as Artifact uses: actions/upload-artifact@v4 From f5927d8a31399fc2189c4c25bc20a03e85d0198d Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 13 Jun 2025 15:35:01 -0400 Subject: [PATCH 04/45] Path correction --- .github/workflows/benchmark.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index c3c1c917..8e4e730a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -26,12 +26,12 @@ jobs: - name: Transfer Files to HPC run: | - scp -O benchmark/benchmark_Dowell2021-HEqe408_LFQ.R benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R benchmark/scriptController.json benchmark/calculateMetrics.R benchmark/config.slurm raina.ans@login.explorer.northeastern.edu:/work/VitekLab/Projects/Benchmarking || exit 1 + scp -O benchmark/benchmark_Dowell2021-HEqe408_LFQ.R benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R benchmark/scriptController.json benchmark/calculateMetrics.R benchmark/config.slurm raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking || exit 1 - name: Submit Slurm Job and Capture Job ID id: submit_job run: | - ssh raina.ans@login.explorer.northeastern.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt + ssh raina.ans@login.explorer.northeastern.edu "cd /projects/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt) echo "Slurm Job ID is $slurm_job_id" echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV @@ -48,8 +48,8 @@ jobs: - name: Fetch Output run: | - scp -O raina.ans@login.explorer.northeastern.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt - scp -O raina.ans@login.explorer.northeastern.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt + scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt + scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt - name: Upload Output as Artifact uses: actions/upload-artifact@v4 From 8bd37a9d733e7e925f37a5112bdcf0354727bd57 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 13 Jun 2025 16:14:00 -0400 Subject: [PATCH 05/45] Transfer all benchmark files - Correction --- .github/workflows/benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8e4e730a..eb958b1a 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -26,7 +26,7 @@ jobs: - name: Transfer Files to HPC run: | - scp -O benchmark/benchmark_Dowell2021-HEqe408_LFQ.R benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R benchmark/scriptController.json benchmark/calculateMetrics.R benchmark/config.slurm raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking || exit 1 + scp -r benchmark raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking || exit 1 - name: Submit Slurm Job and Capture Job ID id: submit_job From 71f62f46b826ab4ed498c984ee26e65a55a681d7 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 13 Jun 2025 17:46:23 -0400 Subject: [PATCH 06/45] Changes for benchmark folder --- .github/workflows/benchmark.yml | 6 +++--- benchmark/config.slurm | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index eb958b1a..47296677 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -31,7 +31,7 @@ jobs: - name: Submit Slurm Job and Capture Job ID id: submit_job run: | - ssh raina.ans@login.explorer.northeastern.edu "cd /projects/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt + ssh raina.ans@login.explorer.northeastern.edu "cd /projects/VitekLab/Projects/Benchmarking/benchmark && sbatch config.slurm" | tee slurm_job_id.txt slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt) echo "Slurm Job ID is $slurm_job_id" echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV @@ -48,8 +48,8 @@ jobs: - name: Fetch Output run: | - scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt - scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt + scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/benchmark/job_output.txt job_output.txt + scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/benchmark/job_error.txt job_error.txt - name: Upload Output as Artifact uses: actions/upload-artifact@v4 diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 054b7f17..6f1e7545 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --job-name=msstats_benchmark_job_updated -#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/ +#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/benchmark #SBATCH --output=job_output.txt #SBATCH --error=job_error.txt #SBATCH --open-mode=append From d2275bd5d5e363c63c3e55edc23a3e489cb13315 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 13 Jun 2025 20:08:46 -0400 Subject: [PATCH 07/45] Changes for path correction in slurm file --- benchmark/config.slurm | 2 +- benchmark/scriptController.json | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 6f1e7545..20cf2114 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -1,6 +1,6 @@ #!/bin/bash #SBATCH --job-name=msstats_benchmark_job_updated -#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/benchmark +#SBATCH --chdir=/projects/VitekLab/Projects/Benchmarking/benchmark #SBATCH --output=job_output.txt #SBATCH --error=job_error.txt #SBATCH --open-mode=append diff --git a/benchmark/scriptController.json b/benchmark/scriptController.json index d1a10a78..3881cf05 100644 --- a/benchmark/scriptController.json +++ b/benchmark/scriptController.json @@ -2,7 +2,7 @@ "datasets": { "DDA-Puyvelde2022-HYE5600735_LFQ": { "name": "DDA-Puyvelde2022-HYE5600735_LFQ", - "file": "/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats_fixed.csv", + "file": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats_fixed.csv", "samples": { "Human": { "pattern": "_HUMAN$", @@ -20,7 +20,7 @@ }, "DDA-Dowell2021-HEqe408_LFQ": { "name": "DDA-Dowell2021-HEqe408_LFQ", - "file": "/work/VitekLab/Data/MS/Benchmarking/DDA-Dowell2021-HEqe408_LFQ/FragPipe/TOP0/MSstats.csv", + "file": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Dowell2021-HEqe408_LFQ/FragPipe/TOP0/MSstats.csv", "samples": { "Human": { "pattern": "_HUMAN$", From 7300bcc7bd2595e88477ee8bf2d9273f550eeb9d Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Sat, 14 Jun 2025 13:12:09 -0400 Subject: [PATCH 08/45] Removed wrong character in script --- benchmark/config.slurm | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 20cf2114..6956f8c4 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -8,8 +8,7 @@ #SBATCH --ntasks=1 # Number of tasks (one process) #SBATCH --cpus-per-task=8 # Use 8 CPU cores for the task #SBATCH --mem=256G # Request 256GB of memory -#SBATCH --partition=short # Use the 'short' partition (or change as needed) - +#SBATCH --partition=short module load R-geospatial module load gcc/11.1.0 @@ -27,7 +26,7 @@ remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_ remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \ install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')" -R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R", "benchmark_Metamorpheus.R") +R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R") for script in "${R_SCRIPTS[@]}"; do echo "Executing script: $script" >> job_output.txt From 79e0a2d7407cc11165013ea2b4eca4c43d0ac219 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Mon, 16 Jun 2025 17:09:06 -0400 Subject: [PATCH 09/45] Changes for gcc added --- benchmark/config.slurm | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 6956f8c4..4a4877a0 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -9,13 +9,12 @@ #SBATCH --cpus-per-task=8 # Use 8 CPU cores for the task #SBATCH --mem=256G # Request 256GB of memory #SBATCH --partition=short -module load R-geospatial +module load R -module load gcc/11.1.0 -module load cmake/3.23.2 +module load cmake/3.30.2 export LC_ALL=C -export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial +export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 mkdir -p $R_LIBS_USER From 434c1c0150e69181cdbfb92d96fdb064ca5e623d Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 10:38:27 -0400 Subject: [PATCH 10/45] Changing back R-LIBS_User env in config --- benchmark/config.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 4a4877a0..fd6b010e 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -14,7 +14,7 @@ module load R module load cmake/3.30.2 export LC_ALL=C -export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 +export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial mkdir -p $R_LIBS_USER From 83958b8e1b05e0024a5f123270abae932659c304 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 14:29:34 -0400 Subject: [PATCH 11/45] changes done to fix env --- benchmark/benchmark_Metamorpheus.R | 3 ++- benchmark/config.slurm | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 41c27e59..0d224bcc 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -1,7 +1,8 @@ library(MSstatsConvert) library(MSstats) -library(tidyverse) +library(stringr) library(parallel) +library(jsonlite) source("metamorpheus_Process.R") diff --git a/benchmark/config.slurm b/benchmark/config.slurm index fd6b010e..4a4877a0 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -14,7 +14,7 @@ module load R module load cmake/3.30.2 export LC_ALL=C -export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial +export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 mkdir -p $R_LIBS_USER From bf4938d25dd5e8c2e422c976f8aa0089e7aec3fa Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 15:00:44 -0400 Subject: [PATCH 12/45] Changes for failing package --- benchmark/config.slurm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 4a4877a0..500aef0d 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -19,8 +19,7 @@ export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 mkdir -p $R_LIBS_USER -module load R -Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \ +Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages(c('remotes', 'nloptr'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \ remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); \ remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \ install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')" From 6b784f1c8cde5034c557b93121622f481bd502e2 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 15:41:41 -0400 Subject: [PATCH 13/45] Added changes for path --- benchmark/config.slurm | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 500aef0d..1a6cee9a 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -14,6 +14,7 @@ module load R module load cmake/3.30.2 export LC_ALL=C +export LD_LIBRARY_PATH=/shared/EL9/explorer/R/4.4.1/lib64/R/lib:$LD_LIBRARY_PATH export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 From 959d271d3dd6f77a7e4bb9f372340352db6a9e35 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 16:55:28 -0400 Subject: [PATCH 14/45] Changes added for nolptr --- benchmark/config.slurm | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 1a6cee9a..175c4cc5 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -20,10 +20,17 @@ export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 mkdir -p $R_LIBS_USER -Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages(c('remotes', 'nloptr'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \ -remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); \ -remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \ -install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org')" +Rscript -e " +if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); +for (pkg in c('dplyr', 'stringr', 'ggplot2')) { + if (!requireNamespace(pkg, quietly = TRUE)) { + install.packages(pkg, lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org') + } +} +remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); +remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); +" + R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R") From 138c802f2ffbe693db5809a03fc45904cefa5b48 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 17:46:15 -0400 Subject: [PATCH 15/45] Changes for Library path --- benchmark/config.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 175c4cc5..dc677982 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -14,8 +14,8 @@ module load R module load cmake/3.30.2 export LC_ALL=C -export LD_LIBRARY_PATH=/shared/EL9/explorer/R/4.4.1/lib64/R/lib:$LD_LIBRARY_PATH export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 +export LD_LIBRARY_PATH=/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH mkdir -p $R_LIBS_USER From 1980ad92658cf53623817a0de88511fcf7c6b6d1 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 20:00:37 -0400 Subject: [PATCH 16/45] Added changes for POC #1 --- benchmark/config.slurm | 50 ++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index dc677982..42d6da0e 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -4,40 +4,52 @@ #SBATCH --output=job_output.txt #SBATCH --error=job_error.txt #SBATCH --open-mode=append -#SBATCH --time=01:00:00 # Set the maximum run time -#SBATCH --ntasks=1 # Number of tasks (one process) -#SBATCH --cpus-per-task=8 # Use 8 CPU cores for the task -#SBATCH --mem=256G # Request 256GB of memory -#SBATCH --partition=short -module load R +#SBATCH --time=01:00:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=256G +#SBATCH --partition=short +module load R module load cmake/3.30.2 export LC_ALL=C export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 -export LD_LIBRARY_PATH=/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH mkdir -p $R_LIBS_USER -Rscript -e " -if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); -for (pkg in c('dplyr', 'stringr', 'ggplot2')) { - if (!requireNamespace(pkg, quietly = TRUE)) { - install.packages(pkg, lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org') - } +echo "Running ldd on preprocessCore.so" >> job_output.txt +ldd $(find ~/R -name preprocessCore.so | head -n 1) >> job_output.txt 2>> job_error.txt +echo "Running ldd on nloptr.so" >> job_output.txt +ldd $(find ~/R -name nloptr.so | head -n 1) >> job_output.txt 2>> job_error.txt + +Rscript -e ' +.libPaths("/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4") +Sys.unsetenv("R_LIBS") +cat("R version: ", R.version$version.string, "\n") +cat(".libPaths():\n"); print(.libPaths()) + +if (!requireNamespace("remotes", quietly = TRUE)) + install.packages("remotes", repos = "https://cloud.r-project.org") + +for (pkg in c("dplyr", "stringr", "ggplot2")) { + if (!requireNamespace(pkg, quietly = TRUE)) { + install.packages(pkg, repos = "https://cloud.r-project.org") + } } -remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); -remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); -" +remotes::install_github("Vitek-Lab/MSstats", ref = "devel") +remotes::install_github("Vitek-Lab/MSstatsConvert", ref = "master") +' R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R") for script in "${R_SCRIPTS[@]}"; do - echo "Executing script: $script" >> job_output.txt + echo "Executing script: $script" >> job_output.txt Rscript "$script" >> job_output.txt 2>> job_error.txt - wait - echo "Finished executing script: $script" >> job_output.txt + wait + echo "Finished executing script: $script" >> job_output.txt echo -e "\n\n" done \ No newline at end of file From 331ba24b3dc72b74ecad75a3d34e7689a88082b6 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Tue, 17 Jun 2025 23:24:38 -0400 Subject: [PATCH 17/45] Added symlink of error package in our directory --- benchmark/config.slurm | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 42d6da0e..f66bd5ab 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -16,7 +16,10 @@ module load cmake/3.30.2 export LC_ALL=C export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 -export LD_LIBRARY_PATH=/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH +mkdir -p $HOME/lib_fix +ln -sf /shared/EL9/explorer/R/4.4.1/lib64/R/lib/libRlapack.so $HOME/lib_fix/libRlapack.so + +export LD_LIBRARY_PATH=$HOME/lib_fix:/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH mkdir -p $R_LIBS_USER @@ -28,7 +31,6 @@ ldd $(find ~/R -name nloptr.so | head -n 1) >> job_output.txt 2>> job_error.txt Rscript -e ' .libPaths("/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4") Sys.unsetenv("R_LIBS") -cat("R version: ", R.version$version.string, "\n") cat(".libPaths():\n"); print(.libPaths()) if (!requireNamespace("remotes", quietly = TRUE)) From 5c497f47b57a92539eee57f847543766c3573a6f Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 13:15:49 -0400 Subject: [PATCH 18/45] Changes for slurm --- benchmark/config.slurm | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index f66bd5ab..2e7f78d6 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -15,13 +15,12 @@ module load cmake/3.30.2 export LC_ALL=C export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 +mkdir -p $R_LIBS_USER mkdir -p $HOME/lib_fix -ln -sf /shared/EL9/explorer/R/4.4.1/lib64/R/lib/libRlapack.so $HOME/lib_fix/libRlapack.so - -export LD_LIBRARY_PATH=$HOME/lib_fix:/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH +ln -sf /shared/EL9/explorer/R/4.4.1/lib64/R/lib/libRlapack.so $HOME/lib_fix/libRlapack.so.3 -mkdir -p $R_LIBS_USER +export LD_LIBRARY_PATH=$HOME/lib_fix:/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib64:$LD_LIBRARY_PATH echo "Running ldd on preprocessCore.so" >> job_output.txt ldd $(find ~/R -name preprocessCore.so | head -n 1) >> job_output.txt 2>> job_error.txt @@ -36,6 +35,8 @@ cat(".libPaths():\n"); print(.libPaths()) if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes", repos = "https://cloud.r-project.org") +install.packages("nloptr", type = "source", repos = "https://cloud.r-project.org") + for (pkg in c("dplyr", "stringr", "ggplot2")) { if (!requireNamespace(pkg, quietly = TRUE)) { install.packages(pkg, repos = "https://cloud.r-project.org") From d78a1e9b2a2875b233ea44e20d0e2c6575cf8bee Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 18:20:49 -0400 Subject: [PATCH 19/45] Changes for lesser RAM --- benchmark/config.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 2e7f78d6..9ed27a4c 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -7,7 +7,7 @@ #SBATCH --time=01:00:00 #SBATCH --ntasks=1 #SBATCH --cpus-per-task=8 -#SBATCH --mem=256G +#SBATCH --mem=128G #SBATCH --partition=short module load R From 4917efe59e8f2ccc291a59d2aec838b5b2661aa6 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 18:59:35 -0400 Subject: [PATCH 20/45] Changes for MSStats Convert added --- benchmark/config.slurm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 9ed27a4c..4b071c44 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -42,9 +42,8 @@ for (pkg in c("dplyr", "stringr", "ggplot2")) { install.packages(pkg, repos = "https://cloud.r-project.org") } } - +BiocManager::install(c("MSstatsConvert", "preprocessCore"), force = TRUE) remotes::install_github("Vitek-Lab/MSstats", ref = "devel") -remotes::install_github("Vitek-Lab/MSstatsConvert", ref = "master") ' R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R") From 90ca387c43d231e6a21b73990798880910f30187 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 19:50:25 -0400 Subject: [PATCH 21/45] Changes for MSStats --- benchmark/config.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 4b071c44..8eab27e7 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -43,7 +43,7 @@ for (pkg in c("dplyr", "stringr", "ggplot2")) { } } BiocManager::install(c("MSstatsConvert", "preprocessCore"), force = TRUE) -remotes::install_github("Vitek-Lab/MSstats", ref = "devel") +remotes::install_github("Vitek-Lab/MSstats", ref = "devel", force = TRUE) ' R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R") From 09cacfae1b9092e15c3b6d152649ca811bb624aa Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 21:03:21 -0400 Subject: [PATCH 22/45] Changes for Script with fix --- benchmark/config.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 8eab27e7..fce5d7ef 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -50,7 +50,7 @@ R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE56007 for script in "${R_SCRIPTS[@]}"; do echo "Executing script: $script" >> job_output.txt - Rscript "$script" >> job_output.txt 2>> job_error.txt + stdbuf -oL -eL Rscript "$script" >> job_output.txt 2>> job_error.txt wait echo "Finished executing script: $script" >> job_output.txt echo -e "\n\n" From e0472b9f30a13c55e1a0f999adf929d7ad684e51 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 21:21:52 -0400 Subject: [PATCH 23/45] Changes to debug output --- benchmark/benchmark_Metamorpheus.R | 34 +++++++++++++++++++++++++----- benchmark/config.slurm | 1 - 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 0d224bcc..9d01d53d 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -68,13 +68,37 @@ summarized_results <- mclapply(data_process_tasks, function(task) { list(label = task$label, summarized = task$result()) }, mc.cores = num_cores) +cat("Summarized Results:\n") +print(str(summarized_results)) +flush.console() + + +results_list <- lapply(summarized_results, function(res) { + cat("Processing result for:", res$label, "\n") + flush.console() + out <- tryCatch({ + calculate_Metrics(res$summarized, protein_mappings, res$label) + }, error = function(e) { + message("Error in calculate_Metrics for ", res$label, ": ", e$message) + NULL + }) + print(str(out)) + flush.console() + out +}) + +cat("Results List structure:\n") +print(str(results_list)) +flush.console() + +final_results <- tryCatch({ + do.call(rbind, results_list) +}, error = function(e) { + message("Error during rbind: ", e$message) + NULL +}) -results_list <- mclapply(summarized_results, function(res) { - calculate_Metrics(res$summarized, protein_mappings, res$label) -}, mc.cores = num_cores) - -final_results <- do.call(rbind, results_list) end_time <- Sys.time() total_time <- end_time - start_time print(final_results) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index fce5d7ef..92d8ad03 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -3,7 +3,6 @@ #SBATCH --chdir=/projects/VitekLab/Projects/Benchmarking/benchmark #SBATCH --output=job_output.txt #SBATCH --error=job_error.txt -#SBATCH --open-mode=append #SBATCH --time=01:00:00 #SBATCH --ntasks=1 #SBATCH --cpus-per-task=8 From 2d8e3b3ad4ba3077b107655f3dfee33e053ea91d Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 21:32:51 -0400 Subject: [PATCH 24/45] Change for Script order --- benchmark/config.slurm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 92d8ad03..a4f7d4a9 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -45,7 +45,7 @@ BiocManager::install(c("MSstatsConvert", "preprocessCore"), force = TRUE) remotes::install_github("Vitek-Lab/MSstats", ref = "devel", force = TRUE) ' -R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R") +R_SCRIPTS=("benchmark_Metamorpheus.R" "benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R") for script in "${R_SCRIPTS[@]}"; do echo "Executing script: $script" >> job_output.txt From 73e6bcc582f7500454b236662761a7c20a874268 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 22:00:29 -0400 Subject: [PATCH 25/45] Changes to see file print --- benchmark/benchmark_Metamorpheus.R | 114 ++++++++++++++++------------- benchmark/config.slurm | 7 +- 2 files changed, 63 insertions(+), 58 deletions(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 9d01d53d..54005d5a 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -6,89 +6,97 @@ library(jsonlite) source("metamorpheus_Process.R") -config <- fromJSON("scriptController.json", simplifyVector = FALSE) +options(echo = TRUE) +config <- fromJSON("scriptController.json", simplifyVector = FALSE) dataset_config <- config$datasets[["DDA-Solivais2024-Metamorpheus_NoMBR_LFQ"]] dataset_config <- as.list(dataset_config) cat("Processing Dataset:", dataset_config$name, "\n") - filePath <- file.path(dataset_config$parent, dataset_config$data) annotPath <- dataset_config$parent +cat("File Path:", filePath, "\n") +cat("Annotation Path:", annotPath, "\n") +flush.console() + +input <- data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) +cat("Input loaded: ", nrow(input), " rows\n") +flush.console() -input = data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) -annot = data.table::fread(file.path(annotPath, "annotation.csv")) +annot <- data.table::fread(file.path(annotPath, "annotation.csv")) +cat("Annotation loaded: ", nrow(annot), " rows\n") +flush.console() +# Filters +input <- input %>% filter(!str_detect(`Protein Group`, ";")) +cat("Post semicolon filter: ", nrow(input), " rows\n") +input <- input %>% filter(!str_detect(`Protein Group`, "DECOY")) +cat("Post DECOY filter: ", nrow(input), " rows\n") +flush.console() -cat("Dataset File Path:", filePath, "\n") -cat("Dataset File Path:", annotPath, "\n") +protein_mappings <- data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) +cat("Protein mappings loaded: ", nrow(protein_mappings), " rows\n") +protein_mappings <- protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) +cat("Post organism filter: ", nrow(protein_mappings), " rows\n") +flush.console() -input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell -input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys +input <- input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) +cat("Post protein group mapping filter: ", nrow(input), " rows\n") +flush.console() -protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) -protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) +output <- tryCatch({ + MetamorpheusToMSstatsFormat(input, annot) +}, error = function(e) { + message("Error in MetamorpheusToMSstatsFormat: ", e$message) + NULL +}) +cat("MSstats format complete\n") +flush.console() -input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) +if (is.null(output)) stop("Output is NULL, aborting") -output = MetamorpheusToMSstatsFormat(input, annot) +# DEBUG: check structure +cat("Structure of MSstats formatted output:\n") +print(str(output)) +flush.console() data_process_tasks <- list( - list( - label = "Data process with Normalized Data", - result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20) - ), - list( - label = "Data process with Normalization and MBImpute False", - result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) - ), - list( - label = "Data process without Normalization", - result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) - ), - list( - label = "Data process without Normalization with MBImpute False", - result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) - ), - list( - label = "Data process without Normalization and Imputation On for all features", - result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE) - ), - list( - label = "Data process without Normalization and Imputation On for top3 features", - result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE) - ) + list(label = "Normalized", result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20)), + list(label = "Norm + MBimpute=FALSE", result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE)), + list(label = "No Norm", result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20)), + list(label = "No Norm + MBimpute=FALSE", result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE)), + list(label = "No Norm + Impute all", result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE)), + list(label = "No Norm + Impute top3", result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE)) ) - -start_time <- Sys.time() +start_time <- Sys.time() num_cores <- detectCores() - 1 -summarized_results <- mclapply(data_process_tasks, function(task) { +# Debug: Use lapply instead of mclapply +summarized_results <- lapply(data_process_tasks, function(task) { + cat("Running task:", task$label, "\n") + flush.console() list(label = task$label, summarized = task$result()) -}, mc.cores = num_cores) +}) -cat("Summarized Results:\n") -print(str(summarized_results)) +cat("Completed summarization tasks\n") flush.console() - results_list <- lapply(summarized_results, function(res) { cat("Processing result for:", res$label, "\n") flush.console() - out <- tryCatch({ - calculate_Metrics(res$summarized, protein_mappings, res$label) + tryCatch({ + out <- calculate_Metrics(res$summarized, protein_mappings, res$label) + print(str(out)) + flush.console() + out }, error = function(e) { message("Error in calculate_Metrics for ", res$label, ": ", e$message) NULL }) - print(str(out)) - flush.console() - out }) -cat("Results List structure:\n") -print(str(results_list)) +cat("All metrics calculated\n") flush.console() final_results <- tryCatch({ @@ -98,8 +106,10 @@ final_results <- tryCatch({ NULL }) - end_time <- Sys.time() total_time <- end_time - start_time + +cat("Final Results:\n") print(final_results) -print(paste("Total Execution Time:", total_time)) \ No newline at end of file +print(paste("Total Execution Time:", total_time)) +flush.console() diff --git a/benchmark/config.slurm b/benchmark/config.slurm index a4f7d4a9..5b4d610d 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -21,11 +21,6 @@ ln -sf /shared/EL9/explorer/R/4.4.1/lib64/R/lib/libRlapack.so $HOME/lib_fix/libR export LD_LIBRARY_PATH=$HOME/lib_fix:/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib64:$LD_LIBRARY_PATH -echo "Running ldd on preprocessCore.so" >> job_output.txt -ldd $(find ~/R -name preprocessCore.so | head -n 1) >> job_output.txt 2>> job_error.txt -echo "Running ldd on nloptr.so" >> job_output.txt -ldd $(find ~/R -name nloptr.so | head -n 1) >> job_output.txt 2>> job_error.txt - Rscript -e ' .libPaths("/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4") Sys.unsetenv("R_LIBS") @@ -45,7 +40,7 @@ BiocManager::install(c("MSstatsConvert", "preprocessCore"), force = TRUE) remotes::install_github("Vitek-Lab/MSstats", ref = "devel", force = TRUE) ' -R_SCRIPTS=("benchmark_Metamorpheus.R" "benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R") +R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R" ) for script in "${R_SCRIPTS[@]}"; do echo "Executing script: $script" >> job_output.txt From 244d42f09622060484120d2ec1c2a86148907089 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Wed, 18 Jun 2025 23:43:34 -0400 Subject: [PATCH 26/45] Corrections added for metamorpheus script file --- benchmark/benchmark_Metamorpheus.R | 147 +++++++++++------------------ 1 file changed, 56 insertions(+), 91 deletions(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 54005d5a..41c27e59 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -1,115 +1,80 @@ library(MSstatsConvert) library(MSstats) -library(stringr) +library(tidyverse) library(parallel) -library(jsonlite) source("metamorpheus_Process.R") -options(echo = TRUE) - config <- fromJSON("scriptController.json", simplifyVector = FALSE) + dataset_config <- config$datasets[["DDA-Solivais2024-Metamorpheus_NoMBR_LFQ"]] dataset_config <- as.list(dataset_config) cat("Processing Dataset:", dataset_config$name, "\n") + filePath <- file.path(dataset_config$parent, dataset_config$data) annotPath <- dataset_config$parent -cat("File Path:", filePath, "\n") -cat("Annotation Path:", annotPath, "\n") -flush.console() - -input <- data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) -cat("Input loaded: ", nrow(input), " rows\n") -flush.console() - -annot <- data.table::fread(file.path(annotPath, "annotation.csv")) -cat("Annotation loaded: ", nrow(annot), " rows\n") -flush.console() - -# Filters -input <- input %>% filter(!str_detect(`Protein Group`, ";")) -cat("Post semicolon filter: ", nrow(input), " rows\n") -input <- input %>% filter(!str_detect(`Protein Group`, "DECOY")) -cat("Post DECOY filter: ", nrow(input), " rows\n") -flush.console() - -protein_mappings <- data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) -cat("Protein mappings loaded: ", nrow(protein_mappings), " rows\n") -protein_mappings <- protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) -cat("Post organism filter: ", nrow(protein_mappings), " rows\n") -flush.console() - -input <- input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) -cat("Post protein group mapping filter: ", nrow(input), " rows\n") -flush.console() - -output <- tryCatch({ - MetamorpheusToMSstatsFormat(input, annot) -}, error = function(e) { - message("Error in MetamorpheusToMSstatsFormat: ", e$message) - NULL -}) -cat("MSstats format complete\n") -flush.console() - -if (is.null(output)) stop("Output is NULL, aborting") - -# DEBUG: check structure -cat("Structure of MSstats formatted output:\n") -print(str(output)) -flush.console() + +input = data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) +annot = data.table::fread(file.path(annotPath, "annotation.csv")) + + +cat("Dataset File Path:", filePath, "\n") +cat("Dataset File Path:", annotPath, "\n") + +input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell +input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys + +protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) +protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) + +input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) + +output = MetamorpheusToMSstatsFormat(input, annot) data_process_tasks <- list( - list(label = "Normalized", result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20)), - list(label = "Norm + MBimpute=FALSE", result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE)), - list(label = "No Norm", result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20)), - list(label = "No Norm + MBimpute=FALSE", result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE)), - list(label = "No Norm + Impute all", result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE)), - list(label = "No Norm + Impute top3", result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE)) + list( + label = "Data process with Normalized Data", + result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20) + ), + list( + label = "Data process with Normalization and MBImpute False", + result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) + ), + list( + label = "Data process without Normalization", + result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) + ), + list( + label = "Data process without Normalization with MBImpute False", + result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) + ), + list( + label = "Data process without Normalization and Imputation On for all features", + result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE) + ), + list( + label = "Data process without Normalization and Imputation On for top3 features", + result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE) + ) ) - + start_time <- Sys.time() + num_cores <- detectCores() - 1 -# Debug: Use lapply instead of mclapply -summarized_results <- lapply(data_process_tasks, function(task) { - cat("Running task:", task$label, "\n") - flush.console() +summarized_results <- mclapply(data_process_tasks, function(task) { list(label = task$label, summarized = task$result()) -}) - -cat("Completed summarization tasks\n") -flush.console() - -results_list <- lapply(summarized_results, function(res) { - cat("Processing result for:", res$label, "\n") - flush.console() - tryCatch({ - out <- calculate_Metrics(res$summarized, protein_mappings, res$label) - print(str(out)) - flush.console() - out - }, error = function(e) { - message("Error in calculate_Metrics for ", res$label, ": ", e$message) - NULL - }) -}) - -cat("All metrics calculated\n") -flush.console() - -final_results <- tryCatch({ - do.call(rbind, results_list) -}, error = function(e) { - message("Error during rbind: ", e$message) - NULL -}) +}, mc.cores = num_cores) + +results_list <- mclapply(summarized_results, function(res) { + calculate_Metrics(res$summarized, protein_mappings, res$label) +}, mc.cores = num_cores) + + +final_results <- do.call(rbind, results_list) end_time <- Sys.time() total_time <- end_time - start_time - -cat("Final Results:\n") print(final_results) -print(paste("Total Execution Time:", total_time)) -flush.console() +print(paste("Total Execution Time:", total_time)) \ No newline at end of file From 462925f9d136129d2d8bfb667fbfc077b5bfdf8a Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Thu, 19 Jun 2025 00:43:21 -0400 Subject: [PATCH 27/45] Rerun metamorpheus benchmark --- benchmark/benchmark_Metamorpheus.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 41c27e59..da5ceb5b 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -1,7 +1,8 @@ library(MSstatsConvert) library(MSstats) -library(tidyverse) library(parallel) +library(stringr) +library(jsonlite) source("metamorpheus_Process.R") From 1b962cf23b3b4c6c17b1a55826a4423312961df1 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Thu, 19 Jun 2025 00:58:29 -0400 Subject: [PATCH 28/45] Changes for library --- benchmark/benchmark_Metamorpheus.R | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index da5ceb5b..e57b3b7d 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -3,6 +3,7 @@ library(MSstats) library(parallel) library(stringr) library(jsonlite) +library(dplyr) source("metamorpheus_Process.R") From f0ec62156bc92ed64371fa54774567ebeb7e2eab Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Thu, 19 Jun 2025 14:51:39 -0400 Subject: [PATCH 29/45] Changes for MBR --- benchmark/benchmark_Metamorpheus.R | 138 +++++++++++++++-------------- benchmark/scriptController.json | 15 ++++ 2 files changed, 88 insertions(+), 65 deletions(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index e57b3b7d..33bb366d 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -6,77 +6,85 @@ library(jsonlite) library(dplyr) source("metamorpheus_Process.R") - config <- fromJSON("scriptController.json", simplifyVector = FALSE) -dataset_config <- config$datasets[["DDA-Solivais2024-Metamorpheus_NoMBR_LFQ"]] -dataset_config <- as.list(dataset_config) - -cat("Processing Dataset:", dataset_config$name, "\n") - -filePath <- file.path(dataset_config$parent, dataset_config$data) -annotPath <- dataset_config$parent - -input = data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) -annot = data.table::fread(file.path(annotPath, "annotation.csv")) - - -cat("Dataset File Path:", filePath, "\n") -cat("Dataset File Path:", annotPath, "\n") - -input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell -input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys - -protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) -protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) - -input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) - -output = MetamorpheusToMSstatsFormat(input, annot) - -data_process_tasks <- list( - list( - label = "Data process with Normalized Data", - result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20) - ), - list( - label = "Data process with Normalization and MBImpute False", - result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) - ), - list( - label = "Data process without Normalization", - result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) - ), - list( - label = "Data process without Normalization with MBImpute False", - result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) - ), - list( - label = "Data process without Normalization and Imputation On for all features", - result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE) - ), - list( - label = "Data process without Normalization and Imputation On for top3 features", - result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE) +runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { + + dataset_config <- config$datasets[[datasetPath]] + dataset_config <- as.list(dataset_config) + + cat("Processing Dataset:", dataset_config$name, "\n") + + filePath <- file.path(dataset_config$parent, dataset_config$data) + annotPath <- dataset_config$parent + + input = data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv")) + annot = data.table::fread(file.path(annotPath, "annotation.csv")) + + + cat("Dataset File Path:", filePath, "\n") + cat("Annotation File Path:", annotPath, "\n") + + input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell + input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys + + protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) + protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) + + input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) + + output = MetamorpheusToMSstatsFormat(input, annot) + + data_process_tasks <- list( + list( + label = "Data process with Normalized Data", + result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20) + ), + list( + label = "Data process with Normalization and MBImpute False", + result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE) + ), + list( + label = "Data process without Normalization", + result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20) + ), + list( + label = "Data process without Normalization with MBImpute False", + result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE) + ), + list( + label = "Data process without Normalization and Imputation On for all features", + result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE) + ), + list( + label = "Data process without Normalization and Imputation On for top3 features", + result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE) + ) ) -) - -start_time <- Sys.time() + + start_time <- Sys.time() + + num_cores <- detectCores() - 1 + + summarized_results <- mclapply(data_process_tasks, function(task) { + list(label = task$label, summarized = task$result()) + }, mc.cores = num_cores) + + + results_list <- mclapply(summarized_results, function(res) { + calculate_Metrics(res$summarized, protein_mappings, res$label) + }, mc.cores = num_cores) -num_cores <- detectCores() - 1 -summarized_results <- mclapply(data_process_tasks, function(task) { - list(label = task$label, summarized = task$result()) -}, mc.cores = num_cores) + final_results <- do.call(rbind, results_list) + end_time <- Sys.time() + total_time <- end_time - start_time + print(final_results) + print(paste("Total Execution Time:", total_time)) +} -results_list <- mclapply(summarized_results, function(res) { - calculate_Metrics(res$summarized, protein_mappings, res$label) -}, mc.cores = num_cores) -final_results <- do.call(rbind, results_list) -end_time <- Sys.time() -total_time <- end_time - start_time -print(final_results) -print(paste("Total Execution Time:", total_time)) \ No newline at end of file +runBenchmarkForMetaMorpheusData("DDA-Solivais2024-Metamorpheus_MBR_LFQ", config) +runBenchmarkForMetaMorpheusData("DDA-Solivais2024-Metamorpheus_NoMBR_LFQ", config) \ No newline at end of file diff --git a/benchmark/scriptController.json b/benchmark/scriptController.json index 3881cf05..449c667a 100644 --- a/benchmark/scriptController.json +++ b/benchmark/scriptController.json @@ -46,6 +46,21 @@ "type": "significant" } } + }, + "DDA-Solivais2024-Metamorpheus_MBR_LFQ": { + "name": "DDA-Solivais2024-Metamorpheus_MBR_LFQ", + "parent": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current", + "data":"FlashLFQ_v1.0_NoNormalization_wPIP", + "samples": { + "Human": { + "pattern": "_HUMAN$", + "type": "insignificant" + }, + "Ecoli": { + "pattern": "_ECOLI$", + "type": "significant" + } + } } } } \ No newline at end of file From ec9fe42548eeae08f00dafbcd2cf44664d58e4cf Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 13:15:14 -0400 Subject: [PATCH 30/45] Changes for calculate metrics --- benchmark/metamorpheus_Process.R | 84 +++++++++++++++++--------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index 9954d3eb..cd6c4fba 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -1,41 +1,47 @@ -calculate_Metrics <- function(QuantData, protein_mappings, label){ - - # dataProcessPlots(QuantData, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) - - comparison <- matrix(c(-1,0,0,0,1, # 3x - -1,0,0,1,0, # 2.5x - -1,0,1,0,0, # 2x - -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x - - row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") - groups = levels(QuantData$ProteinLevelData$GROUP) - colnames(comparison) <- groups[order(as.numeric(groups))] - model <- groupComparison(contrast.matrix=comparison, data=QuantData, - use_log_file = FALSE) - - - - ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") - model$ComparisonResult = model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) - - - e_group = model$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) - - ecoli = e_group %>% filter(ecoli == TRUE) - - # hist(ecoli$log2FC) - - ecoli = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) - human = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) - FDR = nrow(human) / (nrow(ecoli) + nrow(human)) - - cat(label, FDR, "\n") - - results <- data.frame( - Label = label, - FDR = FDR +calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0.05) { + comparison <- matrix( + c(-1,0,0,0,1, # E-A + -1,0,0,1,0, # D-A + -1,0,1,0,0, # C-A + -1,1,0,0,0), # B-A + nrow = 4, byrow = TRUE + ) + rownames(comparison) <- c("E-A", "D-A", "C-A", "B-A") + groups <- levels(QuantData$ProteinLevelData$GROUP) + colnames(comparison) <- groups[order(as.numeric(groups))] + + model <- groupComparison( + contrast.matrix = comparison, + data = QuantData, + use_log_file = FALSE ) - - return(results) -} \ No newline at end of file + ecoli_ids <- protein_mappings %>% + filter(Organism == "Escherichia coli (strain K12)") %>% + pull(`Protein Groups`) + + comp <- model$ComparisonResult %>% + mutate(ecoli = Protein %in% ecoli_ids) %>% + filter(is.na(issue)) + + labels <- unique(comp$Label) + result_rows <- lapply(labels, function(lbl) { + df <- comp %>% filter(Label == lbl) + sig <- df %>% filter(adj.pvalue < alpha) + + tp <- sig %>% filter(ecoli) %>% nrow() + fp <- sig %>% filter(!ecoli) %>% nrow() + tot <- tp + fp + fdr <- if (tot > 0) fp / tot else NA_real_ + + data.frame( + Task = task_label, + Comparison = lbl, + FDR = fdr, + stringsAsFactors = FALSE + ) + }) + + results <- do.call(rbind, result_rows) + return(results) +} From c7e7b60728d6d638cd9e0d338184dad7c6ac076d Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 14:07:53 -0400 Subject: [PATCH 31/45] All PR comments resolved --- benchmark/benchmark_Metamorpheus.R | 4 +- benchmark/metamorpheus_Process.R | 6 +- metamorpheus_code.R | 112 ----------------------------- 3 files changed, 6 insertions(+), 116 deletions(-) delete mode 100644 metamorpheus_code.R diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 33bb366d..4003d207 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -29,7 +29,9 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) - protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) + valid_organisms <- unique(input$Organism) + + protein_mappings = protein_mappings %>% filter(Organism %in% valid_organisms) input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index cd6c4fba..dcf7fce1 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -20,13 +20,13 @@ calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0 filter(Organism == "Escherichia coli (strain K12)") %>% pull(`Protein Groups`) - comp <- model$ComparisonResult %>% + filtered_results <- model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_ids) %>% filter(is.na(issue)) - labels <- unique(comp$Label) + labels <- unique(filtered_results$Label) result_rows <- lapply(labels, function(lbl) { - df <- comp %>% filter(Label == lbl) + df <- filtered_results %>% filter(Label == lbl) sig <- df %>% filter(adj.pvalue < alpha) tp <- sig %>% filter(ecoli) %>% nrow() diff --git a/metamorpheus_code.R b/metamorpheus_code.R deleted file mode 100644 index fa0c12c1..00000000 --- a/metamorpheus_code.R +++ /dev/null @@ -1,112 +0,0 @@ -library(MSstatsConvert) -library(MSstats) -library(tidyverse) - - -# No MBR -input_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedPeaks.tsv") - - - -annot_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") - - -input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell -input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys -protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedProteins.tsv") -protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) -input_no_mbr = input_no_mbr %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) - - -# input_no_mbr$`Protein Group` = ifelse( -# input_no_mbr$`Protein Group` %in% ecoli$`Protein Groups`, -# paste(input_no_mbr$`Protein Group`, "|ECOLI", sep = ""), -# paste(input_no_mbr$`Protein Group`, "|HUMAN", sep = "")) -# write.csv(input_no_mbr, "QuantifiedPeaks.csv", row.names = FALSE) - - -output_no_mbr = MetamorpheusToMSstatsFormat(input_no_mbr, annot_no_mbr) -QuantData_no_mbr = dataProcess(output_no_mbr, normalization = FALSE) - -dataProcessPlots(QuantData_no_mbr, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) - -comparison <- matrix(c(-1,0,0,0,1, # 3x - -1,0,0,1,0, # 2.5x - -1,0,1,0,0, # 2x - -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x - - -row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") -groups = levels(QuantData_no_mbr$ProteinLevelData$GROUP) -colnames(comparison) <- groups[order(as.numeric(groups))] -model_no_mbr <- groupComparison(contrast.matrix=comparison, data=QuantData_no_mbr, - use_log_file = FALSE) - -library(tidyverse) -ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") -model_no_mbr$ComparisonResult = model_no_mbr$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) - -e_group_no_mbr = model_no_mbr$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) -ecoli_no_mbr = e_group_no_mbr %>% filter(ecoli == TRUE) -hist(ecoli_no_mbr$log2FC) - -ecoli_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) -human_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) -FDR_no_mbr = nrow(human_no_mbr) / (nrow(ecoli_no_mbr) + nrow(human_no_mbr)) - -cat("FDR no MBR", FDR_no_mbr, "\n") - -# With MBR -library(MSstatsConvert) -library(MSstats) -library(tidyverse) -input = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedPeaks.tsv") -annot = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") - -input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell -input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys - -protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedProteins.tsv") -protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) -input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) - -# input$`Protein Group` = ifelse( -# input$`Protein Group` %in% ecoli$`Protein Groups`, -# paste(input$`Protein Group`, "|ECOLI", sep = ""), -# paste(input$`Protein Group`, "|HUMAN", sep = "")) -# write.csv(input, "QuantifiedPeaks-MBR.csv", row.names = FALSE) - - -output = MetamorpheusToMSstatsFormat(input, annot) -QuantData = dataProcess(output, normalization = FALSE) - -dataProcessPlots(QuantData, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) - -comparison <- matrix(c(-1,0,0,0,1, # 3x - -1,0,0,1,0, # 2.5x - -1,0,1,0,0, # 2x - -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x -row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") -groups = levels(QuantData$ProteinLevelData$GROUP) -colnames(comparison) <- groups[order(as.numeric(groups))] -model <- groupComparison(contrast.matrix=comparison, data=QuantData, - use_log_file = FALSE) - -library(tidyverse) -ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") -model$ComparisonResult = model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) - -e_group = model$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) -ecoli = e_group %>% filter(ecoli == TRUE) -hist(ecoli$log2FC) - -ecoli = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) -human = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) -FDR = nrow(human) / (nrow(ecoli) + nrow(human)) - - -cat("FDR MBR", FDR, "\n") -# FDR no MBR seems to be lower than that of FDR with MBR (except for E-A label), but it's not by a wide margin. -# When normalization was enabled, FDR spiked to 38% without MBR and 58% with MBR. -# When we set adj.pvalue to 0.01, FDR without MBR does better, but not by much. -# Less proteins detected as significant with MBR disabled. \ No newline at end of file From d8bcb034d7d4e96fbd8825fd17ee5a16973003e2 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 14:41:24 -0400 Subject: [PATCH 32/45] Fix Bug : Unique comparisons not visible --- benchmark/metamorpheus_Process.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index dcf7fce1..e167dbe2 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -24,7 +24,7 @@ calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0 mutate(ecoli = Protein %in% ecoli_ids) %>% filter(is.na(issue)) - labels <- unique(filtered_results$Label) + labels <- rownames(comparison) result_rows <- lapply(labels, function(lbl) { df <- filtered_results %>% filter(Label == lbl) sig <- df %>% filter(adj.pvalue < alpha) From ba65d2d8e5c32c743d0083078a5672a9c1abed0c Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 15:18:44 -0400 Subject: [PATCH 33/45] Changes reverted --- benchmark/benchmark_Metamorpheus.R | 4 +- benchmark/metamorpheus_Process.R | 6 +- metamorpheus_code.R | 112 +++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 6 deletions(-) create mode 100644 metamorpheus_code.R diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 4003d207..33bb366d 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -29,9 +29,7 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) - valid_organisms <- unique(input$Organism) - - protein_mappings = protein_mappings %>% filter(Organism %in% valid_organisms) + protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index e167dbe2..cd6c4fba 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -20,13 +20,13 @@ calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0 filter(Organism == "Escherichia coli (strain K12)") %>% pull(`Protein Groups`) - filtered_results <- model$ComparisonResult %>% + comp <- model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_ids) %>% filter(is.na(issue)) - labels <- rownames(comparison) + labels <- unique(comp$Label) result_rows <- lapply(labels, function(lbl) { - df <- filtered_results %>% filter(Label == lbl) + df <- comp %>% filter(Label == lbl) sig <- df %>% filter(adj.pvalue < alpha) tp <- sig %>% filter(ecoli) %>% nrow() diff --git a/metamorpheus_code.R b/metamorpheus_code.R new file mode 100644 index 00000000..fa0c12c1 --- /dev/null +++ b/metamorpheus_code.R @@ -0,0 +1,112 @@ +library(MSstatsConvert) +library(MSstats) +library(tidyverse) + + +# No MBR +input_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedPeaks.tsv") + + + +annot_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") + + +input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell +input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys +protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedProteins.tsv") +protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) +input_no_mbr = input_no_mbr %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) + + +# input_no_mbr$`Protein Group` = ifelse( +# input_no_mbr$`Protein Group` %in% ecoli$`Protein Groups`, +# paste(input_no_mbr$`Protein Group`, "|ECOLI", sep = ""), +# paste(input_no_mbr$`Protein Group`, "|HUMAN", sep = "")) +# write.csv(input_no_mbr, "QuantifiedPeaks.csv", row.names = FALSE) + + +output_no_mbr = MetamorpheusToMSstatsFormat(input_no_mbr, annot_no_mbr) +QuantData_no_mbr = dataProcess(output_no_mbr, normalization = FALSE) + +dataProcessPlots(QuantData_no_mbr, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) + +comparison <- matrix(c(-1,0,0,0,1, # 3x + -1,0,0,1,0, # 2.5x + -1,0,1,0,0, # 2x + -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x + + +row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") +groups = levels(QuantData_no_mbr$ProteinLevelData$GROUP) +colnames(comparison) <- groups[order(as.numeric(groups))] +model_no_mbr <- groupComparison(contrast.matrix=comparison, data=QuantData_no_mbr, + use_log_file = FALSE) + +library(tidyverse) +ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") +model_no_mbr$ComparisonResult = model_no_mbr$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) + +e_group_no_mbr = model_no_mbr$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) +ecoli_no_mbr = e_group_no_mbr %>% filter(ecoli == TRUE) +hist(ecoli_no_mbr$log2FC) + +ecoli_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) +human_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) +FDR_no_mbr = nrow(human_no_mbr) / (nrow(ecoli_no_mbr) + nrow(human_no_mbr)) + +cat("FDR no MBR", FDR_no_mbr, "\n") + +# With MBR +library(MSstatsConvert) +library(MSstats) +library(tidyverse) +input = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedPeaks.tsv") +annot = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") + +input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell +input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys + +protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedProteins.tsv") +protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) +input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) + +# input$`Protein Group` = ifelse( +# input$`Protein Group` %in% ecoli$`Protein Groups`, +# paste(input$`Protein Group`, "|ECOLI", sep = ""), +# paste(input$`Protein Group`, "|HUMAN", sep = "")) +# write.csv(input, "QuantifiedPeaks-MBR.csv", row.names = FALSE) + + +output = MetamorpheusToMSstatsFormat(input, annot) +QuantData = dataProcess(output, normalization = FALSE) + +dataProcessPlots(QuantData, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) + +comparison <- matrix(c(-1,0,0,0,1, # 3x + -1,0,0,1,0, # 2.5x + -1,0,1,0,0, # 2x + -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x +row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") +groups = levels(QuantData$ProteinLevelData$GROUP) +colnames(comparison) <- groups[order(as.numeric(groups))] +model <- groupComparison(contrast.matrix=comparison, data=QuantData, + use_log_file = FALSE) + +library(tidyverse) +ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") +model$ComparisonResult = model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) + +e_group = model$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) +ecoli = e_group %>% filter(ecoli == TRUE) +hist(ecoli$log2FC) + +ecoli = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) +human = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) +FDR = nrow(human) / (nrow(ecoli) + nrow(human)) + + +cat("FDR MBR", FDR, "\n") +# FDR no MBR seems to be lower than that of FDR with MBR (except for E-A label), but it's not by a wide margin. +# When normalization was enabled, FDR spiked to 38% without MBR and 58% with MBR. +# When we set adj.pvalue to 0.01, FDR without MBR does better, but not by much. +# Less proteins detected as significant with MBR disabled. \ No newline at end of file From 423b485d0c0ad5c6fa2000607cf2ae652f62466b Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 15:39:12 -0400 Subject: [PATCH 34/45] Removed unnecessary file --- metamorpheus_code.R | 112 -------------------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 metamorpheus_code.R diff --git a/metamorpheus_code.R b/metamorpheus_code.R deleted file mode 100644 index fa0c12c1..00000000 --- a/metamorpheus_code.R +++ /dev/null @@ -1,112 +0,0 @@ -library(MSstatsConvert) -library(MSstats) -library(tidyverse) - - -# No MBR -input_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedPeaks.tsv") - - - -annot_no_mbr = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") - - -input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell -input_no_mbr = input_no_mbr %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys -protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/QuantifiedProteins.tsv") -protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) -input_no_mbr = input_no_mbr %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) - - -# input_no_mbr$`Protein Group` = ifelse( -# input_no_mbr$`Protein Group` %in% ecoli$`Protein Groups`, -# paste(input_no_mbr$`Protein Group`, "|ECOLI", sep = ""), -# paste(input_no_mbr$`Protein Group`, "|HUMAN", sep = "")) -# write.csv(input_no_mbr, "QuantifiedPeaks.csv", row.names = FALSE) - - -output_no_mbr = MetamorpheusToMSstatsFormat(input_no_mbr, annot_no_mbr) -QuantData_no_mbr = dataProcess(output_no_mbr, normalization = FALSE) - -dataProcessPlots(QuantData_no_mbr, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) - -comparison <- matrix(c(-1,0,0,0,1, # 3x - -1,0,0,1,0, # 2.5x - -1,0,1,0,0, # 2x - -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x - - -row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") -groups = levels(QuantData_no_mbr$ProteinLevelData$GROUP) -colnames(comparison) <- groups[order(as.numeric(groups))] -model_no_mbr <- groupComparison(contrast.matrix=comparison, data=QuantData_no_mbr, - use_log_file = FALSE) - -library(tidyverse) -ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") -model_no_mbr$ComparisonResult = model_no_mbr$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) - -e_group_no_mbr = model_no_mbr$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) -ecoli_no_mbr = e_group_no_mbr %>% filter(ecoli == TRUE) -hist(ecoli_no_mbr$log2FC) - -ecoli_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) -human_no_mbr = e_group_no_mbr %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) -FDR_no_mbr = nrow(human_no_mbr) / (nrow(ecoli_no_mbr) + nrow(human_no_mbr)) - -cat("FDR no MBR", FDR_no_mbr, "\n") - -# With MBR -library(MSstatsConvert) -library(MSstats) -library(tidyverse) -input = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedPeaks.tsv") -annot = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_NoNormalization_NoPIP/annotation.csv") - -input = input %>% filter(!str_detect(`Protein Group`, ";")) # remove multiple protein group in same cell -input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys - -protein_mappings = data.table::fread("/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current/FlashLFQ_v1.0_NoNormalization_wPIP/QuantifiedProteins.tsv") -protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) -input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) - -# input$`Protein Group` = ifelse( -# input$`Protein Group` %in% ecoli$`Protein Groups`, -# paste(input$`Protein Group`, "|ECOLI", sep = ""), -# paste(input$`Protein Group`, "|HUMAN", sep = "")) -# write.csv(input, "QuantifiedPeaks-MBR.csv", row.names = FALSE) - - -output = MetamorpheusToMSstatsFormat(input, annot) -QuantData = dataProcess(output, normalization = FALSE) - -dataProcessPlots(QuantData, "QCPlot", which.Protein = "allonly", address = FALSE, isPlotly = TRUE) - -comparison <- matrix(c(-1,0,0,0,1, # 3x - -1,0,0,1,0, # 2.5x - -1,0,1,0,0, # 2x - -1,1,0,0,0),nrow=4,byrow = TRUE) # 1.5x -row.names(comparison) <- c("E-A", "D-A", "C-A", "B-A") -groups = levels(QuantData$ProteinLevelData$GROUP) -colnames(comparison) <- groups[order(as.numeric(groups))] -model <- groupComparison(contrast.matrix=comparison, data=QuantData, - use_log_file = FALSE) - -library(tidyverse) -ecoli_proteins = protein_mappings %>% filter(Organism == "Escherichia coli (strain K12)") -model$ComparisonResult = model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_proteins$`Protein Groups`) - -e_group = model$ComparisonResult %>% filter(Label == "B-A") %>% filter(is.na(issue)) -ecoli = e_group %>% filter(ecoli == TRUE) -hist(ecoli$log2FC) - -ecoli = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == TRUE) -human = e_group %>% filter(adj.pvalue < 0.05) %>% filter(ecoli == FALSE) -FDR = nrow(human) / (nrow(ecoli) + nrow(human)) - - -cat("FDR MBR", FDR, "\n") -# FDR no MBR seems to be lower than that of FDR with MBR (except for E-A label), but it's not by a wide margin. -# When normalization was enabled, FDR spiked to 38% without MBR and 58% with MBR. -# When we set adj.pvalue to 0.01, FDR without MBR does better, but not by much. -# Less proteins detected as significant with MBR disabled. \ No newline at end of file From 7485d9519545cbed3c56d885a66d3d4709b25f1d Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 15:58:48 -0400 Subject: [PATCH 35/45] Changes for variable name correction --- benchmark/metamorpheus_Process.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index cd6c4fba..dcf7fce1 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -20,13 +20,13 @@ calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0 filter(Organism == "Escherichia coli (strain K12)") %>% pull(`Protein Groups`) - comp <- model$ComparisonResult %>% + filtered_results <- model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_ids) %>% filter(is.na(issue)) - labels <- unique(comp$Label) + labels <- unique(filtered_results$Label) result_rows <- lapply(labels, function(lbl) { - df <- comp %>% filter(Label == lbl) + df <- filtered_results %>% filter(Label == lbl) sig <- df %>% filter(adj.pvalue < alpha) tp <- sig %>% filter(ecoli) %>% nrow() From 4fb9d062ba3722fa7d2bd0795bf630c9c2adecaf Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 16:09:05 -0400 Subject: [PATCH 36/45] Variable rename revert --- benchmark/metamorpheus_Process.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index dcf7fce1..cd6c4fba 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -20,13 +20,13 @@ calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0 filter(Organism == "Escherichia coli (strain K12)") %>% pull(`Protein Groups`) - filtered_results <- model$ComparisonResult %>% + comp <- model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_ids) %>% filter(is.na(issue)) - labels <- unique(filtered_results$Label) + labels <- unique(comp$Label) result_rows <- lapply(labels, function(lbl) { - df <- filtered_results %>% filter(Label == lbl) + df <- comp %>% filter(Label == lbl) sig <- df %>% filter(adj.pvalue < alpha) tp <- sig %>% filter(ecoli) %>% nrow() From 9fc6a86663a801a259f4503c86ffb9593b590cf8 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 17:52:11 -0400 Subject: [PATCH 37/45] PR feedbacks --- benchmark/metamorpheus_Process.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R index cd6c4fba..035d45a9 100644 --- a/benchmark/metamorpheus_Process.R +++ b/benchmark/metamorpheus_Process.R @@ -20,13 +20,13 @@ calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0 filter(Organism == "Escherichia coli (strain K12)") %>% pull(`Protein Groups`) - comp <- model$ComparisonResult %>% + filtered_comparison_result <- model$ComparisonResult %>% mutate(ecoli = Protein %in% ecoli_ids) %>% filter(is.na(issue)) - labels <- unique(comp$Label) + labels <- unique(filtered_comparison_result$Label) result_rows <- lapply(labels, function(lbl) { - df <- comp %>% filter(Label == lbl) + df <- filtered_comparison_result %>% filter(Label == lbl) sig <- df %>% filter(adj.pvalue < alpha) tp <- sig %>% filter(ecoli) %>% nrow() From 8518c44a9a3d84b2f4bccd43e776fcf234e6a119 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 18:10:59 -0400 Subject: [PATCH 38/45] Changes for Unique Organisms --- benchmark/benchmark_Metamorpheus.R | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 33bb366d..95e855b7 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -29,7 +29,11 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { input = input %>% filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) - protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) + + valid_organisms <- unique(input$Organism) + + protein_mappings = protein_mappings %>% + filter(Organism %in% valid_organisms) input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) From 8eefbff3faceee1f0b6416f5ee72ee95225d1d65 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 18:23:07 -0400 Subject: [PATCH 39/45] Changes for protein_mappings --- benchmark/benchmark_Metamorpheus.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 95e855b7..57c12be4 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -35,6 +35,8 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { protein_mappings = protein_mappings %>% filter(Organism %in% valid_organisms) + print(protein_mappings) + input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) output = MetamorpheusToMSstatsFormat(input, annot) From a9fbe803e52384f03b6951e69c32d0ecf0644cf7 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 18:50:55 -0400 Subject: [PATCH 40/45] Correction for Organism column --- benchmark/benchmark_Metamorpheus.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 57c12be4..3fd8c313 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -31,6 +31,10 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) valid_organisms <- unique(input$Organism) + + if (is.null(valid_organisms) || length(valid_organisms) == 0) { + valid_organisms <- unique(input$`Organism.y`) + } protein_mappings = protein_mappings %>% filter(Organism %in% valid_organisms) From 3be70e4534fe3b51b54ee38af72acdda7458b359 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Fri, 15 Aug 2025 19:02:45 -0400 Subject: [PATCH 41/45] Organisms --- benchmark/benchmark_Metamorpheus.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 3fd8c313..5c8040d8 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -31,11 +31,13 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) valid_organisms <- unique(input$Organism) - + if (is.null(valid_organisms) || length(valid_organisms) == 0) { valid_organisms <- unique(input$`Organism.y`) } + print("Organisms") + print(valid_organisms) protein_mappings = protein_mappings %>% filter(Organism %in% valid_organisms) From 85a41cde6878f0b34cd518df0646b9af8a2825af Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Mon, 18 Aug 2025 11:57:28 -0400 Subject: [PATCH 42/45] Changes for organisms column --- benchmark/benchmark_Metamorpheus.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 5c8040d8..5e43ba08 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -35,7 +35,8 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { if (is.null(valid_organisms) || length(valid_organisms) == 0) { valid_organisms <- unique(input$`Organism.y`) } - + + print(colnames(input)) print("Organisms") print(valid_organisms) protein_mappings = protein_mappings %>% From c4c0c86724fca9e1aa3f5e383f0a3ede737f37da Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Mon, 18 Aug 2025 12:34:05 -0400 Subject: [PATCH 43/45] Reverted changes --- benchmark/benchmark_Metamorpheus.R | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 5e43ba08..7d3cf645 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -30,17 +30,7 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { protein_mappings = data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) - valid_organisms <- unique(input$Organism) - - if (is.null(valid_organisms) || length(valid_organisms) == 0) { - valid_organisms <- unique(input$`Organism.y`) - } - - print(colnames(input)) - print("Organisms") - print(valid_organisms) - protein_mappings = protein_mappings %>% - filter(Organism %in% valid_organisms) + protein_mappings = protein_mappings %>% filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens")) print(protein_mappings) From f6ac63e219077da570d83a904345f8cf0e374be8 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Mon, 18 Aug 2025 13:13:26 -0400 Subject: [PATCH 44/45] Added new arguments --- benchmark/benchmark_Metamorpheus.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 7d3cf645..460c6ec4 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -36,7 +36,7 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) - output = MetamorpheusToMSstatsFormat(input, annot) + output = MetamorpheusToMSstatsFormat(input, annot, removeFewMeasurements = FALSE, removeProtein_with1Feature = FALSE) data_process_tasks <- list( list( From 90cf018acd2c26c6a2db72ed01d81070cbc2c3f6 Mon Sep 17 00:00:00 2001 From: Anshuman Raina Date: Mon, 18 Aug 2025 15:01:31 -0400 Subject: [PATCH 45/45] Removed params --- benchmark/benchmark_Metamorpheus.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R index 460c6ec4..7d3cf645 100644 --- a/benchmark/benchmark_Metamorpheus.R +++ b/benchmark/benchmark_Metamorpheus.R @@ -36,7 +36,7 @@ runBenchmarkForMetaMorpheusData <- function(datasetPath, config) { input = input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`) - output = MetamorpheusToMSstatsFormat(input, annot, removeFewMeasurements = FALSE, removeProtein_with1Feature = FALSE) + output = MetamorpheusToMSstatsFormat(input, annot) data_process_tasks <- list( list(