diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index f8d9dcf0..47296677 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -3,8 +3,8 @@ name: Run Simple R Script on HPC via Slurm on: push: branches: - # - feature/multiple-scripts - - devel + - feature/metamorpheus-scripts + # - devel jobs: Benchmarking-pipeline: @@ -21,24 +21,24 @@ jobs: mkdir -p ~/.ssh touch ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa - echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_rsa - ssh-keyscan -H login-00.discovery.neu.edu >> ~/.ssh/known_hosts || exit 1 + echo "${{ secrets.SSH_PRIVATE_KEY_EXPLORER }}" > ~/.ssh/id_rsa + ssh-keyscan -H login.explorer.northeastern.edu >> ~/.ssh/known_hosts || exit 1 - name: Transfer Files to HPC run: | - scp -O benchmark/benchmark_Dowell2021-HEqe408_LFQ.R benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R benchmark/scriptController.json benchmark/calculateMetrics.R benchmark/config.slurm raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking || exit 1 + scp -r benchmark raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking || exit 1 - name: Submit Slurm Job and Capture Job ID id: submit_job run: | - ssh raina.ans@login-00.discovery.neu.edu "cd /work/VitekLab/Projects/Benchmarking && sbatch config.slurm" | tee slurm_job_id.txt + ssh raina.ans@login.explorer.northeastern.edu "cd /projects/VitekLab/Projects/Benchmarking/benchmark && sbatch config.slurm" | tee slurm_job_id.txt slurm_job_id=$(grep -oP '\d+' slurm_job_id.txt) echo "Slurm Job ID is $slurm_job_id" echo "slurm_job_id=$slurm_job_id" >> $GITHUB_ENV - name: Monitor Slurm Job run: | - ssh raina.ans@login-00.discovery.neu.edu " + ssh raina.ans@login.explorer.northeastern.edu " while squeue -j ${{ env.slurm_job_id }} | grep -q ${{ env.slurm_job_id }}; do echo 'Job Id : ${{ env.slurm_job_id }} is still running...' 
sleep 10 @@ -48,8 +48,8 @@ jobs: - name: Fetch Output run: | - scp -O raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_output.txt job_output.txt - scp -O raina.ans@login-00.discovery.neu.edu:/work/VitekLab/Projects/Benchmarking/job_error.txt job_error.txt + scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/benchmark/job_output.txt job_output.txt + scp -O raina.ans@login.explorer.northeastern.edu:/projects/VitekLab/Projects/Benchmarking/benchmark/job_error.txt job_error.txt - name: Upload Output as Artifact uses: actions/upload-artifact@v4
diff --git a/benchmark/benchmark_Metamorpheus.R b/benchmark/benchmark_Metamorpheus.R
new file mode 100644
index 00000000..7d3cf645
--- /dev/null
+++ b/benchmark/benchmark_Metamorpheus.R
@@ -0,0 +1,137 @@
+library(MSstatsConvert)
+library(MSstats)
+library(parallel)
+library(stringr)
+library(jsonlite)
+library(dplyr)
+
+source("metamorpheus_Process.R")
+config <- fromJSON("scriptController.json", simplifyVector = FALSE)
+
+# Stop with the labels of every task whose mclapply() worker failed.
+# mclapply() only warns on failure: a task that raised an error comes back as
+# a "try-error" object and a task whose worker died comes back as NULL, so
+# failures have to be detected explicitly before the results are used.
+# Pass allow_null = TRUE when NULL is a legitimate task result.
+stopOnFailedTasks <- function(results, labels, stage, allow_null = FALSE) {
+  failed <- vapply(results, function(res) {
+    inherits(res, "try-error") || (!allow_null && is.null(res))
+  }, logical(1))
+  if (any(failed)) {
+    stop(
+      stage, " failed for: ", paste(labels[failed], collapse = "; "),
+      call. = FALSE
+    )
+  }
+  invisible(results)
+}
+
+# Run the MetaMorpheus benchmark for one dataset entry of scriptController.json.
+#
+# Reads QuantifiedPeaks.tsv and QuantifiedProteins.tsv from <parent>/<data> and
+# annotation.csv from <parent>, drops rows whose protein group contains ";" or
+# "DECOY", keeps the proteins annotated as E. coli K12 or Homo sapiens,
+# converts the input with MetamorpheusToMSstatsFormat(), runs six dataProcess()
+# configurations in parallel and prints the table assembled from
+# calculate_Metrics() together with the elapsed time.
+#
+# datasetPath: name of the entry under config$datasets; the entry must provide
+#   "parent" and "data" ("name" is only printed).
+# config: scriptController.json parsed with fromJSON(simplifyVector = FALSE).
+runBenchmarkForMetaMorpheusData <- function(datasetPath, config) {
+  dataset_config <- config$datasets[[datasetPath]]
+  if (is.null(dataset_config)) {
+    stop("Dataset '", datasetPath, "' is not defined in config$datasets", call. = FALSE)
+  }
+  dataset_config <- as.list(dataset_config)
+  if (is.null(dataset_config$parent) || is.null(dataset_config$data)) {
+    stop("Dataset '", datasetPath, "' needs 'parent' and 'data' entries", call. = FALSE)
+  }
+
+  cat("Processing Dataset:", dataset_config$name, "\n")
+
+  filePath <- file.path(dataset_config$parent, dataset_config$data)
+  annotPath <- dataset_config$parent
+
+  input <- data.table::fread(file.path(filePath, "QuantifiedPeaks.tsv"))
+  annot <- data.table::fread(file.path(annotPath, "annotation.csv"))
+
+  cat("Dataset File Path:", filePath, "\n")
+  cat("Annotation File Path:", annotPath, "\n")
+
+  input <- input %>%
+    filter(!str_detect(`Protein Group`, ";")) %>% # remove multiple protein group in same cell
+    filter(!str_detect(`Protein Group`, "DECOY")) # remove decoys
+
+  protein_mappings <- data.table::fread(file.path(filePath, "QuantifiedProteins.tsv")) %>%
+    filter(Organism %in% c("Escherichia coli (strain K12)", "Homo sapiens"))
+
+  print(protein_mappings)
+
+  input <- input %>% filter(`Protein Group` %in% protein_mappings$`Protein Groups`)
+
+  output <- MetamorpheusToMSstatsFormat(input, annot)
+
+  # Each task defers its dataProcess() call in a closure so that the call runs
+  # inside an mclapply() worker.
+  # NOTE(review): the last two labels say "Imputation On" while the calls pass
+  # MBimpute = FALSE -- confirm which of the two is intended.
+  data_process_tasks <- list(
+    list(
+      label = "Data process with Normalized Data",
+      result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20)
+    ),
+    list(
+      label = "Data process with Normalization and MBImpute False",
+      result = function() dataProcess(output, featureSubset = "topN", n_top_feature = 20, MBimpute = FALSE)
+    ),
+    list(
+      label = "Data process without Normalization",
+      result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20)
+    ),
+    list(
+      label = "Data process without Normalization with MBImpute False",
+      result = function() dataProcess(output, featureSubset = "topN", normalization = "FALSE", n_top_feature = 20, MBimpute = FALSE)
+    ),
+    list(
+      label = "Data process without Normalization and Imputation On for all features",
+      result = function() dataProcess(output, featureSubset = "all", normalization = "FALSE", MBimpute = FALSE)
+    ),
+    list(
+      label = "Data process without Normalization and Imputation On for top3 features",
+      result = function() dataProcess(output, featureSubset = "top3", normalization = "FALSE", MBimpute = FALSE)
+    )
+  )
+  task_labels <- vapply(data_process_tasks, function(task) task$label, character(1))
+
+  start_time <- Sys.time()
+
+  # detectCores() counts every core of the node and may return NA, so prefer
+  # the Slurm allocation (--cpus-per-task) when it is set and always use at
+  # least one worker; more workers than tasks would be pointless.
+  allocated_cores <- suppressWarnings(as.integer(Sys.getenv("SLURM_CPUS_PER_TASK")))
+  available_cores <- if (is.na(allocated_cores)) detectCores() - 1L else allocated_cores
+  if (is.na(available_cores)) available_cores <- 1L
+  num_cores <- max(1L, min(available_cores, length(data_process_tasks)))
+
+  summarized_results <- mclapply(data_process_tasks, function(task) {
+    list(label = task$label, summarized = task$result())
+  }, mc.cores = num_cores)
+  stopOnFailedTasks(summarized_results, task_labels, "dataProcess")
+
+  results_list <- mclapply(summarized_results, function(res) {
+    calculate_Metrics(res$summarized, protein_mappings, res$label)
+  }, mc.cores = num_cores)
+  # NULL is tolerated here: a task may legitimately have no rows to report.
+  stopOnFailedTasks(results_list, task_labels, "calculate_Metrics", allow_null = TRUE)
+
+  final_results <- do.call(rbind, results_list)
+  end_time <- Sys.time()
+  total_time <- end_time - start_time
+  print(final_results)
+  # format() keeps the unit (secs/mins/hours) that paste() alone would drop.
+  print(paste("Total Execution Time:", format(total_time)))
+}
+
+runBenchmarkForMetaMorpheusData("DDA-Solivais2024-Metamorpheus_MBR_LFQ", config)
+runBenchmarkForMetaMorpheusData("DDA-Solivais2024-Metamorpheus_NoMBR_LFQ", config) \ No newline at end of file diff --git a/benchmark/config.slurm b/benchmark/config.slurm index 28332e1a..5b4d610d 100644 --- a/benchmark/config.slurm +++ b/benchmark/config.slurm @@ -1,38 +1,51 @@ #!/bin/bash #SBATCH --job-name=msstats_benchmark_job_updated -#SBATCH --chdir=/work/VitekLab/Projects/Benchmarking/ +#SBATCH --chdir=/projects/VitekLab/Projects/Benchmarking/benchmark #SBATCH --output=job_output.txt #SBATCH --error=job_error.txt -#SBATCH --open-mode=append -#SBATCH --time=01:00:00 # Set the maximum run time -#SBATCH --ntasks=1 # Number of tasks (one process) -#SBATCH --cpus-per-task=8 # Use 8 CPU cores for the task -#SBATCH --mem=256G # Request 256GB of memory -#SBATCH --partition=short # Use the 'short' partition (or change as needed) +#SBATCH --time=01:00:00 +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=8 +#SBATCH --mem=128G +#SBATCH --partition=short -module load R-geospatial - -module load gcc/11.1.0 -module load cmake/3.23.2 +module load R +module load cmake/3.30.2 export LC_ALL=C -export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.2-geospatial +export R_LIBS_USER=/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4 +mkdir -p $R_LIBS_USER +mkdir -p $HOME/lib_fix +ln -sf /shared/EL9/explorer/R/4.4.1/lib64/R/lib/libRlapack.so $HOME/lib_fix/libRlapack.so.3 -mkdir -p $R_LIBS_USER +export LD_LIBRARY_PATH=$HOME/lib_fix:/shared/EL9/explorer/R/4.4.1/lib64/R/lib:/usr/lib64:$LD_LIBRARY_PATH -module load R -Rscript -e "if (!requireNamespace('remotes', quietly = TRUE)) install.packages('remotes', lib = Sys.getenv('R_LIBS_USER'), repos = 'https://cloud.r-project.org'); \ -remotes::install_github('Vitek-Lab/MSstats', ref = 'devel', lib = Sys.getenv('R_LIBS_USER')); \ -remotes::install_github('Vitek-Lab/MSstatsConvert', ref = 'master', lib = Sys.getenv('R_LIBS_USER')); \ -install.packages(c('dplyr', 'stringr', 'ggplot2'), lib = Sys.getenv('R_LIBS_USER'), repos = 
'https://cloud.r-project.org')" +Rscript -e ' +.libPaths("/home/raina.ans/R/x86_64-pc-linux-gnu-library/4.4") +Sys.unsetenv("R_LIBS") +cat(".libPaths():\n"); print(.libPaths()) + +if (!requireNamespace("remotes", quietly = TRUE)) + install.packages("remotes", repos = "https://cloud.r-project.org") + +install.packages("nloptr", type = "source", repos = "https://cloud.r-project.org") + +for (pkg in c("dplyr", "stringr", "ggplot2")) { + if (!requireNamespace(pkg, quietly = TRUE)) { + install.packages(pkg, repos = "https://cloud.r-project.org") + } +} +BiocManager::install(c("MSstatsConvert", "preprocessCore"), force = TRUE) +remotes::install_github("Vitek-Lab/MSstats", ref = "devel", force = TRUE) +' -R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R") +R_SCRIPTS=("benchmark_Dowell2021-HEqe408_LFQ.R" "benchmark_Puyvelde2022-HYE5600735_LFQ.R" "benchmark_Metamorpheus.R" ) for script in "${R_SCRIPTS[@]}"; do - echo "Executing script: $script" >> job_output.txt - Rscript "$script" >> job_output.txt 2>> job_error.txt - wait - echo "Finished executing script: $script" >> job_output.txt + echo "Executing script: $script" >> job_output.txt + stdbuf -oL -eL Rscript "$script" >> job_output.txt 2>> job_error.txt + wait + echo "Finished executing script: $script" >> job_output.txt echo -e "\n\n" done \ No newline at end of file
diff --git a/benchmark/metamorpheus_Process.R b/benchmark/metamorpheus_Process.R
new file mode 100644
index 00000000..035d45a9
--- /dev/null
+++ b/benchmark/metamorpheus_Process.R
@@ -0,0 +1,86 @@
+# Compute the empirical false discovery rate of each comparison.
+#
+# Fits the four contrasts E-A, D-A, C-A and B-A with groupComparison() on a
+# five-group design, keeps only the result rows whose `issue` column is NA,
+# and counts every significant protein (adj.pvalue < alpha) listed as
+# "Escherichia coli (strain K12)" in protein_mappings as a true positive and
+# every other significant protein as a false positive.
+#
+# QuantData: summarized data passed on to groupComparison();
+#   levels(QuantData$ProteinLevelData$GROUP) must yield exactly five groups.
+# protein_mappings: table with `Organism` and `Protein Groups` columns.
+# task_label: value stored in the Task column of every returned row.
+# alpha: significance threshold applied to adj.pvalue.
+#
+# Returns a data.frame with columns Task, Comparison and FDR, one row per
+# comparison label; FDR is NA when a comparison has no significant protein and
+# the data.frame has zero rows when no comparison result is left.
+calculate_Metrics <- function(QuantData, protein_mappings, task_label, alpha = 0.05) {
+  comparison <- matrix(
+    c(-1, 0, 0, 0, 1, # E-A
+      -1, 0, 0, 1, 0, # D-A
+      -1, 0, 1, 0, 0, # C-A
+      -1, 1, 0, 0, 0), # B-A
+    nrow = 4, byrow = TRUE
+  )
+  rownames(comparison) <- c("E-A", "D-A", "C-A", "B-A")
+
+  groups <- levels(QuantData$ProteinLevelData$GROUP)
+  if (length(groups) != ncol(comparison)) {
+    stop(
+      "calculate_Metrics expects ", ncol(comparison), " groups but found ",
+      length(groups), ": ", paste(groups, collapse = ", "),
+      call. = FALSE
+    )
+  }
+  # Columns follow the numeric order of the group names; names that are not
+  # numbers keep their factor-level order instead of triggering coercion
+  # warnings.
+  group_values <- suppressWarnings(as.numeric(groups))
+  if (!anyNA(group_values)) {
+    groups <- groups[order(group_values)]
+  }
+  colnames(comparison) <- groups
+
+  model <- groupComparison(
+    contrast.matrix = comparison,
+    data = QuantData,
+    use_log_file = FALSE
+  )
+
+  ecoli_ids <- protein_mappings %>%
+    filter(Organism == "Escherichia coli (strain K12)") %>%
+    pull(`Protein Groups`)
+
+  filtered_comparison_result <- model$ComparisonResult %>%
+    mutate(ecoli = Protein %in% ecoli_ids) %>%
+    filter(is.na(issue))
+
+  labels <- unique(filtered_comparison_result$Label)
+  result_rows <- lapply(labels, function(lbl) {
+    sig <- filtered_comparison_result %>%
+      filter(Label == lbl, adj.pvalue < alpha)
+
+    tp <- sum(sig$ecoli)
+    fp <- sum(!sig$ecoli)
+    tot <- tp + fp
+
+    data.frame(
+      Task = task_label,
+      Comparison = lbl,
+      FDR = if (tot > 0) fp / tot else NA_real_,
+      stringsAsFactors = FALSE
+    )
+  })
+
+  if (length(result_rows) == 0) {
+    # Keep the return type stable when nothing is left to report.
+    return(data.frame(
+      Task = character(0),
+      Comparison = character(0),
+      FDR = numeric(0),
+      stringsAsFactors = FALSE
+    ))
+  }
+  do.call(rbind, result_rows)
+}
diff --git a/benchmark/scriptController.json b/benchmark/scriptController.json index 9b6cbc06..449c667a 100644 --- a/benchmark/scriptController.json +++ b/benchmark/scriptController.json @@ -2,7 +2,7 @@ "datasets": { "DDA-Puyvelde2022-HYE5600735_LFQ": { "name": "DDA-Puyvelde2022-HYE5600735_LFQ", - "file": "/work/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats_fixed.csv", + "file": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Puyvelde2022/DDA-Puyvelde2022-HYE5600735_LFQ/FragPipe/TOP0/MSstats_fixed.csv", "samples": { "Human": { "pattern": "_HUMAN$", @@ -20,7 +20,37 @@ }, "DDA-Dowell2021-HEqe408_LFQ": { "name": "DDA-Dowell2021-HEqe408_LFQ", - "file": "/work/VitekLab/Data/MS/Benchmarking/DDA-Dowell2021-HEqe408_LFQ/FragPipe/TOP0/MSstats.csv", + "file": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Dowell2021-HEqe408_LFQ/FragPipe/TOP0/MSstats.csv", + "samples": { + "Human": { + "pattern": "_HUMAN$", + "type": "insignificant" + }, + "Ecoli": { + "pattern": "_ECOLI$", + "type": "significant" + } + } + }, + "DDA-Solivais2024-Metamorpheus_NoMBR_LFQ": { + "name": 
"DDA-Solivais2024-Metamorpheus_NoMBR_LFQ", + "parent": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current", + "data":"FlashLFQ_NoNormalization_NoPIP", + "samples": { + "Human": { + "pattern": "_HUMAN$", + "type": "insignificant" + }, + "Ecoli": { + "pattern": "_ECOLI$", + "type": "significant" + } + } + }, + "DDA-Solivais2024-Metamorpheus_MBR_LFQ": { + "name": "DDA-Solivais2024-Metamorpheus_MBR_LFQ", + "parent": "/projects/VitekLab/Data/MS/Benchmarking/DDA-Solivais2024_Metamorpheus/Current", + "data":"FlashLFQ_v1.0_NoNormalization_wPIP", "samples": { "Human": { "pattern": "_HUMAN$",