diff --git a/.gitignore b/.gitignore index ef11006e..74dbc757 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .Rproj.user docs +common_data .Rhistory -.DS_Store \ No newline at end of file +.DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index 0fa9a949..a30025de 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,6 +40,7 @@ Imports: DBI, doParallel, dplyr, + DT, forcats, fs, furrr, @@ -51,6 +52,8 @@ Imports: here, htmltools, htmlwidgets, + httr, + httr2, igraph, latexpdf, magrittr, @@ -73,6 +76,7 @@ Imports: tibble, tidyr, tinytex, + tidyverse, tools, UpSetR, viridis, @@ -82,4 +86,5 @@ Imports: XVector, yaml Suggests: - knitr + knitr, + iprscanr diff --git a/NAMESPACE b/NAMESPACE index 6ae464f6..a5a44a79 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ export(findParalogs) export(formatJobArgumentsHTML) export(generateAllAlignments2FA) export(getAccNumFromFA) +export(getCaseStudyReport) export(getProcessRuntimeWeights) export(getTopAccByLinDomArch) export(mapAcc2Name) @@ -98,6 +99,18 @@ export(wordcloud3) export(writeMSA_AA2FA) export(writeProcessRuntime2TSV) export(writeProcessRuntime2YML) +import(Biostrings) +import(DT) +import(data.table) +import(dplyr) +import(httr) +import(iprscanr) +import(plotly) +import(readr) +import(rentrez) +import(stringr) +import(tidyverse) +import(visNetwork) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/MolEvolData_class.R b/R/MolEvolData_class.R new file mode 100644 index 00000000..ab496048 --- /dev/null +++ b/R/MolEvolData_class.R @@ -0,0 +1,719 @@ +# Author(s): Samuel Chen +# Last modified: 2020 + +#' @import Biostrings tidyverse + +setClass("MolEvolData", + slots = list( + df = "data.frame", + fasta_path = "character", + msa_path = "character", + ipr_path = "character", + queries = "character", + cln_path = "character", + domainSeqs = "character", + domain_ipr_path = "character" + ) +) +setClass("queryData", + slots = list( + df = "data.frame", + fasta_path = "character", + msa_path = "character", + ipr_path = "character", + queries = "character", + cln_path = "character" + ) +) +setClass("blastUpload", + slots = list( + df = "character", + seqs = "character" + ) +) +setClass("iprUpload", + slots = list( + df = "character", + seqs = "character" + ) +) +setClass("seqUpload", + slots = list( + seqs = "character" + ) +) + +combineFilesNopmap <- function(inpath, pattern, outpath, + delim = "\t", skip = 0, + col_names) { + source_files <- dir(path = inpath, pattern = pattern, recursive = T) + + source_files_path <- paste0(inpath, source_files) + + dt_list <- map(source_files_path, function(x) { + if (!grepl("query", x)) { + fread(x, sep = delim, skip = skip, fill = T) # , col.names = col_names) + } + }) + + combined <- rbindlist(dt_list, fill = T) + + fwrite(combined, outpath, sep = "\t") + + return(combined) +} + + +full_analysis_colnames <- c("AccNum", "QueryName", "STitle", "Species.x", + "TaxID.x", "Lineage", "PcPositive", "PcIdentity", + "AlnLength", "SAccNum", "SAllSeqID", "Mismatch", + "GapOpen", "QStart", "QEnd", "QLength", "SStart", + "SEnd", "SLength", "EValue", + "BitScore", "PcPosOrig", + "Name", "ClusterID", "DomArch.Pfam", + "DomArch.SMART", + "DomArch.CDD", "DomArch.TIGRFAM", + "DomArch.Phobius", + "DomArch.Gene3D", "DomArch.TMHMM", + "DomArch.SignalP_EUK", + "DomArch.SignalP_GRAM_NEGATIVE", + "DomArch.SignalP_GRAM_POSITIVE", + "Description", "Length", "TaxID.y", + "Species.y", "SourceDB", "Completeness") + + +processWrapperDir <- function(path, pinName, 
type = "full") {
+    if (type == "full") {
+        if (file.exists(paste0(path, "/cln_combined.tsv")) && file.exists(paste0(path, "/ipr_combined.tsv"))) {
+            query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+            query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+            query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+            query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+            query_data$Query <- query_data$AccNum
+            queries <- unique(query_data$Query)
+            cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+            query_data <- query_data %>%
+                mutate(QueryName = Name) %>%
+                select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+                distinct(QueryName, .keep_all = TRUE)
+            query_wrapper_data <- new("queryData",
+                df = query_data, fasta_path = query_sequence_path,
+                msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+            )
+            com_blast_path <- paste0(path, "/blast_combined.tsv")
+            com_blast_data <- read_tsv(com_blast_path) %>% arrange(desc(PcPositive))
+            ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+            queries <- unique(com_blast_data$Query)
+            com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+            com_blast_data <- com_blast_data %>% select(QueryName, everything())
+            wrapper_data <- new("MolEvolData",
+                df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
+                fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+            )
+
+            return(list(wrapper_data, query_wrapper_data))
+        } else {
+            # Get the query data
+            query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+            query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+            query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+            query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+            query_data$Query <- query_data$AccNum
+            # alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+            queries <- unique(query_data$Query)
+            cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+            query_data <- query_data %>%
+                mutate(QueryName = Name) %>%
+                select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+                distinct(QueryName, .keep_all = TRUE)
+            query_wrapper_data <- new("queryData",
+                df = query_data, fasta_path = query_sequence_path,
+                msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+            )
+
+            query_data <- query_data %>% select(Query, QueryName)
+
+            com_blast_path <- paste0(path, "/blast_combined.tsv")
+            ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+            com_blast_data <- combineFilesNopmap(paste0(path, "/"),
+                pattern = "*.full_analysis.tsv", skip = 0,
+                col_names = c(), outpath = com_blast_path, delim = "\t"
+            )
+
+            ipr_blast_data <- combineFilesNopmap(paste0(path, "/"),
+                pattern = "*.iprscan_cln.tsv", skip = 0,
+                col_names = ipr_colnames, outpath = ipr_blast_path, delim = "\t"
+            )
+            com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+            com_blast_data <- com_blast_data %>% arrange(desc(PcPositive))
+            fwrite(com_blast_data, com_blast_path, sep = "\t")
+
+            queries <- unique(com_blast_data$Query)
+
+            com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+            com_blast_data <- com_blast_data %>% select(QueryName, everything())
+
+            wrapper_data <- new("MolEvolData",
+                df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+            )
+
+            return(list(wrapper_data, query_wrapper_data))
+        }
+    } else if (type == "dblast") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
+        query_data <- query_data %>%
+            mutate(QueryName = Name) %>%
+            select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+                Query, Name, AccNum) %>%
+            distinct(QueryName, .keep_all = TRUE)
+        queries <- unique(query_data$Query)
+        query_wrapper_data <- new("queryData",
+            df = query_data,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path,
+            ipr_path = query_ipr_path,
+            queries = queries, cln_path = cln_path
+        )
+        query_data <- query_data %>% select(Query, QueryName)
+        com_blast_path <- paste0(path, "/blast_combined.tsv")
+        com_blast_data <- combineFilesNopmap(paste0(path, "/"),
+            pattern = "*.blast.cln.tsv",
+            skip = 0,
+            col_names = c(),
+            outpath = com_blast_path,
+            delim = "\t"
+        )
+        com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+        queries <- unique(com_blast_data$Query)
+        com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+        com_blast_data <- com_blast_data %>%
+            select(QueryName, everything()) %>%
+            arrange(desc(PcPositive))
+        wrapper_data <- new("MolEvolData",
+            df = com_blast_data, queries = queries,
+            ipr_path = "", cln_path = com_blast_path,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path, domainSeqs = "",
+            domain_ipr_path = ""
+        )
+        return(list(wrapper_data, query_wrapper_data))
+    } else if (type == "phylo") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
+        query_data <- query_data %>%
+            mutate(QueryName = Name) %>%
+            select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+                Query, Name, AccNum) %>%
+            distinct(QueryName, .keep_all = TRUE)
+        queries <- unique(query_data$Query)
+        query_wrapper_data <- new("queryData",
+            df = query_data,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path,
+            ipr_path = query_ipr_path,
+            queries = queries, cln_path = cln_path
+        )
+        wrapper_data <- new("MolEvolData",
+            df = query_data,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path,
+            ipr_path = query_ipr_path,
+            queries = queries, cln_path = cln_path,
+            domainSeqs = "", domain_ipr_path = ""
+        )
+        return(list(wrapper_data, query_wrapper_data))
+    } else if (type == "da") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
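+        # Collapse to one representative row per query name before the
+        # per-query table is merged back onto the combined BLAST hits below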
query_data <- query_data %>% + mutate(QueryName = Name) %>% + select(QueryName, Species, contains("Lineage"), contains("DomArch"), + Query, Name) %>% + distinct(QueryName, .keep_all = TRUE) + queries <- unique(query_data$Query) + query_wrapper_data <- new("queryData", + df = query_data, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + ipr_path = query_ipr_path, + queries = queries, cln_path = cln_path + ) + com_blast_path <- paste0(path, "/blast_combined.tsv") + com_blast_data <- combineFilesNopmap(paste0(path, "/"), + pattern = "*.full_analysis.tsv", + skip = 0, + col_names = c(), + outpath = com_blast_path, + delim = "\t" + ) + com_blast_data <- merge(com_blast_data, query_data, by = "Query") + com_blast_data <- com_blast_data %>% + select(QueryName, everything()) %>% + arrange(desc(PcPositive)) + wrapper_data <- new("MolEvolData", + df = com_blast_data, queries = queries, + ipr_path = "", cln_path = com_blast_path, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + domainSeqs = "", domain_ipr_path = "" + ) + return(list(wrapper_data, query_wrapper_data)) + } else { + stop("Unrecognized type. Please use 'full', 'dblast', 'phylo', or 'da'.") + } +} + +drop_empty <- function(df) { + for (c in colnames(df)) { + if (all(is.na(df[, ..c])) || all(df[, ..c] == "")) { + df <- df %>% select(-contains(c)) + } + } + return(df) +} + + +clean_fetched <- function(df) { + df <- as.data.frame(df) + # df = df %>% cleanup_species() + + # Cleanup domarchs + cols <- colnames(df) + for (c in cols) { + if (grepl("^DomArch", c)) { + # Repeats + old <- c + new <- paste0(c, ".repeats") + df <- df %>% cleanup_domarch( + old = old, new = new, + domains_rename = NULL, # domains_rename, + domains_keep = NULL, # filter applied to only ClustName for now. + domains_ignore = NULL, # !! should check and remove soon! + repeat2s = FALSE, + remove_tails = F, # new! check below if it works! + remove_empty = F + ) + + new <- c + df <- df %>% cleanup_domarch( + old = old, new = new, + domains_rename = NULL, # domains_rename, + domains_keep = NULL, # filter applied to only ClustName for now. + domains_ignore = NULL, # !! should check and remove soon! + repeat2s = TRUE, + remove_tails = F, # new! check below if it works! + remove_empty = F + ) + } + } + + + return(df) +} + +# Description +# validation functions for accession numbers/headers for FASTA (validateAccNumFasta) +# & accession number input (validateAccNum) submission types. 
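+#
+# A minimal usage sketch (the inputs shown are hypothetical; both helpers are
+# defined in this file):
+#
+#   validateAccNumFasta(">sp|P12345|DEMO example\nMKVLAA")  # FALSE on duplicate headers
+#   validateAccNum("P12345, WP_000000000.1")  # FALSE on duplicates, otherwise a
+#                                             # list(validation_result, failed_accnums)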
+
+library(Biostrings)
+library(httr)
+library(httr2)
+library(rentrez)
+library(shiny)
+
+getAccNumFromFasta <- function(biostrings_aa_string_set, verbose = FALSE) {
+    # parsing/cleaning accession numbers using the same methods as
+    # `upstream_scripts/00_submit_full.R`'s `get_sequences()` function
+    accnums <- c()
+    for (header in names(biostrings_aa_string_set)) {
+        # case 1 UniProtKB formatted FASTA
+        # header: https://www.uniprot.org/help/fasta-headers
+        if (grepl("\\|", header)) {
+            accnum <- unlist(strsplit(header, "\\|"))[2]
+        # case 2 NCBI formatted FASTA
+        # header: https://www.ncbi.nlm.nih.gov/genbank/fastaformat
+        } else if (grepl(" ", header)) {
+            accnum <- unlist(strsplit(header, " "))[1]
+        # case 3 neither delimiter present; use the whole header
+        } else {
+            accnum <- header
+        }
+        # print debug info
+        if (verbose == TRUE) {
+            cat("header:", header, "\n")
+            cat("accnum_parsed:", accnum, "\n")
+        }
+        accnums <- append(accnums, accnum)
+    }
+    if (verbose) print(accnums)
+    return(accnums)
+}
+
+validateAccNumFasta <- function(text) {
+    # INPUT: text from input object that houses FASTA data
+    # validate the headers/accnums for FASTA submission
+    # Return:
+    #   T: when no duplicate accnums after parsing
+    #   F: when duplicate accnums are present after parsing
+
+    # TRY write tmp file for biostrings parsing
+    path <- tryCatch(
+        expr = {
+            path <- tempfile()
+            write(text, path)
+            path
+        },
+        error = function(e) {
+            NULL
+        }
+    )
+    if (is.null(path)) {
+        cat("failed to write temp fasta during accnum validation\n", file = stderr())
+        return(FALSE) # validation fail
+    }
+
+    # TRY read seq
+    fasta <- tryCatch(
+        expr = {
+            fasta <- Biostrings::readAAStringSet(path)
+            fasta
+        },
+        error = function(e) {
+            NULL
+        },
+        finally = {
+            unlink(path)
+        }
+    )
+    if (is.null(fasta)) {
+        cat("failed to read sequence during accnum validation\n", file = stderr())
+        return(FALSE) # validation fail
+    }
+
+    # parse accnums from fasta
+    accnums <- getAccNumFromFasta(fasta)
+    tb_accnum_counts <- tibble("frequencies" = table(accnums))
+    # check for duplicates
+    if (any(tb_accnum_counts$frequencies > 1)) {
+        cat("duplicate headers found during fasta validation\n", file = stderr())
+        return(FALSE) # validation fail
+    } else {
+        return(TRUE)
+    }
+}
+
+
+# Test whether a single accession returns a valid protein from Entrez
+isAccNumValidEntrez <- function(accnum, verbose = FALSE) {
+    # an empty accnum will not raise an error from efetch, so test for this first
+    if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+
+    # try performing a POST of the accnum, followed by a GET request for a
+    # protein sequence
+    # rentrez::entrez_fetch() returns an error when there's no protein,
+    # so we're using try statements to handle this and return FALSE upon error
+    result <- tryCatch(
+        expr = {
+            result <- rentrez::entrez_fetch(db = "protein", id = accnum, rettype = "fasta")
+            if (verbose) {print(result)}
+            TRUE
+        },
+        error = function(e) {if (verbose) {print(e)}; FALSE}
+    )
+    return(result)
+}
+
+# Test whether a single accession returns a valid protein from EBI
+isAccNumValidEbi <- function(accnum, verbose = FALSE) {
+    # validation: ensure there's some text to parse
+    if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+    # construct a httr2 request to POST an accession number, then GET the fasta
+    url_base <- "https://www.ebi.ac.uk/proteins/api/proteins?accession="
+    url_protein <- paste0(url_base, accnum)
+    req <- httr2::request(url_protein) |>
+        httr2::req_headers("Accept" =
"text/x-fasta", "Content-type" = "application/x-www-form-urlencoded") + + # wrap in try since a failed HTTP request will raise an R error + try(httr2::req_perform(req)) + # get the HTTP response code + resp <- httr2::last_response() + # assign result based on HTTP response code ('200' is a success) + result <- ifelse(resp$status == 200, TRUE, FALSE) + if (verbose) { + msg <- stringr::str_glue( + "EBI protein query results:\n", + "\taccnum: {accnum}\t validation result: {result}\n" + ) |> print() + } + return(result) +} + +# Perform a series of API reqs using NCBI entrez to validate accession numbers +performEntrezReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { + # API guidelines docs + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + # entrez recommends no more than 3 POSTs per second + # simple method, sleep for 1 second after every 3rd POST + i <- 0 + results <- vapply( + X = accnums, + FUN = function(accnum) { + result <- isAccNumValidEntrez(accnum, verbose = TRUE) + i <<- i + 1L + if (i >= 3L) {Sys.sleep(1); print('sleeping for entrez API reqs'); i <<- 0L} + if (track_progress) {incProgress(1)} + result + }, + FUN.VALUE = logical(1) + ) + return(results) +} + +performEbiReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { + # API guidelines docs + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + # EBI allows 200 POSTs per second + # simple method, sleep for 1 second after every 200th POST + i <- 0 + results <- vapply( + X = accnums, + FUN = function(accnum) { + result <- isAccNumValidEbi(accnum, verbose = TRUE) + i <<- i + 1L + if (i >= 200L) {Sys.sleep(1); i <<- 0L} + if (track_progress) incProgress(1) + result + }, + FUN.VALUE = logical(1) + ) + return(results) +} + +# Validate accession numbers from MolEvolvR user input +validateAccNum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) { + # API guidelines docs + # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + + # get accnums from three possible delimiters + # 1. comma separated values with any amount of space between them + # 2. one or more space delimiter + # 3. "\n" (newline) delimiter + accnums <- unlist(strsplit(text, "\\s*,\\s*|\\s+|\\n|,")) + + warning() + + if (track_progress) {incProgress(1 / n_steps, + message = "Testing for duplicate accession numbers . . .")} + # fail fast for duplicates + if (any(duplicated(accnums))) { + warning("duplicate accesion numbers found") + if (track_progress) {setProgress(n_steps, + message = "Duplicate accession numbers found")} + return(FALSE) + } + if (track_progress) {incProgress(1 / n_steps, + message = "Please wait (searching NCBI's protein database) . . .")} + # entrez API + entrez_results <- performEntrezReqs(accnums, verbose = verbose) + if (verbose) { + stringr::str_glue("rentrez + results:\n\t{paste0(entrez_results, collapse=",")}\n") |> + print() + } + + # EBI API + # if all of entrez resulted in success, then skip EBI. Else, try EBI + if (!all(entrez_results)) { + ebi_results <- performEbiReqs(accnums, verbose = verbose) + if (verbose) { + stringr::str_glue("ebi results:\n\t{paste0(ebi_results, + collapse=",")}\n") |> + print() + } + if (track_progress) {incProgress(1 / n_steps, message = "Please wait + (searching EBI's protein database) . . 
.")} + # OR test each accnum result across both dbs + final_results <- ebi_results | entrez_results + } else { + if (track_progress) {setProgress(n_steps, message = "All accession + numbers validated")} + final_results <- entrez_results + } + failed_accnums <- accnums[which(!final_results)] + if (length(failed_accnums) >= 1) { + msg <- stringr::str_glue( + "The following accnums failed validation:\n", + "\t{paste0(failed_accnums, collapse=",")}\n" + ) + warning(msg) + if (track_progress) {setProgress(n_steps, message = "Accession number + validation failed")} + } + setProgress(n_steps, message = "Accession number validation finished") + return( + list( + "validation_result" = all(final_results), + "failed_accnums" = failed_accnums + ) + ) +} + +#=============================================================================== +# Description +# validation function for evalue input +#=============================================================================== +validateEvalue <- function(input_value) { + is_valid_evalue <- is.numeric(input_value) && input_value != 0 + return(is_valid_evalue) +} + +#=============================================================================== +# Author(s): JK +# Last modified: 2023_06 +#=============================================================================== +#------------------------------------------------------------------------------- +guessSeqType <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) { + tb <- as_tibble(alphabetFrequency(single_fasta)) + n_other <- if ("other" %in% colnames(tb)) sum(unlist(tb["other"])) else 0 + + aa_cols <- setdiff(Biostrings::AA_ALPHABET, c("*", ".")) + tb_dna <- tb %>% select(all_of(Biostrings::DNA_ALPHABET)) + tb_aa <- tb %>% select(all_of(aa_cols)) + + total <- nchar(single_fasta) + est_dna_prop <- sum(unlist(tb_dna)) / total + est_aa_prop <- sum(unlist(tb_aa)) / total + other_prop <- n_other / total + cat( + names(single_fasta), "\n", + "estimated DNA alphabet proportion:", est_dna_prop, "\n", + "estimated AA alphabet proportion:", est_aa_prop, "\n", + "'other' alphabet proportion:", other_prop, "\n" + ) + + if (est_dna_prop >= dna_guess_cutoff) { + guess <- "DNA" + } else if (other_prop >= other_guess_cutoff) { + guess <- NA + } else { + guess <- "AA" + } + cat("Guess:", guess, "\n") + return(guess) +} + +validateSeqBody <- function(text) { + # convert string to single letter character vector + individual_chars <- unlist(strsplit(text, "")) + # return the characters from input that are NOT in the AA_ALPHABET + invalid_chars <- setdiff(individual_chars, Biostrings::AA_ALPHABET) + n_invalid_chars <- length(invalid_chars) + + # 0 length character vector means the sequence contains + # characters that are all present in the AA_ALPHABET + # >0 length character vector means there are some characters + # which are NOT found in the AA_ALPHABET + if (n_invalid_chars == 0) { + return(TRUE) + } else { + warning("A sequence contains characters not found in the AA_ALPHABET") + return(FALSE) + } +} + +#------------------------------------------------------------------------------- +validateFasta <- function(fasta_path, .type = "AA") { + # handle case of fasta being unreadable/unrecognized format + fasta <- tryCatch( + expr = Biostrings::readAAStringSet(fasta_path), + error = function(e) { + warning(paste0("error: failed to read sequence from: ", fasta_path)) + return(NULL) + } + ) + # failure to read yields quick return + if (is.null(fasta)) { + return(FALSE) # validation fail + } + + # require at 
least some characters for the sequence + if (!(sum(nchar(unlist(fasta))) >= 1)) { + warning(paste0("fasta does not have any sequence characters")) + return(FALSE) # validation fail + } + print(fasta) + + ### validate sequence BODY + # iteratively return boolean value for valid seq body (FALSE = invalid) + seq_body_validations <- c() + for (i in seq_along(1:length(fasta))) { + seq_body <- as.character(fasta[i]) + seq_body_validations <- c(validateSeqBody(seq_body), seq_body_validations) + } + # require all sequences to have chars only in AA_ALPHABET + if (!(all(seq_body_validations))) { + warning("At least one sequence has non-IUPAC AA characters") + return(FALSE) # validation fail + } + + ### validate sequence TYPE + # iteratively guess the type (DNA, AA, or other of each seq in the FASTA file) + seq_types <- c() + for (idx in seq_along(1:length(fasta))) { + seq_types <- c(guessSeqType(fasta[idx]), seq_types) + } + + switch(.type, + "DNA" = { + anti_type <- "AA" + }, + "AA" = { + anti_type <- "DNA" + } + ) + + # if all seqs are of desired .type, only then TRUE + if (sum(is.na(seq_types)) != 0) { + return(FALSE) # validation fail + } else if (anti_type %in% seq_types) { + return(FALSE) # validation fail + } else { + return(TRUE) # validation success + } +} + +# Author(s): Samuel Chen, Lo Sosinski, Joe Burke +# Last modified: 2021 + +alpha_numeric <- c(0:9, letters, LETTERS) + +rand_string <- function(length, characters = alpha_numeric, post_fix = "", ignorelist = c()) { + str <- NULL + + while (is.null(str) || (str %in% ignorelist)) { + chars <- sample(characters, size = length, replace = TRUE) + str <- paste0(paste(chars, collapse = ""), post_fix) + } + + return(str) +} +strsplit_vect <- function(strings, pattern = "", pos) { + split_strings <- map(strsplit(strings, "_"), function(x) x[1]) %>% unlist() + + return(split_strings) +} diff --git a/R/combine_files.R b/R/combine_files.R index 4f03b1d2..1c5ac6a2 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -24,22 +24,22 @@ #' #' @author Janani Ravi #' -#' @param inpath Character. The master directory path where the files reside. +#' @param inpath Character. The master directory path where the files reside. #' The search is recursive (i.e., it will look in subdirectories as well). -#' @param pattern Character. A search pattern to identify files to be combined. +#' @param pattern Character. A search pattern to identify files to be combined. #' Default is "*full_analysis.tsv". -#' @param delim Character. The delimiter used in the input files. -#' Default is tab ("\t"). -#' @param skip Integer. The number of lines to skip at the beginning of each file. +#' @param delim Character. The delimiter used in the input files. +#' Default is tab \code{"\t"}. +#' @param skip Integer. The number of lines to skip at the beginning of each file. #' Default is 0. -#' @param col_names Logical or character vector. If TRUE, the first row of each file -#' is treated as column names. Alternatively, a character vector can +#' @param col_names Logical or character vector. If TRUE, the first row of each file +#' is treated as column names. Alternatively, a character vector can #' be provided to specify custom column names. #' #' @importFrom purrr pmap_dfr #' @importFrom readr cols #' -#' @return A data frame containing the combined contents of all matched files. +#' @return A data frame containing the combined contents of all matched files. #' Each row will include a new column "ByFile" indicating the source file of the data. 
#' #' @export diff --git a/R/generate_report.R b/R/generate_report.R new file mode 100644 index 00000000..65b8364d --- /dev/null +++ b/R/generate_report.R @@ -0,0 +1,740 @@ +# Author(s): Awa Synthia +# Last modified: 2024 + +# import libs +#' @import readr stringr + +#' @export +# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams") +getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { + cat("\n") + cat("\033[1;36m") # Cyan for the "MolEvolvR" header + cat("=========================================\n") + cat(" M o l E v o l R\n") + cat("=========================================\n") + cat("\033[0m") + + cat("\033[1;32m") + cat("\n>>>>> Case Study Report Generation started... <<<<<\n") + cat("\033[0m") + + # Step 1: Fetch the FASTA sequences of the given pathogen and drug + getCardData(pathogen, drug) + + # Check if the FASTA file exists after fetching + fasta_file <- "filtered_proteins.fasta" + if (!file.exists(fasta_file)) { + stop("Failed to retrieve FASTA sequences. The file + 'filtered_proteins.fasta' does not exist.") + } + # Step 2: Once the FASTA file is ready, call runAnalysis to run pipeline + message("FASTA file downloaded, running analysis...") + + # Pass the file to runAnalysis function + runAnalysis(file_paths = list(fasta = fasta_file), ...) + + message("Case study report completed") +} + +# get fasta of pathogen and/or drug +getCardData <- function(pathogen = NULL, drug = NULL) { + destination_dir <- "CARD_data" + # Check if CARD data exists + if (!dir.exists(destination_dir)) { + dir.create(destination_dir) + } + if (!file.exists("CARD_data/aro_index.tsv")) { + # Step 1: Download CARD Data + download.file("https://card.mcmaster.ca/latest/data", "card_data.tar.bz2") + #unzip("card_data.tar.bz2", exdir = "CARD_data") + system(paste("tar -xjf card_data.tar.bz2 -C", destination_dir)) + } + + # Step 2: Open ARO_index.tsv + aro_data <- read.delim("CARD_data/aro_index.tsv", sep = "\t", header = TRUE) + + # Step 3: Map CARD Short Name + antibiotics <- read.delim("CARD_data/shortname_antibiotics.tsv", + sep = "\t", header = TRUE) + pathogens <- read.delim("CARD_data/shortname_pathogens.tsv", + sep = "\t", header = TRUE) + + aro_data <- aro_data %>% + mutate( + pathogen = str_extract(CARD.Short.Name, "^[^_]+"), + gene = str_extract(CARD.Short.Name, "(?<=_)[^_]+"), + drug = str_extract(CARD.Short.Name, "(?<=_)[^_]+$") + ) %>% + left_join(pathogens, by = c("pathogen" = "Abbreviation")) %>% + left_join(antibiotics, by = c("drug" = "AAC.Abbreviation")) + + # Sort names + aro_data <- aro_data %>% + arrange(Pathogen, Molecule) %>% + group_by(Pathogen, Molecule) + + # Filter data based on user input + filtered_data <- aro_data + + if (!is.null(pathogen)) { + filtered_data <- filtered_data %>% filter(Pathogen == !!pathogen) + } + + if (!is.null(drug)) { + filtered_data <- filtered_data %>% filter(Molecule == !!drug) + } + + # Check if filtered data is empty + if (nrow(filtered_data) == 0) { + stop("No data found for the specified pathogen and drug.") + } + + # Extract protein accessions + accessions <- filtered_data$Protein.Accession + + # Function to fetch FASTA sequence from NCBI + get_fasta <- function(accession) { + entrez_fetch(db = "protein", id = accession, rettype = "fasta") + } + + # Download sequences + fasta_sequences <- lapply(accessions, get_fasta) + + # Write sequences to a file + writeLines(unlist(fasta_sequences), "filtered_proteins.fasta") + + return("FASTA sequences downloaded to 'filtered_proteins.fasta'.") +} + +# Run analysis 
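+# A minimal calling sketch for the wrapper below ("proteins.fasta" is a
+# hypothetical local file; every other argument falls back to the defaults in
+# the signature):
+#   runAnalysis(file_paths = list(fasta = "proteins.fasta"))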
+runAnalysis <- function( + upload_type = "Fasta", + evalue = 0.00001, + accnum_fasta_input = "", + file_paths = list(accnum = NULL, fasta = tempfile(), msa = tempfile(), + blast = NULL, iprscan = NULL), + blast_db = "refseq_protein", + blast_hits = 100, + blast_eval = 0.0001, + acc_homology_analysis = TRUE, + acc_da_analysis = TRUE, + acc_phylogeny_analysis = FALSE, + report_template_path = system.file("report", "report_template.Rmd", package = "MolEvolvR", mustWork = TRUE), + output_file = file.path(getwd(), "report.html"), + DASelect = "All", + mainSelect = NULL, + PhyloSelect = "All", + q_heatmap_select = "All", + DACutoff = 95, + GCCutoff = 0.5, + query_select = NULL, + query_iprDatabases = c( + "PfamA", "SMART", "Phobius", + "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE", + "SuperFamily", "MobiDBLite", "Panther", "Coils" + ), + query_iprVisType = "Analysis", tree_msa_tool = "ClustalO", + levels = 2, + DA_Col = "DomArch.PfamA", + msa_rep_num = 10, + msa_reduce_by = "Species", + rval_phylo = FALSE, + DA_lin_color = c("default", "viridis", "inferno", "magma", "plasma", "cividis"), + networkLayout = c("nice", "grid", "circle", "random"), + phylo_select = "All", + ... + +) { + + ##### Initialize Variables and Classes ##### + blast_upload_data <- new("blastUpload", df = "", seqs = "") + ipr_upload_data <- new("iprUpload", df = "", seqs = "") + sequence_upload_data <- new("seqUpload", seqs = "") + data <- new("MolEvolData", msa_path = tempfile(), + fasta_path = file_paths$fasta) + app_data <- new("MolEvolData", msa_path = tempfile(), + fasta_path = file_paths$fasta) + query_data <- new("queryData") + + if (length(upload_type) != 1) { + stop("upload_type must be a single value.") + } + + # Reset default analysis function + resetSettings <- function() { + acc_homology_analysis <<- FALSE + acc_da_analysis <<- FALSE + acc_phylogeny_analysis <<- FALSE + domain_split <<- FALSE + } + + fasta_set <- c("Fasta", "AccNum", "MSA") + + # Update settings based on upload type + updateUploadType <- function(upload_type) { + switch(upload_type, + "Fasta" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "AccNum" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "MSA" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "BLAST Output" = { + resetSettings() + acc_phylogeny_analysis <<- TRUE + }, + "InterProScan Output" = { + resetSettings() + acc_da_analysis <<- TRUE + } + ) + } + + updateUploadType(upload_type) + + ####### File Upload Functions ######## + + # Update function to return modified object + uploadAccNumFile <- function(file_path, seq_obj) { + accnum_data <- read_file(file_path) + seq_obj@seqs <- accnum_data + return(seq_obj) + } + + uploadFastaFile <- function(file_path, seq_obj) { + fasta_data <- read_file(file_path) + seq_obj@seqs <- fasta_data + return(seq_obj) + } + + uploadMSAFile <- function(file_path, seq_obj) { + msa_data <- read_file(file_path) + seq_obj@seqs <- msa_data + return(seq_obj) + } + + uploadBlastFile <- function(file_path, blast_obj) { + blast_obj@df <- file_path + return(blast_obj) + } + + uploadIPRScanFile <- function(file_path, ipr_obj) { + ipr_obj@df <- file_path + return(ipr_obj) + } + + # Now modify the calling section to reassign the modified objects + if (!is.null(file_paths$accnum)) { + sequence_upload_data <- uploadAccNumFile(file_paths$accnum, + sequence_upload_data) + } + if (!is.null(file_paths$fasta)) { + sequence_upload_data <- 
uploadFastaFile(file_paths$fasta,
+            sequence_upload_data)
+    }
+    if (!is.null(file_paths$msa)) {
+        sequence_upload_data <- uploadMSAFile(file_paths$msa,
+            sequence_upload_data)
+    }
+    if (!is.null(file_paths$blast)) {
+        blast_upload_data <- uploadBlastFile(file_paths$blast,
+            blast_upload_data)
+    }
+    if (!is.null(file_paths$iprscan)) {
+        ipr_upload_data <- uploadIPRScanFile(file_paths$iprscan,
+            ipr_upload_data)
+    }
+
+    # Validation of inputs
+    fasta_data <- read_file(file_paths$fasta)
+    validate_and_process_inputs <- function(evalue, fasta_data) {
+        if (!validateEvalue(evalue)) {
+            return("Error: A numeric E-value is required. Please set a valid value (e.g., 0.0001).")
+        }
+
+        if (!validateAccNumFasta(fasta_data)) {
+            return("Error: Input for AccNum/Fasta cannot be empty or invalid.")
+        }
+
+        sequence_vector <- unlist(strsplit(fasta_data, "\n"))
+        return(list(message = "Inputs are valid!", sequences = sequence_vector))
+    }
+
+    validation_result <- validate_and_process_inputs(evalue, fasta_data)
+    if (is.character(validation_result)) {
+        return(validation_result)
+    }
+
+    # Phylogenetic Analysis Validation
+    phylo <- acc_phylogeny_analysis
+    if (phylo) {
+        # Validate phylogenetic analysis based on upload type
+        is_valid_phylo <- switch(
+            upload_type,
+            "Fasta" = {
+                str_count(string = sequence_upload_data@seqs, ">") > 1
+            },
+            "AccNum" = {
+                accnums <- sequence_upload_data@seqs |>
+                    strsplit("\\s*,\\s*|\\s+|\\n|,") |>
+                    unlist()
+                length(accnums) > 1
+            },
+            "MSA" = {
+                str_count(string = sequence_upload_data@seqs, ">") > 1
+            },
+            FALSE # Default to FALSE if no valid type is found
+        )
+
+        if (!is_valid_phylo) {
+            return("Error: At least two sequences/identifiers are required for phylogenetic analysis.")
+        }
+    }
+    # Fasta-like submissions
+    if (upload_type %in% fasta_set) {
+        # Can have any combination of select options
+        type <- ""
+        script <- ""
+        postfix <- ""
+
+        if (acc_homology_analysis && !phylo && acc_da_analysis) {
+            type <- "full"
+            postfix <- "full"
+        }
+        # Phylogenetic analysis: do the full script but skip blast
+        else if (phylo && acc_da_analysis) {
+            type <- "phylo"
+            postfix <- "phylo"
+        }
+        else if (acc_da_analysis) {
+            type <- "da"
+            postfix <- "da"
+        }
+        else if (acc_homology_analysis) {
+            # Only run BLAST
+            type <- "dblast"
+            postfix <- "dblast"
+        }
+        else {
+            # Nothing was selected, throw an error
+            stop("Please select one of the above analyses (full, phylo, da, dblast) to run.")
+        }
+    }
+
+    # After uploading the sequence data, check the uploaded data
+    if (sequence_upload_data@seqs == "") {
+        stop("Error: Please upload a protein sequence.")
+    }
+    OUT_PATH <- getwd()
+    unavailable_pins <- list.files(OUT_PATH)
+    unavailable_pins <- strsplit_vect(unavailable_pins, pattern = "_")
+    pinName <- rand_string(length = 6, post_fix = "", ignorelist = unavailable_pins)
+    pin_id <- strtrim(pinName, 6)
+
+    dir <- paste0(OUT_PATH, "/", pin_id, "_", postfix)
+    path <- paste0(dir, "/", pin_id, ".fa")
+    # Adding validation logic based on upload type
+    system(paste0("mkdir ", dir), wait = TRUE)
+    switch(upload_type,
+        "Fasta" = {
+            # Validate sequence limit
+            if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+            }
+
+            # Validate accession numbers for FASTA submission
+            if (!(validateAccNumFasta(sequence_upload_data@seqs))) {
+                stop("Error: Please adjust the FASTA headers. Ensure a header line for each sequence, no duplicate header names, and no duplicate protein accession numbers.")
+            }
+
+            # TRY load fasta for fasta validation
+            is_valid_aa_fasta <- tryCatch(
+                expr = {
+                    tmp_file <- tempfile()
+                    writeLines(sequence_upload_data@seqs, tmp_file)
+                    validateFasta(tmp_file)
+                },
+                error = function(e) {
+                    warning("Error: Failed to run input FASTA verification.")
+                    return(FALSE) # Return FALSE if an error occurs
+                },
+                finally = {
+                    unlink(tmp_file)
+                }
+            )
+
+            # Validate AA fasta
+            if (!(is_valid_aa_fasta)) {
+                stop("Error: The FASTA input could not be recognized as valid Amino Acid (AA) sequence(s). MolEvolvR only accepts FASTA formatted AA/protein sequences as input (not DNA/RNA). For a short reference on FASTA format, review this https://blast.ncbi.nlm.nih.gov/doc/blast-topics/short NCBI guide.")
+            }
+
+            write(sequence_upload_data@seqs, path)
+        },
+        "AccNum" = {
+            # Convert Acc to FASTA
+            accnum_vect <- unlist(strsplit(sequence_upload_data@seqs, "\\s*,\\s*|\\s+|\\n|,"))
+            if (length(accnum_vect) > 200) {
+                stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+            }
+
+            # if an error is raised, validation fails
+            validation_results <- purrr::map_lgl(
+                accnum_vect,
+                function(accnum) {
+                    tryCatch(
+                        expr = {
+                            tmp <- tempfile(
+                                pattern = paste0("molevolvr_accnum_validation-", accnum, "-", "XXXXX"),
+                                fileext = ".fa"
+                            )
+                            acc2FA(accnum, tmp)
+                            readAAStringSet(tmp)
+                            TRUE
+                        },
+                        error = function(e) {
+                            FALSE
+                        },
+                        finally = {
+                            suppressWarnings(sink())
+                            unlink(tmp)
+                        }
+                    )
+                }
+            )
+
+            # If any accession numbers fail, halt submission
+            if (!all(validation_results)) {
+                stop(stringr::str_glue(
+                    "Error: MolEvolvR could not locate sequences for the following accession numbers: ",
+                    "[ {paste0(accnum_vect[which(!validation_results)], collapse = ', ')} ] ",
+                    "Please try submitting FASTA sequences instead."
+                ))
+            } else {
+                # Write a multifasta
+                acc2FA(accnum_vect, outpath = path)
+            }
+        },
+        "MSA" = {
+            # Validate sequence limit
+            if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+            }
+
+            msa_temp <- str_replace_all(sequence_upload_data@seqs, "-", "")
+            write(msa_temp, path)
+        }
+    )
+
+    phylo <- if_else(phylo, "TRUE", "FALSE")
+
+    # Clean FASTA headers
+    fasta <- Biostrings::readAAStringSet(filepath = path)
+    headers_original <- names(fasta)
+    headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x))
+    fasta <- cleanFAHeaders(fasta)
+    headers_cleaned <- names(fasta)
+
+    # Write a table to map original accnums to their cleaned version
+    readr::write_tsv(
+        x = tibble::tibble(
+            "header_original" = headers_original,
+            "header_accnum" = headers_accnum,
+            "header_clean" = headers_cleaned
+        ),
+        file = file.path(dir, "query-fasta_header-map.tsv")
+    )
+
+    # Remove '*' characters which are incompatible with interproscan
+    fasta <- gsub(pattern = "\\*", replacement = "X",
+        x = fasta) |> Biostrings::AAStringSet()
+    Biostrings::writeXStringSet(x = fasta, filepath = path)
+
+    if (domain_split) {
+        submit_split_by_domain(
+            dir = dir,
+            sequences = path,
+            DB = blast_db,
+            NHITS = blast_hits,
+            EVAL = blast_eval,
+            phylo = phylo,
+            type = type,
+            job_code = pin_id
+        )
+    } else {
+        runFull(
+            dir = dir,
+            sequences = path,
+            DB = blast_db,
+            NHITS = blast_hits,
+            EVAL = blast_eval,
+            phylo = phylo,
+            type = type,
+            job_code = pin_id,
+            APPLICATION = query_iprDatabases
+        )
+    }
+
+    # Additional handling for different upload types
+    if (upload_type == "BLAST Output") {
+        dir <- paste0(OUT_PATH, "/", pin_id, "_blast")
+        if (blast_upload_data@df == "") {
+            stop("Error: Please upload a BLAST file.")
+        }
+        if (blast_upload_data@seqs == "" && !blast_ncbi_check) {
+            stop("Error: Please provide a file containing sequences or check the box to fetch sequences for NCBI accession numbers.")
+        }
+
+        if (tools::file_ext(blast_upload_data@df) == "tsv" || blast_upload_data@df == EX_BLASTOUTPUT) {
+            data <- read_tsv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+        } else {
+            data <- read_csv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+        }
+
+        if (nrow(data) > 5005) {
+            stop("Error: BLAST output submissions are limited to 5000 proteins at the moment. For analyses with more than 5000 proteins, please contact janani.ravi@cuanschutz.edu.")
+        }
+
+        system(paste0("mkdir ", dir), wait = TRUE)
+        blast_path <- paste0(dir, "/", pin_id, ".wblast.tsv")
+        write_tsv(data, blast_path, col_names = FALSE)
+
+        queries <- data$Query %>% unique()
+        writeLines(queries, paste0(dir, "/", "accs.txt"))
+
+        if (blast_ncbi_check) {
+            submit_blast(
+                dir = dir,
+                blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = TRUE,
+                job_code = pin_id,
+                submitter_email = notify_email,
+                advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist())
+            )
+        } else {
+            seqs <- read_file(blast_upload_data@seqs)
+            writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+            submit_blast(
+                dir = dir,
+                blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = FALSE,
+                job_code = pin_id,
+                submitter_email = notify_email,
+                advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist())
+            )
+        }
+
+    } else if (upload_type == "InterProScan Output") {
+        dir <- paste0(OUT_PATH, "/", pin_id, "_ipr")
+        path <- paste0(dir, "/", pin_id, "_ipr.tsv")
+        if (ipr_upload_data@df == "") {
+            stop("Error: Please upload an InterProScan file.")
+        }
+        if (ipr_upload_data@seqs == "" && !ipr_ncbi_check) {
+            stop("Error: Please provide a file containing sequences or check the box to fetch sequences using NCBI accession numbers.")
+        }
+
+        ipr <- read_tsv(ipr_upload_data@df, col_names = FALSE)
+        if (nrow(ipr) > 200) {
+            stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+        }
+
+        system(paste0("mkdir ", dir), wait = TRUE)
+        writeLines("AccNum\tSeqMD5Digest\tSLength\tAnalysis\tDB.ID\tSignDesc\tStartLoc\tStopLoc\tScore\tStatus\tRunDate\tIPRAcc\tIPRDesc\tGOTerms\tExtra", path)
+        write_tsv(ipr, path, col_names = FALSE, append = TRUE)
+
+        ncbi <- if_else(ipr_ncbi_check, TRUE, FALSE)
+        blast <- if_else(acc_homology_analysis, TRUE, FALSE)
+
+        if (ncbi) {
+            submit_ipr(
+                dir = dir,
+                ipr = path,
+                blast = acc_homology_analysis,
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = TRUE,
+                DB = blast_db,
+                NHITS = blast_hits,
+                EVAL = blast_eval
+            )
+        } else {
+            seqs <- read_file(ipr_upload_data@seqs)
+            writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+            submit_ipr(
+                dir = dir,
+                ncbi = FALSE,
+                ipr = path,
+                blast = acc_homology_analysis,
+                seqs = paste0(dir, "/", "seqs.fa"),
+                DB = blast_db,
+                NHITS = blast_hits,
+                EVAL = blast_eval
+            )
+        }
+
+    }
+
+    # Process results for the report
+    fetched <- processWrapperDir(dir, pinName = pinName, type = type)
+
+    # Assign fetched data to objects if the fetched list has the expected length
+    if (length(fetched) == 2) {
+        data <- fetched[[1]] # Assign the first element to 'data'
+        app_data <- fetched[[1]] # Assign the same to 'app_data'
+        query_data <- fetched[[2]] # Assign the second element to 'query_data'
+
+        r_nrow_initial <- nrow(fetched[[1]]@df) # Initialize row count
+    }
+
+    domarch_cols_value <- getDomArchCols(app_data, DASelect)
+
+    query_domarch_cols_value <- getQueryDomArchCols(query_data@df)
+
+    mainTable_value <- getDataTable(data)
+
+    queryDataTable_value <- getQueryDataTable(query_data, query_select)
+
+    fastaDataText_value <- getFastaData(query_data@fasta_path)
+
+    domainDataText_value <- getDomData(data)
+
+    msaDataText_value <- getMSAData(query_data@msa_path)
+
+    rs_IprGenes_value <- getIPRGenesVisualization(data,
app_data, + query_iprDatabases, + query_iprVisType) + + rs_network_layout_value <- getRSNetworkLayout(data, + app_data, + cutoff = 100, + layout = "nice") + + rs_data_table_value <- getDataTable(data) + + da_IprGenes_value <- getDomArchIPRGenesPlot(app_data, + query_iprDatabases, + query_iprVisType, + DASelect) + + query_heatmap_value <- getQueryHeatmap(query_data@df, + heatmap_select = "All", + heatmap_color = "blue") + + DA_Prot_value <- getDomArchProt(app_data, DASelect) + + DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.PfamA", + DACutoff, + DA_Prot_value, + DA_lin_color = "viridis", + app_data@ipr_path) + + DALin_TotalCounts_value <- getDomArchTotalCounts(DA_Prot_value, + DACutoff, + DA_col = "DomArch.PfamA", + app_data) + + DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.PfamA", + app_data@ipr_path, + DALin_TotalCounts_value) + + DANetwork_value <- getDomNetwork(DA_col = "DomArch.PfamA", + DACutoff, + DA_Prot_value, + networkLayout = "nice", + app_data@ipr_path) + + rep_accnums_value <- repAccNums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) + + phylogeny_prot_value <- filterPhylogenyProteins(app_data, phylo_select) + + acc_to_name_value <- acc2Name(app_data) + + ####### Report Generation ######## + + tryCatch({ + tempReport <- file.path(dir, "report.Rmd") + file.copy(report_template_path, tempReport, overwrite = TRUE) + + # List of graphics to include in report + params <- list( + rs_interproscan_visualization = rs_IprGenes_value, + proximity_network = rs_network_layout_value, + sunburst = data@df, + data = rs_data_table_value, + queryDataTable = queryDataTable_value, + fastaDataText = fastaDataText_value, + heatmap = query_heatmap_value, + query_data = query_data, + query_domarch_cols = query_domarch_cols_value, + query_iprDatabases = query_iprDatabases, + query_iprVisType = query_iprVisType, + mainTable = mainTable_value, + DALinTable = DALinTable_value, + DALinPlot = DALinPlot_value, + DANetwork = DANetwork_value, + DA_Prot = DA_Prot_value, + domarch_cols = domarch_cols_value, + DA_Col = DA_Col, + DACutoff = DACutoff, + da_interproscan_visualization = da_IprGenes_value, + phylo_sunburst_levels = levels, + phylo_sunburst = phylogeny_prot_value, + tree_msa_tool = tree_msa_tool, + repAccNums = rep_accnums_value, + msa_rep_num = 3, + app_data = app_data, + PhyloSelect = PhyloSelect, + acc2Name = acc_to_name_value, + rval_phylo = rval_phylo, + msa_reduce_by = msa_reduce_by + ) + + # Render RMarkdown report + rmarkdown::render(tempReport, + output_file = output_file, + params = params, + envir = new.env(parent = globalenv())) + browseURL(output_file) + }, error = function(e) { + return(paste("Error in report generation:", e$message)) + }) + + return("Initialization, upload, and + report generation completed successfully.") +} + diff --git a/R/ipr2viz.R b/R/ipr2viz.R index e582ab09..51650761 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -79,7 +79,7 @@ themeGenes2 <- function() { #' n = 20, query = "specific_query_name") #' } getTopAccByLinDomArch <- function(infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", lin_col = "Lineage_short", n = 20, query) { @@ -94,14 +94,14 @@ getTopAccByLinDomArch <- function(infile_full, cln_domarch <- cln %>% select(domarch_cols) col_counts <- colSums(is.na(cln_domarch)) DA_sym <- sym(names(which.min(col_counts))) - showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations")) + # showNotification(paste0("Selecting representatives by unique ", DA_sym, " and 
lineage combinations")) ## Group by Lineage, DomArch and reverse sort by group counts grouped <- cln %>% group_by({{ DA_sym }}, {{ lin_sym }}) %>% arrange(desc(PcPositive)) %>% summarise(count = n(), AccNum = dplyr::first(AccNum)) %>% arrange(-count) %>% - filter({{ lin_sym }} != "" && {{ DA_sym }} != "") + filter({{ lin_sym }} != "" & {{ DA_sym }} != "") top_acc <- grouped$AccNum[1:n] top_acc <- na.omit(top_acc) return(top_acc) @@ -180,14 +180,14 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = MolEvolvR::iprscan_cols) ipr_out <- ipr_out %>% filter(.data$Name %in% accessions) analysis_cols <- paste0("DomArch.", analysis) - infile_full <- infile_full %>% select(.data$analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum) + infile_full <- infile_full %>% select(analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum) ## To filter by Analysis analysis <- paste(analysis, collapse = "|") ## @SAM: This can't be set in stone since the analysis may change! ## Getting top n accession numbers using getTopAccByLinDomArch() top_acc <- getTopAccByLinDomArch( infile_full = infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", ## @SAM, you could pick by the Analysis w/ max rows! lin_col = "Lineage_short", n = topn, query = query @@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis_labeler <- analyses %>% pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis) - lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv" + lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc @@ -360,7 +360,7 @@ plotIPR2VizWeb <- function(infile_ipr, ## @SAM, colnames, merges, everything neeeds to be done now based on the ## combined lookup table from "common_data" - lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv" + lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) ## Read IPR file and subset by Accessions diff --git a/R/networks_domarch.R b/R/networks_domarch.R index ae6fe8be..b171b3a8 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -24,20 +24,20 @@ #' A network of domains is returned based on shared domain architectures. #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. 
Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Character. Color to represent the queried domain in the +#' @param query_color Character. Color to represent the queried domain in the #' network. #' #' @importFrom dplyr across add_row all_of distinct filter mutate pull select -#' @importFrom igraph delete_vertices graph_from_edgelist vertex +#' @importFrom igraph delete_vertices graph_from_edgelist vertex V #' @importFrom purrr map #' @importFrom rlang sym #' @importFrom shiny showNotification @@ -174,21 +174,21 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c if ("X" %in% V(g)$name) { g <- delete_vertices(g, "X") } - V(g)$size <- as.numeric(wc[V(g)$name]) + igraph::V(g)$size <- as.numeric(wc[igraph::V(g)$name]) - V(g)$size <- (V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 20 + 10 # scaled by degree + igraph::V(g)$size <- (igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 20 + 10 # scaled by degree # setting vertex color by size - V(g)$color <- rainbow(5, alpha = .5)[round((V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 4 + 1)] - V(g)$frame.color <- V(g)$color + igraph::V(g)$color <- rainbow(5, alpha = .5)[round((igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 4 + 1)] + igraph::V(g)$frame.color <- igraph::V(g)$color # scaling edge width - E(g)$width <- e.sz - E(g)$width <- ifelse(log(E(g)$width) == 0, .3, log(E(g)$width)) + igraph::E(g)$width <- e.sz + igraph::E(g)$width <- ifelse(log(igraph::E(g)$width) == 0, .3, log(igraph::E(g)$width)) # coloring edges by width ew <- c(2.7, 4.5) - E(g)$color <- sapply( - E(g)$width, - function(x) if (x >= ew[1] && x <= ew[2]) E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) E(g)$color <- adjustcolor("maroon", alpha.f = .5) else E(g)$color <- "gray55" + igraph::E(g)$color <- sapply( + igraph::E(g)$width, + function(x) if (x >= ew[1] && x <= ew[2]) igraph::E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) igraph::E(g)$color <- adjustcolor("maroon", alpha.f = .5) else igraph::E(g)$color <- "gray55" ) vis_g <- visIgraph(g, type = "full") } @@ -211,7 +211,7 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c visOptions(highlightNearest = TRUE) }, error = function(e) { - showNotification(toString(e)) + # showNotification(toString(e)) vis_g <- "error" }, finally = { @@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying the domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. 
Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Color that the nodes of the domains in the +#' @param query_color Color that the nodes of the domains in the #' domains_of_interest vector are colored -#' @param partner_color Color that the nodes that are not part of the +#' @param partner_color Color that the nodes that are not part of the #' domains_of_interest vector are colored #' @param border_color Color for the borders of the nodes. #' @param IsDirected Is the network directed? Set to false to eliminate arrows diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index 75cc375d..6e2e8fd7 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -676,7 +676,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { id = accessions_partitioned[[x]], db = "protein", rettype = "fasta", - api_key = Sys.getenv("ENTREZ_API_KEY") + # api_key = Sys.getenv("ENTREZ_API_KEY") ) ) }) @@ -809,9 +809,9 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { abort("Error: The FASTA file does not exist: ", fasta_file) } - if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { - abort("Error: The specified file is not a valid FASTA file: ", fasta_file) - } + # if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { + # abort("Error: The specified file is not a valid FASTA file: ", fasta_file) + # } fasta <- readAAStringSet(fasta_file) aligned <- switch(tool, @@ -869,11 +869,11 @@ writeMSA_AA2FA <- function(alignment, outpath) { abort("Error: The output directory does not exist: ", outdir) } - l <- length(rownames(alignment)) + l <- length(names(unmasked((alignment)))) fasta <- "" for (i in 1:l) { - fasta <- paste0(fasta, paste(">", rownames(alignment)[i]), "\n") + fasta <- paste0(fasta, paste(">", names(unmasked((alignment)))[i]), "\n") seq <- toString(unmasked(alignment)[[i]]) fasta <- paste0(fasta, seq, "\n") } @@ -955,4 +955,4 @@ getAccNumFromFA <- function(fasta_file) { # cfile <- read_delim("data/alignments/pspc.gismo.aln", delim=" ") # cfile <- as.data.frame(map(cfile,function(x) gsub("\\s+", "",x))) # colnames(cfile) <- c("AccNum", "Alignment") -# } +# } \ No newline at end of file diff --git a/R/run_pipeline.R b/R/run_pipeline.R new file mode 100644 index 00000000..fe1fa4c9 --- /dev/null +++ b/R/run_pipeline.R @@ -0,0 +1,1343 @@ +# Author(s): Awa Synthia +# Last modified: 2024 + +# import libs +#' @import readr data.table httr rentrez iprscanr + +getSeqs <- function(sequences, + acc_file_path = "accs.txt", + dir_path = "~", + separate = TRUE) { + seqs <- readAAStringSet(sequences) + cln_names <- c() + for (accnum in names(seqs)) { + if (grepl("\\|", accnum)) { + accnum_cln <- strsplit(accnum, "\\|")[[1]][2] + accnum_cln <- strsplit(accnum_cln, " ")[[1]] + } else { + accnum_cln <- strsplit(accnum, " ")[[1]][1] + } + cln_names <- append(cln_names, accnum_cln) + write(accnum_cln, file = acc_file_path, append = TRUE) + if (separate) { + write(paste0(dir_path, "/", accnum_cln, ".faa"), + file = "input.txt", append = TRUE) + write(paste0(">", accnum_cln), + file = paste0(accnum_cln, ".faa"), append = TRUE) + write(toString(seqs[accnum]), + file = paste0(accnum_cln, ".faa"), append = TRUE) + } + } + names(seqs) <- cln_names + writeXStringSet(seqs, sequences, format = "fasta") + return(length(seqs)) +} +runFull <- function( + dir = "/data/scratch", + DB = Sys.getenv("BLAST_DB", unset = "refseq"), + NHITS = Sys.getenv("BLAST_HITS", unset = 100), + EVAL = Sys.getenv("BLAST_EVALUE", unset = 0.00001), 
+ sequences = "~/test.fa", + phylo = "FALSE", + by_domain = "FALSE", + domain_starting = "~/domain_seqs.fa", + type = "full", + job_code=NULL, + submitter_email=NULL, + advanced_options=NULL, + get_slurm_mails=FALSE, + APPLICATION +) { + # Set working directory + setwd(dir) + + advanced_options_names <- names(advanced_options[advanced_options == TRUE]) + + # Write job submission params to file + job_args <- list( + submission_type = type, + database = ifelse(phylo == FALSE, DB, NA), + nhits = ifelse(phylo == FALSE, NHITS, NA), + evalue = ifelse(phylo == FALSE, EVAL, NA), + submitter_email = submitter_email, + advanced_options = advanced_options_names, + job_code = job_code + ) + yml <- yaml::as.yaml(job_args) + write(yml, "job_args.yml") + + # Create a log file + write("START_DT\tSTOP_DT\tquery\tdblast\tacc2info\tdblast_cleanup\tacc2fa + \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2DomArch\tduration", + "logfile.tsv") + + # Process sequences (local handling) + if (phylo == "FALSE") { + # Split the sequences if needed, store them locally + num_seqs <- getSeqs(sequences, dir_path = dir, separate = TRUE) + + fasta <- Biostrings::readAAStringSet(sequences) + headers_original <- names(fasta) + headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x)) + + # Execute BLAST locally (instead of submitting jobs to the cluster) + for (i in 1:num_seqs) { + # Assume each sequence is saved separately + input_file <- paste0(dir, "/", headers_accnum[i], ".faa") + # output_file <- paste0(dir, "/blast_output_", i, ".txt") + + # Construct the local BLAST command (make sure 'blastn' is available locally) + runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i, APPLICATION = APPLICATION) + + #cmd <- sprintf( + # "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote", + # input_file, DB, output_file, NHITS, EVAL + #) + + # Execute BLAST locally + #system(cmd) + + cat(sprintf("BLAST for sequence %d completed.\n", i), file=stderr()) + } + } else { + # Handle phylogenetic analysis if needed + cat("Phylogenetic analysis is not supported in the local version yet.\n") + } + + # Simulate query run locally + runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type, APPLICATION = APPLICATION) + # cmd_query <- sprintf( + # "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote", + # sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL + # ) + # system(cmd_query) + + cat("Query analysis completed.\n") + + # Status update + num_runs <- num_seqs + 1 + write(paste0("0/", num_runs, " analyses completed"), "status.txt") + + cat("All analyses completed locally.\n") +} + +# Define the main pipeline function +runMolevolvrPipeline <- function(input_paths, db, nhits, eval, + is_query, type, i, APPLICATION) { + + # Start time + start <- Sys.time() + OUTPATH <- getwd() # Set output directory to the current working directory + + # If IS_QUERY is True, handle query data + if (is_query == TRUE) { + + FILE <- input_paths + PREFIX <- "query_data" + OUTDIR <- file.path(OUTPATH, PREFIX) + dir.create(OUTDIR, showWarnings = FALSE) # Create output directory + + all_accnums_file <- file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")) + file.copy(FILE, all_accnums_file) + + # Parse accession numbers + accnums_file <- file.path(OUTPATH, "query-fasta_header-map.tsv") + parsed_accnums_file <- file.path(OUTDIR, "parsed_accnums.txt") + + if (file.exists(accnums_file)) { + parsed_accnums <- read.table(accnums_file, + header = FALSE, sep 
= "\t", + stringsAsFactors = FALSE) + writeLines(parsed_accnums$V2[-1], parsed_accnums_file) # Skip header + } + + # Copy starting_accs.txt if exists, or fallback to accs.txt + if (file.exists("starting_accs.txt")) { + file.copy("starting_accs.txt", + file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt"))) + } else { + file.copy("accs.txt", + file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt"))) + file.copy(parsed_accnums_file, + file.path(OUTDIR, paste0(PREFIX, ".parsed_accnums.txt"))) + } + + # setwd(OUTDIR) + + # Run acc2info + runAcc2Info(parsed_accnums_file, PREFIX, OUTDIR) + + replaceAccNums(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + file.path(OUTPATH, "query-fasta_header-map.tsv"), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv"))) + + file.copy(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) + + } else { + # Handle homolog data + FILE <- readLines(input_paths)[1] # Assume single file for local usage + F_value <- basename(FILE) + PREFIX <- sub("\\.faa$", "", basename(FILE)) + PREFIX <- gsub(">", "", PREFIX) # Extract prefix from file name + OUTDIR <- file.path(OUTPATH, paste0(PREFIX)) + dir.create(OUTDIR, showWarnings = FALSE) # Create output directory + # setwd(OUTDIR) + + # Run DELTABLAST + runDELTABLAST(input_paths, PREFIX, OUTDIR, db, nhits, eval) + + # Run ACC2FA + convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + PREFIX, OUTDIR) + + # Run ACC2INFO + runAcc2Info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")), + PREFIX, OUTDIR) + + # Clean up BLAST results + cleanupBlast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + PREFIX, F) + + } + + # Sys.sleep(30) + + # Run BLASTCLUST + runCDHIT(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + PREFIX, OUTDIR ) + + # Convert clusters to table + clust2Table(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) + + # Run INTERPROSCAN + runIPRScan2(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + prefix = PREFIX, outdir = OUTDIR, appl = APPLICATION) + + # Run IPR2LIN + ipr2Linear(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX) + + + # Optionally run ipr2DomArch if IS_QUERY is true + if (is_query == TRUE) { + ## perform ipr2DomArch on iprscan results + da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + PREFIX, "NA") + + ## if blast results are provided, call appendIPR + if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | + is.na(PREFIX)) { + print("No blast results provided, moving on.") + } else { + file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv"))) + } + } else { + # perform ipr2DomArch on iprscan results + da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + PREFIX) + + ## if blast results are provided, call appendIPR + if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | + is.na(PREFIX)) { + print("No blast results provided, moving on.") + } else { + appendIPR(ipr_da = da, + blast = file.path(OUTDIR, paste0(PREFIX, ".cln.clust.tsv")), + prefix = PREFIX) + } + } + + # Copy the input fasta file to the output directory + # file.copy(input_paths, OUTDIR) + + # Total run time + dur <- difftime(Sys.time(), start, units = "secs") + cat("\nTotal run time:", dur, "seconds\n") + + # Log the run times + 
logfile_path <- file.path(OUTPATH, "logfile.tsv") + log_data <- data.frame( + START_DT = format(start, "%d/%m/%Y-%H:%M:%S"), + STOP_DT = format(Sys.time(), "%d/%m/%Y-%H:%M:%S"), + PREFIX = PREFIX, + dur = dur + ) + + write.table(log_data, file = logfile_path, sep = "\t", + row.names = FALSE, col.names = FALSE, + append = TRUE, quote = FALSE) +} + +# Define the acc2info function +acc2info <- function(infile, prefix, outdir) { + # Ensure output directory exists + if (!dir.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + } + + outfile <- file.path(outdir, paste0(prefix, ".acc2info.tsv")) + + # Print column names to the output file + # Print column names to outfile + + # Read the input file of accession numbers + acc_nums <- readLines(infile) + + # Random sleep to avoid overloading the server + # Sys.sleep(sample(1:10, 1)) + + # EPOST to get WebEnv and QueryKey + epost_response <- entrez_post(db = "protein", id = acc_nums) + webenv <- epost_response$WebEnv + query_key <- epost_response$QueryKey + # EFETCH to get the document summaries + docsums <- entrez_summary(db = "protein", web_history = epost_response) + + # Check if any atomic values exist in docsums + any_atomic <- FALSE + # Check if any value in the current docsum is atomic + for (docsum in docsums) { + # Check if any value in the current docsum is atomic + if (is.atomic(docsum)) { + any_atomic <- TRUE + break # Exit loop early if an atomic value is found + } + } + + if (any_atomic) { + parsed_data <- data.frame( + AccNum = docsums$oslt$value, + AccNum.noV = docsums$caption, + FullAccNum = docsums$extra, + Description = docsums$title, + Length = docsums$slen, + TaxID = docsums$taxid, + Species = docsums$organism, + SourceDB = ifelse(is.null(docsums$sourcedb), NA, + docsums$sourcedb), + Completeness = ifelse(is.null(docsums$completeness), NA, + docsums$completeness), + stringsAsFactors = FALSE # Avoid factors in data frame + ) + } else { + # If no atomic values exist, use lapply to parse each docsum + parsed_data <- do.call(rbind, lapply(docsums, function(docsum) { + tryCatch({ + data.frame( + AccNum = docsum$oslt$value, + AccNum.noV = docsum$caption, + FullAccNum = docsum$extra, + Description = docsum$title, + Length = docsum$slen, + TaxID = docsum$taxid, + Species = docsum$organism, + SourceDB = ifelse(is.null(docsum$sourcedb), NA, + docsum$sourcedb), + Completeness = ifelse(is.null(docsum$completeness), NA, + docsum$completeness), + stringsAsFactors = FALSE # Avoid factors in data frame + ) + }, error = function(e) { + # Return a data frame filled with NA values in case of an error + return(data.frame( + AccNum = NA, + AccNum.noV = NA, + FullAccNum = NA, + Description = NA, + Length = NA, + TaxID = NA, + Species = NA, + SourceDB = NA, + Completeness = NA, + stringsAsFactors = FALSE + )) + }) + })) + } + + # Parse the fetched data + # Check if we got any data + if (nrow(parsed_data) == 0) { + cat("No data found for accession numbers in NCBI. 
Trying UniProt...\n")
+
+        # Split the accession list into chunks of 100 for the EBI API
+        split_files <- split(acc_nums, ceiling(seq_along(acc_nums) / 100))
+
+        for (i in seq_along(split_files)) {
+            accnum <- paste(split_files[[i]], collapse = ",") # Join with comma
+            url <- paste0("https://www.ebi.ac.uk/proteins/api/proteins?accession=",
+                          accnum)
+
+            response <- GET(url, accept("application/xml"))
+
+            if (status_code(response) == 200) {
+                xml_content <- content(response, as = "text")
+
+                # Parse the XML response
+                parsed_uniprot <- xml2::read_xml(xml_content)
+
+                # Extract required elements (customize based on XML structure)
+                entries <- xml2::xml_find_all(parsed_uniprot, "//entry")
+
+                for (entry in entries) {
+                    accession <- xml2::xml_text(xml2::xml_find_first(entry,
+                                                                     ".//accession"))
+                    full_name <- xml2::xml_text(xml2::xml_find_first(entry,
+                                                                     ".//fullName"))
+                    length_seq <- xml2::xml_attr(xml2::xml_find_first(entry,
+                                                                      ".//sequence"),
+                                                 "length")
+                    db_reference <- xml2::xml_attr(xml2::xml_find_first(entry,
+                                                                        ".//dbReference"),
+                                                   "id")
+                    dataset <- xml2::xml_attr(entry, "dataset")
+                    name <- xml2::xml_text(xml2::xml_find_first(entry, ".//name"))
+
+                    # Create a row of data
+                    new_row <- data.frame(AccNum = accession,
+                                          AccNum.noV = gsub("\\|", "", accession),
+                                          FullAccNum = accession,
+                                          Description = full_name,
+                                          Length = as.integer(length_seq),
+                                          TaxID = db_reference,
+                                          Species = name,
+                                          SourceDB = dataset,
+                                          Completeness = "NA", # Adjust as needed
+                                          stringsAsFactors = FALSE)
+
+                    # Append to the outfile; write the header only once
+                    # (col.names = TRUE on every append would repeat it per row)
+                    write.table(new_row, file = outfile, sep = "\t",
+                                col.names = !file.exists(outfile),
+                                row.names = FALSE, quote = FALSE, append = TRUE)
+                }
+            }
+        }
+    } else {
+        # Append NCBI data to outfile
+        write.table(parsed_data, file = outfile, sep = "\t", col.names = TRUE,
+                    row.names = FALSE, quote = FALSE, append = TRUE)
+    }
+
+    cat("Data saved to:", outfile, "\n")
+}
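+
+# Example (not run): a minimal sketch of looking up metadata for a couple of
+# accessions; the accession values and file/dir names here are illustrative.
+# writeLines(c("WP_000000001.1", "WP_000000002.1"), "accs.txt")
+# acc2info("accs.txt", prefix = "query_data", outdir = "out")
+# out/query_data.acc2info.tsv then holds one row per accession with AccNum,
+# Description, Length, TaxID, Species, SourceDB and Completeness columns.
+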
+acc2InfoPhylo <- function(infile, outdir) {
+    # Ensure output directory exists
+    if (!dir.exists(outdir)) {
+        dir.create(outdir, recursive = TRUE)
+    }
+
+    outfile <- file.path(outdir, "acc2info.tsv")
+
+    # Read the input file of accession numbers (this read was missing,
+    # leaving `acc_nums` undefined in the loop below)
+    acc_nums <- readLines(infile)
+
+    # Create a data frame to store results
+    results <- data.frame(
+        Caption = character(),
+        Extra = character(),
+        Title = character(),
+        Length = character(),
+        TaxID = character(),
+        Organism = character(),
+        SourceDB = character(),
+        Completeness = character(),
+        stringsAsFactors = FALSE
+    )
+
+    # Process accession numbers one at a time to avoid overloading the server
+    for (acc in acc_nums) {
+        # Fetch the version 2.0 document summary for this accession
+        # (esummary, not efetch, returns the <DocumentSummary> nodes parsed below)
+        response <- GET(sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&version=2.0&id=%s",
+                                acc))
+
+        # Check if the request was successful
+        if (response$status_code == 200) {
+            # Parse the XML response
+            parsed_xml <- read_xml(content(response, "text"))
+
+            # Extract relevant information
+            doc_summary <- xml_find_all(parsed_xml, ".//DocumentSummary")
+
+            for (summary in doc_summary) {
+                results <- rbind(results, data.frame(
+                    Caption = xml_text(xml_find_first(summary, ".//Caption")),
+                    Extra = xml_text(xml_find_first(summary, ".//Extra")),
+                    Title = xml_text(xml_find_first(summary, ".//Title")),
+                    Length = xml_text(xml_find_first(summary, ".//Slen")),
+                    TaxID = xml_text(xml_find_first(summary, ".//TaxId")),
+                    Organism = xml_text(xml_find_first(summary, ".//Organism")),
+                    SourceDB = xml_text(xml_find_first(summary, ".//SourceDb")),
+                    Completeness = xml_text(xml_find_first(summary, ".//Completeness")),
+                    stringsAsFactors = FALSE
+                ))
+            }
+        } else {
+            warning(sprintf("Failed to fetch data for accession %s: %s",
+                            acc, response$status_code))
+        }
+    }
+
+    # Write results to the output file
+    write.table(results, file = outfile, sep = "\t", row.names = FALSE,
+                quote = FALSE)
+}
+
+# Main function to run based on the prefix
+runAcc2Info <- function(infile, prefix, outdir) {
+    if (prefix == "NA") {
+        acc2InfoPhylo(infile, outdir)
+    } else {
+        acc2info(infile, prefix, outdir)
+    }
+}
+
+subsAccnum4cc2Info <- function(df_acc2info, df_header_map) {
+    df_result <- df_header_map |>
+        # set column name in header map to match accnum col in acc2info
+        dplyr::rename(AccNum = header_accnum) |>
+        # join onto header map
+        dplyr::left_join(df_acc2info, by = "AccNum") |>
+        # deselect accnum
+        dplyr::select(-AccNum) |>
+        # set the accnum col to the cleaned form
+        dplyr::rename(AccNum = header_clean) |>
+        # rm excess columns from header map file
+        dplyr::select(-header_original)
+    return(df_result)
+}
+
+replaceAccNums <- function(path_acc2info,
+                           path_query_header_map, path_out) {
+
+    # Read the input files
+    df_acc2info <- read_tsv(path_acc2info)
+    df_query_header_map <- read_tsv(path_query_header_map)
+
+    # Substitute accession numbers
+    df_acc2info_substituted <- subsAccnum4cc2Info(df_acc2info,
+                                                  df_query_header_map)
+
+    # Print the substituted dataframe
+    print("### df_acc2info_substituted")
+    print(df_acc2info_substituted)
+
+    # Write the substituted dataframe to the output file
+    write_tsv(df_acc2info_substituted, file = path_out, col_names = TRUE)
+}
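+
+# Example (not run): a minimal DELTABLAST invocation via the wrapper below.
+# Paths are illustrative; the BLAST+ 'deltablast' binary must be on the PATH,
+# and '-remote' runs the search at NCBI, so no local database is needed.
+# runDELTABLAST("out/WP_000000001.faa", prefix = "WP_000000001",
+#               outdir = "out", db = "refseq_protein",
+#               nhits = 100, evalue = 1e-5)
+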
+runDELTABLAST <- function(infile, prefix, outdir,
+                          db = "refseq_protein",
+                          nhits = 5000, evalue = 1e-5,
+                          threads = 10) {
+
+    # Prepare output file path
+    # outfile <- file.path(outdir, paste0(prefix, ".dblast.tsv"))
+
+    outfile <- paste0(outdir, "/", prefix, ".dblast.tsv")
+
+    # Print I/O messages
+    cat("\nNow processing:", infile, "\n")
+    cat("Running against:", db, "\n")
+    cat("E-value: ≤", evalue, "\n")
+    cat("Top", nhits, "hits/alignments\n")
+    cat("Output filepath:", outfile, "\n")
+
+    # Designating database based on user input; 'refseq' is accepted as an
+    # alias so the runFull() default ("refseq") maps to refseq_protein
+    dblast_db <- switch(db,
+                        nr = "nr",
+                        refseq = "refseq_protein",
+                        refseq_protein = "refseq_protein",
+                        stop("Invalid database specified.
+                             Choose either 'nr' or 'refseq'."))
+
+    # Core command for running DELTABLAST
+    command <- sprintf(
+        "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %s -outfmt '6 qacc sacc sseqid sallseqid stitle sscinames staxids pident length mismatch gapopen qstart qend qlen sstart send slen evalue bitscore ppos' -remote",
+        infile, dblast_db, outfile, nhits, evalue
+    )
+    # Run the DELTABLAST command
+    system(command, intern = TRUE)
+
+    cat("DELTABLAST completed.\n")
+}
+
+# This script converts AccNum to Fasta using NCBI's EDirect or EBI's API
+convertAccNum2Fasta <- function(infile, prefix, outdir) {
+
+    # Create the output file path
+    outfile <- file.path(outdir, paste0(prefix, ".all_accnums.fa"))
+
+    # Print status
+    cat("\n####################\n")
+    cat("BEGIN EDIRECT SEARCH\n")
+    cat("####################\n")
+    cat("Processing input file:", infile, "\n")
+
+    # Create temporary files based on the number of columns in input file
+    temp_acc_file <- file.path(outdir, paste0(prefix, ".all_accnums.txt"))
+    cols <- length(strsplit(readLines(infile, n = 1), "\t")[[1]])
+
+    if (cols > 1) {
+        # Extract unique homolog accession numbers from the 2nd column
+        cat("Creating temp file with 2nd column unique accessions...\n")
+        system(sprintf("awk -F '\\t' '{ print $2 }' %s | sort -u > %s",
+                       infile, temp_acc_file))
+    } else {
+        # Extract unique homolog accession numbers from the 1st column
+        cat("Creating temp file with 1st column unique accessions...\n")
+        system(sprintf("awk -F '\\t' '{ print $1 }' %s | sort -u > %s",
+                       infile, temp_acc_file))
+    }
+
+    # Split accessions into chunks of 1000
+    system(sprintf("split -l 1000 -e %s %s/acc", temp_acc_file, outdir))
+
+    # Fetch FASTA sequences
+    cat("\nObtaining FASTA files\n")
+    acc_files <- list.files(path = outdir, pattern = "^acc", full.names = TRUE)
+
+    for (x in acc_files) {
+        # Accession numbers in a comma-separated format
+        accnum <- paste(readLines(x), collapse = ",")
+        fasta_sequence <- entrez_fetch(db = "protein", id = accnum,
+                                       rettype = "fasta")
+        # Append each batch; writeLines() overwrote the file on every pass
+        cat(fasta_sequence, file = outfile, append = TRUE)
+    }
+
+    # Check if any sequences were retrieved
+    num_seqs <- as.numeric(system(sprintf("grep '>' %s | wc -l", outfile),
+                                  intern = TRUE))
+
+    # If no sequences found, try fetching from EBI API
+    if (num_seqs < 1) {
+        cat("\nNo sequences retrieved. 
Trying EBI API...\n") + for (x in acc_files) { + accnum <- paste(readLines(x), collapse = ",") + system(sprintf( + "curl -X GET --header 'Accept:text/x-fasta' 'https://www.ebi.ac.uk/proteins/api/proteins?accession=%s' >> %s", + accnum, outfile + )) + + } + } + + # Remove temporary files + cat("\nRemoving temporary files\n") + system(sprintf("rm %s/acc*", outdir)) + + # Print end message + cat("#####################\n") + cat("END OF EDIRECT SEARCH\n") + cat("#####################\n") +} + +cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) { + + outdir <- dirname(infile_blast) + # Load and clean acc2info file + acc2info_out <- fread(input = acc2info, sep = "\t", + header = TRUE, fill = TRUE) %>% + mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>% + mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum)) + + # If using web-blast output + if (wblast == TRUE) { + blast_out <- fread(input = infile_blast, sep = "\t", header = FALSE, + col.names = web_blastp_hit_colnames, fill = TRUE) + cleanedup_blast <- blast_out %>% + mutate(AccNum = gsub("\\|", "", AccNum)) %>% + mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>% + mutate(PcIdentity = round(as.double(PcIdentity), 2)) + + # Merge cleaned blast output with acc2info + cleanedup_blast <- merge(cleanedup_blast, acc2info_out, + by.x = "AccNum", + by.y = "FullAccNum", + all.x = TRUE) + names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species" + + # Additional calculations for PcPositive + cleanedup_blast <- cleanedup_blast %>% + mutate(PcPosOrig = as.numeric(PcPosOrig)) %>% + mutate(AlnLength = as.numeric(AlnLength)) %>% + mutate(PcPositive = PcPosOrig) + + } else if (wblast == FALSE) { + # If using classic-blast output + blast_out <- read_tsv(file = infile_blast, col_names = cl_blast_colnames) + cleanedup_blast <- blast_out %>% + mutate(AccNum = gsub("\\|", "", AccNum)) %>% + mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>% + mutate(Species = gsub(";.*$", "", Species)) %>% + mutate(PcIdentity = round(PcIdentity, 2)) %>% + mutate(PcPositive = round((PcPosOrig * AlnLength / QLength), digits = 2)) + + # Merge cleaned blast output with acc2info + cleanedup_blast <- merge(cleanedup_blast, acc2info_out, + by.x = "AccNum", + by.y = "FullAccNum", + all.x = TRUE) %>% + select(-Species.x, -TaxID.x) + names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species" + } + + # TaxID to lineage mapping + cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID) + lineage_map <- fread( + system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE), + header = TRUE, fill = TRUE, colClasses = lineage_map_cols) + + # Merge with lineage map and clean up columns + mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID", + all.x = TRUE) %>% + mutate(Species = Species.y, Spp.blast = Species.x) %>% + select(any_of(cl_blast_postcln_cols)) + + # Add names and prepare the output + blast_names <- addName(mergedLins) + + # Create output file name + file_name <- file.path(outdir, paste0(prefix, ".blast.cln.tsv")) + + # Write cleaned data to file + write_tsv(blast_names, file_name, col_names = TRUE) +} + +# Function to run BLASTCLUST on given input +runCDHIT <- function(infile, suffix, outdir) { + + # Prepare output file path + outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv")) + input_file <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv.clstr")) + + + # Print process messages + cat("\n#####################################\n") + cat("## Now running BLASTCLUST on file(s):", 
infile, "\n") + cat("#####################################\n") + + # Cluster sequences w/ BLASTCLUST/CD-HIT + # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8") + cdhit_command <- sprintf( + "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8", + infile, outfile + ) + # clean_cdhit_format <- sprintf( + # "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", + # input_file, + # outfile + # ) + + cat("\nPerforming BLASTCLUST analysis on", infile, "\n") + # system(blastclust_cmd) + + system(cdhit_command) + # system(clean_cdhit_format) + cleanCDHIT(input_file, outfile) + cat("Cleaning CDHIT results completed") +} + +# extract_sequences("input_file.tsv.clstr", "output_file.tsv") +cleanCDHIT <- function(input_file, output_file) { + # Read the input file line by line + lines <- readLines(input_file) + + # Initialize an empty vector to store extracted identifiers + extracted <- c() + + # Loop through lines and extract content between ">" and "..." + for (line in lines) { + # Only process lines containing valid sequences (skip cluster headers) + if (grepl(">", line) && grepl("\\.\\.\\.", line)) { + # Extract the sequence identifier between ">" and "..." + match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line)) + if (length(match) > 0) { + sequence <- gsub("^>", "", match) # Remove the leading ">" + sequence <- gsub("\\.\\.\\.$", "", sequence) # Remove trailing dots + extracted <- c(extracted, sequence) + } + } + } + + # Write the extracted sequences to the output file + writeLines(extracted, output_file) + + cat("Sequences extracted and written to:", output_file, "\n") +} + +# Function to format blastclust output +clust2Table <- function(clust, blast) { + clust_out <- read_tsv(file = clust, col_names = F) + blast_out <- read_tsv(file = blast, col_names = T) + ## Count the number of accession numbers in a cluster + # Counting number of spaces between acc. no. 
+1
+    clust_out$NumAccs <- map_dbl(.x = clust_out$X1, function(x) {
+        (str_count(string = x, pattern = " ") + 1)
+    })
+    ## Create empty vectors to store information
+    empty_vec <- c("ClusterID")
+    empty_vec2 <- c("RowNum")
+    ## Adding empty vectors to dataframe
+    clust_out[, empty_vec] <- NA
+    clust_out[, empty_vec2] <- NA
+    ## Counting number of rows to add to the RowNum column -- used for creating cluster name
+    rows <- as.numeric(rownames(clust_out)) %>%
+        str_pad(width = 4, pad = 0)
+    ## Add row number info to dataframe
+    clust_out[, "RowNum"] <- rows
+    # Name columns
+    colnam <- list("AccNum", "NoAccs", "ClusterID", "NumRows")
+    ## Create cluster name from row number (num of clusters) and num of accessions
+    clust_out <- clust_out %>%
+        `colnames<-`(colnam) %>%
+        mutate(ClusterID = paste0(NumRows, ".", NoAccs))
+
+    # Store data frame column as vector
+    myvar <- c("AccNum", "ClusterID")
+    ## Add only the 2 columns wanted to a new variable
+    clusters <- clust_out[myvar]
+    # Initialize empty data frame
+    new_clust <- data.frame(ClusterID = character(0),
+                            AccNum = character(0),
+                            stringsAsFactors = F)
+
+    ## Assigning each sseqid to a ClusterID
+    # Iterate over dataframe
+    for (i in 1:nrow(clusters)) {
+        # Extract the cluster name for this row
+        cname <- clusters$ClusterID[i]
+        # Split accession numbers by a space
+        # vals is a vector of all accession numbers in a row
+        vals <- clusters$AccNum[i] %>%
+            strsplit(split = " ") %>%
+            unlist()
+
+        # Iterate over each element in vals
+        for (v in vals) {
+            # add each accession num w/ corresponding cluster ID to new df
+            new_clust[nrow(new_clust) + 1, ] <- c(cname, v)
+        }
+    }
+    # blast_out$AccNum <- gsub("^>", "", blast_out$AccNum)
+    # blast_out$AccNum <- paste0(">", blast_out$AccNum)
+    blast_clustnames <- merge(blast_out, new_clust, by = "AccNum")
+
+    for (i in 1:nrow(blast_clustnames)) {
+        blast_clustnames$ClusterID[i] <- str_c(blast_clustnames$ClusterID[i])
+    }
+
+    # One row per cluster: representative (first) protein plus its ClusterID
+    # (the second column was previously passed to as.data.frame() as row.names)
+    first_prot <- data.frame(AccNum = word(clust_out$AccNum),
+                             ClusterID = clust_out$ClusterID,
+                             stringsAsFactors = FALSE)
+
+    ## write the new file as a TSV file
+    newarg <- gsub(".bclust.L[0-9][0-9]S[0-9][0-9].tsv", "", clust)
+    # accnum + clusterID
+    write_tsv(new_clust, file = paste0(newarg, ".clustIDs"), append = F)
+    # first protein from every cluster
+    write_tsv(first_prot, file = paste0(newarg, ".clust_reps"),
+              col_names = F, append = F)
+    # cleaned up blast file + clusterIDs
+    write_tsv(blast_clustnames, file = paste0(newarg, ".cln.clust.tsv"),
+              col_names = T, append = F)
+}
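+
+# Example (not run): clustering the homolog FASTA at >=80% identity and
+# >=60% alignment coverage of the shorter sequence, then mapping each
+# accession to its ClusterID. File names follow the <prefix>.bclust.L60S80.tsv
+# convention used above and are illustrative.
+# runCDHIT("out/WP_000000001.all_accnums.fa", suffix = "WP_000000001",
+#          outdir = "out")
+# clust2Table("out/WP_000000001.bclust.L60S80.tsv",
+#             "out/WP_000000001.blast.cln.tsv")
+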
+# Function to run InterProScan
+runIPRScan2 <- function(query_file, prefix, outdir, appl) {
+
+    # Start InterProScan run
+    cat("\n######################\n")
+    cat("BEGIN INTERPROSCAN RUN\n")
+    cat("Input file:", query_file, "\n")
+    cat("######################\n")
+
+    # Output file path
+    outfile <- file.path(outdir, paste0(prefix, ".iprscan"))
+
+    # Process the input query file with InterProScan
+    cat("Now processing", query_file, "\n")
+
+    # Run InterProScan command
+    # Construct the command
+
+    # get the path to the interproscan.sh script from the environment
+    # variable INTERPROSCAN_CMD, or assume it's on the path if unspecified
+    # iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh")
+    #
+    # command <- paste(
+    #     iprscan_cmd, "-i",
+    #     shQuote(query_file),
+    #     "-b", shQuote(outfile),
+    #     "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
+    #     "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,",
+    #     "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK"
+    # )
+
+    # Run the command
+    # system(command)
+
+    # Submit to the InterProScan web service via {iprscanr} instead of
+    # running a local interproscan.sh
+    submit_ipr(path2seq = query_file,
+               outfolder = outdir,
+               appl = appl,
+               email = "jravilab.msu@gmail.com")
+
+    new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
+                    "SignDesc", "StartLoc", "StopLoc", "Score",
+                    "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+    temp_data <- read_tsv(file.path(outdir, "ipr_joined.tsv"))
+
+    # Drop the last two columns
+    temp_data <- temp_data[, -((ncol(temp_data) - 1):ncol(temp_data))]
+
+    # Strip the trailing 12 characters (the per-sequence file-name suffix)
+    # from file_name to recover the accession number
+    temp_data$file_name <- substr(temp_data$file_name, 1,
+                                  nchar(temp_data$file_name) - 12)
+
+    colnames(temp_data) <- new_header
+
+    write.table(temp_data, file.path(outdir, paste0(prefix, ".iprscan.tsv")),
+                sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+
+    cat("##################\n")
+    cat("END OF IPRSCAN RUN\n")
+    cat("##################\n")
+}
+
+ipr2Linear <- function(ipr, acc2info, prefix) {
+    # read in iprscan results
+    # duplicate rows in iprscan file
+    ipr_in <- read_tsv(ipr, col_names = TRUE) %>%
+        mutate(DB.ID = gsub("G3DSA:", "", DB.ID))
+
+    if (prefix == "query_data") {
+        ipr_in$AccNum <- sub("(.*)([0-9])([0-9])$", "\\1.\\2_\\3", ipr_in$AccNum)
+    }
+
+    acc2info_out <- fread(input = acc2info, sep = "\t", header = T, fill = T) %>%
+        mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>%
+        mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
+
+    # merge ipr file with acc2info file
+    ipr_in <- ipr_in %>%
+        # remove version number and any other suffixes
+        mutate(AccNum.noV = gsub("\\.[0-9].*", "", AccNum))
+
+    ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
+
+    # read in lineage map
+    lineage_map <- fread(
+        system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+        header = TRUE, fill = TRUE)
+
+    # merge ipr+info w/ lineage
+    # both tables have a species column, but only
+    # the lineage_map (y) species column is kept
+    ipr_tax <- ipr_tax %>%
+        mutate(TaxID = as.numeric(TaxID))
+
+    ipr_lin <- left_join(ipr_tax, lineage_map, by = "TaxID") |>
+        mutate(Species = Species.y) %>%
+        select(-Species.x, -Species.y)
+
+    # add lookup table to iprscan file
+    lookup_tbl <- fread(
+        system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE),
+        sep = "\t", header = TRUE, fill = TRUE) %>%
+        distinct()
+    if ("AccNum.x" %in% names(ipr_lin)) {
+        ipr_lin <- ipr_lin %>%
+            # deselect the AccNum.y from the lineage table (y) and set
+            # the AccNum.x (x) from the ipr/acc2info tables to simply 'AccNum'
+            mutate(AccNum = AccNum.x) %>%
+            select(-AccNum.x, -AccNum.y)
+    }
+    # run add_name f(x) on ipr+lineage dataframe
+    ipr_lin <- ipr_lin %>%
+        addName() %>%
+        mutate(Name = gsub("^_", "", Name))
+
+    # add domarch info to iprscan + lineage df, only keep what's in x
+    ipr_cln <- left_join(ipr_lin, lookup_tbl, by = "DB.ID")
+
+    # populate empty description/short name columns
+    for (i in 1:nrow(ipr_cln)) {
+        if ((is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") &&
+            (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-")) {
+
+            ipr_cln$SignDesc[i] <- ipr_cln$IPRDesc[i]
+            # the lookup-table description is still called 'Description.y'
+            # at this point; it is only renamed to 'LookupTblDesc' below
+            if (!is.na(ipr_cln$Description.y[i])) {
+                ipr_cln$ShortName[i] <- ipr_cln$Description.y[i]
+            }
+        }
+        if (is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") {
+            ipr_cln$ShortName[i] <- ipr_cln$SignDesc[i]
+        }
+        if (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-") {
+            ipr_cln$SignDesc[i] <- ipr_cln$ShortName[i]
+        }
+    }
+    # rename unclear/duplicated columns
+    names(ipr_cln)[names(ipr_cln) == "Description.x"] <- "ProteinName"
+    
names(ipr_cln)[names(ipr_cln) == "Description.y"] <- "LookupTblDesc" + # deselect the AccNum.noV from the lineage table (y) and set + # the AccNum.noV.x (x) from the ipr/acc2info tables to simply 'AccNum.noV' + ipr_cln <- ipr_cln |> dplyr::select(-AccNum.noV.y) |> + dplyr::mutate(AccNum.noV = AccNum.noV.x) |> + dplyr::select(-AccNum.noV.x) + # create label column to use in ipr2viz + ipr_cln <- ipr_cln %>% + mutate(Label = strtrim(ShortName, 30)) %>% + mutate(Label = gsub(", .*", "", Label)) %>% + mutate(Label = gsub("C-terminal region of a signal", + "C-term signal peptide", Label)) %>% + mutate(Label = gsub("N-terminal region of a signal", + "N-term signal peptide", Label)) %>% + mutate(Label = gsub("Twin arginine translocation \\(T", + "Tat signal profile", Label)) %>% + mutate(Label = gsub("GLUTATHIONE HYDROLASE PROENZYM", + "GLUTATHIONE HYDROLASE PROENZYME", Label)) %>% + mutate(Label = gsub("N-terminal nucleophile aminohy", + "Ntn hydrolases", Label)) %>% + mutate(Label = gsub("Region of a membrane-bound pro", + "cytoplasmic reg of mem-bound prot", + Label)) %>% + mutate(ShortName = gsub("Region of a membrane-bound protein predicted to be + outside the membrane, in the cytoplasm.", + "cytoplasmic reg of mem-bound prot", ShortName)) + outdir <- dirname(ipr) + # write results to file + write_tsv(ipr_cln, file.path(paste0(outdir, "/",paste0(prefix,".iprscan_cln.tsv")))) +} + +ipr2DomArch <- function(infile_ipr, prefix, + analysis = c( + "PfamA", "SMART", "Phobius", + "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE", + "SuperFamily", "MobiDBLite", "Panther", "Coils" + )) { + # read in cleaned up iprscan results + ipr_in <- read_tsv(infile_ipr, col_names = T, col_types = ipr_cln_cols) + + # split dataframe into unique proteins + x <- split(x = ipr_in, f = ipr_in$AccNum) + + # plan(strategy = "multicore", .skip = T) + + # within each data.table + domarch <- map(x, function(y) { + # domarch <- future_map(x, function(y) { + acc_row <- data.frame(AccNum = y$AccNum[1], stringsAsFactors = F) + DAs <- data.frame(matrix(nrow = 1, ncol = length(analysis))) + DA <- y %>% + group_by(Analysis) %>% + arrange(StartLoc) + i <- 1 + analysis_var <- c( + "Pfam", "SMART", "Phobius", + "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", + "SUPERFAMILY", "MobiDBLite", "PANTHER", "Coils" + ) + for (a in analysis_var) { + a_da <- DA %>% filter(Analysis == a) + if (a == "SignalP_EUK" || a == "SignalP_GRAM_NEGATIVE" || + a == "SignalP_GRAM_POSITIVE") { + var_shortname <- "DB.ID" + } else { + var_shortname <- "ShortName" + } + var_shortname_sym <- sym(var_shortname) + a_da <- a_da %>% + ungroup() %>% + select({{ var_shortname_sym }}) %>% + filter(!is.na({{ var_shortname_sym }})) %>% + filter(!is.null({{ var_shortname_sym }})) %>% + pull(var_shortname) %>% + paste(collapse = "+") + DAs[1, i] <- a_da + i <- (i + 1) + } + + colnames(DAs) <- paste("DomArch", analysis, sep = ".") + return(cbind(acc_row, DAs)) + }) + + # select relevant rows from ipr input to add to domarch + ipr_select <- ipr_in %>% + select(Name, AccNum, Species, TaxID, Lineage, Lineage_long_na, + Lineage_long, Lineage_med, Lineage_short, ProteinName, + SourceDB, Completeness, AccNum.noV) %>% + distinct() + + # combine domarchs to one data frame, merge w/ acc2info + domarch2 <- do.call(rbind.data.frame, domarch) + + domarch_lins <- domarch2 %>% + merge(ipr_select, by = "AccNum", all.x = T) + + # save domarch_lins file + write_tsv(domarch_lins, + file = paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), + append = F, na = "NA" + ) + + # return domarch2 dataframe to 
append to blast results if given + return(domarch2) +} + +## function to add results from ipr2DomArch to blast results +appendIPR <- function(ipr_da, blast, prefix) { + # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining ! + blast_out <- read_tsv(blast, col_names = T) + if ("AccNum.noV" %in% colnames(blast_out)) { + ipr_da <- read_tsv(paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), col_names = T) + blast_ipr <- merge(blast_out, ipr_da, by = "AccNum.noV", all.x = T) + } else { + blast_ipr <- merge(blast_out, ipr_da, by = "AccNum", all.x = T) + } + + write_tsv(blast_ipr, file = paste0(prefix, "/", prefix, ".full_analysis.tsv"), na = "NA") +} + +# Web BLAST output +web_blast_colnames <- c("Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig", + "QSFrames") # specific to "blastx" + +# BLAST Command line +cl_blast_colnames <- c("Query", "SAccNum", "AccNum", + "SAllSeqID", "STitle", "Species", "TaxID", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig", + "PcPositive", "ClusterID") # post-cleanup + +# IPRSCAN (web+command-line) +ipr_colnames <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", + "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc") + +# RPSBLAST +rps_colnames <- c("AccNum", "DBID", "DBSeqID", + "PcIdentity", "PcPosOrig", # Ppos missing + "AlnLength", "Mismatch", + # Q here is Subject; S here is the matching domain; rename! + "SStart", "SEnd", "DStart", "DEnd", + "EValue", "BitScore", "TaxID") # TaxID missing (NA); remove? + +# IPG +ipg_colnames <- c("IPG.ID", "Source", "NucAccNum", + "NucStart", "NucStop", "Strand", + "AccNum", "ProtDesc", + "Species", "SppStrain", "AssemblyID") +# Final ColNames +combo_colnames <- c("Query", "AccNum", "Species", "TaxID", "Lineage", + "PcPositive", "ClusterID", + # "Leaf", # MISSING (useful for all dataviz) + # "AssemblyID", "GeneName", "ProtDesc", # MISSING NOW!?! 
+ "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D", + "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP") + + +## ############ ## +## COLUMN names ## +## ############ ## +## BLAST +############ +## Web-BLAST +############ +## Downloaded as HIT-TABLE csv +# BLASTP and related protein BLASTs +web_blastp_hit_colnames <- c( + "Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig" +) +# BLASTX +web_blastx_colnames <- c( + "Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig", + "QSFrames" +) # specific to "blastx" + +## Downloaded as Descriptions csv +# BLASTP and related protein BLASTs +web_blastp_desc_colnames <- c( + "Description", "Species", "CommonName", "TaxID", + "BitScore", "TotalScore", + "PcQCover", "EValue", "PcIdentity", + "SLen", "AccNum" +) + +##################### +## Command line BLAST +##################### + +# pre-cleanup +cl_blast_colnames <- c( + "Query", "SAccNum", "AccNum", + "SAllSeqID", "STitle", "Species", "TaxID", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig" +) + +# post-cleanup +cl_blast_postcln_cols <- c( + "Query", "AccNum", + "STitle", "Species", "TaxID", "Lineage", "Lineage_long", + "Lineage_long_na", "Lineage_med", "Lineage_short", + "PcPositive", "PcIdentity", "AlnLength", + "SAccNum", "SAllSeqID", + "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig", "QueryName" +) + +########## +## IPRSCAN +########## +# ipr_colnames_orig <- c("AccNum", "Seq_MD5_digest", "SeqLen", "Analysis", +# "DB_ID", "SignDesc", "StartLoc", "StopLoc", "Score", +# "Status", "RunDate", "IPRAcc", "IPRDesc") + +ipr_colnames <- c( + "AccNum", "SeqMD5Digest", "SLength", "Analysis", + "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc" +) + +# post cleanup +########################## +## NEED TO BE REORDERED ## +########################## +ipr_cln_colnames <- c( + "DB.ID", "TaxID", "AccNum.noV", "AccNum", + "SeqMD5Digest", "SLength", "Analysis", "SignDesc", + "StartLoc", "StopLoc", "Score", "Status", "RunDate", + "IPRAcc", "IPRDesc", "FullAccNum", "ProteinName", + "Length", "SourceDB", "Completeness", "Lineage", + "Species", "Name", "ShortName", "LookupTblDesc", + "ID", "Label" +) + +########### +## RPSBLAST +########### +rps_colnames <- c( + "AccNum", "DB.ID", "DBSeqID", + "PcIdentity.Dom", "PcPosOrig.Dom", # "PcPos.Dom", # Ppos missing + "AlnLength", "Mismatch", + "SStart", "SEnd", "DStart", "DEnd", + "EValue", "BitScore", "TaxID" +) # TaxID missing (NA); remove? 
+ +####################### +## IPG and Lineage maps +####################### +ipg_colnames <- c( + "IPG.ID", "Source", "NucAccNum", + "NucStart", "NucStop", "Strand", + "AccNum", "Description", + "Species", "Spp.Strain", "AssemblyID" +) + +################## +## Assembly files +## Genbank, Refseq +################## +assembly_colnames <- c( + "AssemblyID", + "bioproject", "biosample", "wgs_master", # not used + "RefseqCategory", "TaxID", "Spp.TaxID", + "Species", "Spp.Strain", + "isolate", "version_status", # not used + "assembly_level", "release_type", # not used + "GenomeStatus", + "seq_rel_date", "asm_name", "submitter", # not used + "AssemblyID.GBRS", + "paired_asm_comp", "ftp_path", # not used + "excluded_from_refseq", "relation_to_type_material" +) # not used +assembly_sub_colnames <- c( + "TaxID", "Spp.TaxID", "Species", "Spp.Strain", + "RefseqCategory", "GenomeStatus", + "AssemblyID", "AssemblyID.GBRS" +) + +################# +## Lookup tables +## in common_data +################# +lineage_lookup_colnames <- c("TaxID", "Species", "Lineage_long", + "Lineage_long_na", "Lineage_med", + "Lineage_short", "Lineage") +domarch_lookup_colnames <- c("DB.ID", "ShortName", "Description", "ID") +# !! SC and LS will fix other piecemeal files based on these + +###################### +## FINAL UPLOADED DATA +###################### +## Combined data frame that is loaded on to the webapp +combo_colnames <- c( + "Query", "UID", "AccNum", "Species", "TaxID", "Lineage", + "PcPositive", "ClusterID", "QueryName", + # "AssemblyID", "GeneName", "Description", # MISSING NOW!?! + "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D", + "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP", + "DomArch.SMART", "DomArch.TIGR" +) + + +################ +## read tsv colnames +################ +lookup_table_cols <- cols( + DB.ID = col_character(), + ShortName = col_character(), + Description = col_character(), + ID = col_character() +) + +iprscan_cols <- cols( + .default = col_character(), + TaxID = col_double(), + SLength = col_double(), + SignDesc = col_character(), + StartLoc = col_double(), + StopLoc = col_double(), + Score = col_double(), + Status = col_logical(), + IPRAcc = col_character(), + IPRDesc = col_character(), + Length = col_double(), + ShortName = col_character(), + LookupTblDesc = col_character(), + ID = col_character(), + Label = col_character() +) + +ipr_cln_cols <- cols( + .default = col_character(), + TaxID = col_double(), + SLength = col_double(), + StartLoc = col_double(), + StopLoc = col_double(), + Score = col_double(), + Status = col_logical(), + IPRAcc = col_logical(), + IPRDesc = col_logical(), + Length = col_double(), + ID = col_logical() +) + +lineage_map_cols <- c( + "double", + "character", + "character", "character", "character", "character", "character" +) diff --git a/R/summarize.R b/R/summarize.R index 371473ce..b1fef89d 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -690,7 +690,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col abort("Error: 'digits' must be a non-negative integer.") } - column <- sym(column) + # column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>% @@ -698,10 +698,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all") col_count <- prot %>% - group_by({{ column }}) %>% + group_by(!!sym(column)) %>% summarise(totalcount = sum(count)) - total 
<- left_join(prot, col_count, by = as_string(column))
+    total <- left_join(prot, col_count, by = column)
 
     sum_count <- sum(total$count)
     total <- total %>%
@@ -901,4 +901,4 @@ findParalogs <- function(prot) {
 # cat("Word counts for broken up domains from DAs and DAs from GCs.
 # \nFor e.g.:
 # DA.doms.wc <- query.sub$DA.doms %>%
-#   words2WordCounts()")
+#   words2WordCounts()")
\ No newline at end of file
diff --git a/R/viz_utils.R b/R/viz_utils.R
new file mode 100644
index 00000000..4697204e
--- /dev/null
+++ b/R/viz_utils.R
@@ -0,0 +1,924 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# import libs
+#' @import dplyr stringr visNetwork DT plotly
+#'
+# Function to generate the InterProScan Visualization
+getIPRGenesVisualization <- function(data, app_data,
+                                     query_iprDatabases = c("PfamA", "Phobius", "TMHMM", "Gene3d"),
+                                     query_iprVisType = "Analysis") {
+
+    # Check if analysis is loaded
+    if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+        stop("Analysis data is not loaded properly or ipr_path is missing.")
+    }
+
+    # If there is no cln_path or PcPositive column is NULL
+    if (length(data@cln_path) == 0 || is.null(data@df$PcPositive)) {
+
+        # Set column name for accessing based on the data
+        n <- if ("name" %in% colnames(data@df)) {
+            "name"
+        } else {
+            "AccNum"
+        }
+        n <- "Name" # Hardcoded to "Name" based on original code
+
+        # Call the `ipr2viz_web` function
+        ipr_plot <- plotIPR2VizWeb(
+            infile_ipr = data@ipr_path,
+            accessions = data@df$Name,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
+            name = n
+        )
+
+    } else {
+
+        # Call the `ipr2viz` function with additional arguments
+        ipr_plot <- plotIPR2Viz(
+            infile_ipr = data@ipr_path,
+            infile_full = data@df,
+            accessions = unique(data@df$Name),
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
+            topn = 20, # This value is hardcoded in the original code
+            query = "All"
+        )
+    }
+
+    # Return the plot object for further use
+    return(ipr_plot)
+}
+
+# Function to generate the domain network layout visualization
+getRSNetworkLayout <- function(data, app_data,
+                               cutoff = 100,
+                               layout = "nice") {
+
+    # Check if analysis is loaded and app data has a valid ipr_path
+    if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+        stop("Analysis not loaded or ipr_path missing.")
+    }
+
+    # Extract column names and find the first matching "DomArch" column
+    cols <- colnames(data@df)
+    if (is.null(cols)) {
+        stop("Dataframe columns are missing.")
+    }
+
+    # indexing an empty match with [1] yields NA, not NULL
+    col <- cols[grepl("DomArch.*", cols)][1]
+    if (is.na(col)) {
+        stop("No domain architecture column found.")
+    }
+
+    # Clean up domain architecture columns in the data
+    df_data <- data@df %>%
+        mutate(across(tidyselect::starts_with("DomArch"), cleanString))
+
+    # Generate the network using the domain_network function
+    res_network <- createDomainNetwork(
+        df_data,
+        column = col,
+        domains_of_interest = ".*",
+        cutoff = cutoff,
+        layout = layout
+    )
+
+    # Validate that the result is not an error
+    if (any(res_network == "error")) {
+        stop("Not enough nodes to construct a network.")
+    }
+
+    return(res_network)
+}
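+
+# Example (not run): building the domain co-occurrence network for a loaded
+# analysis; `data` and `app_data` are illustrative MolEvolData-style objects.
+# net <- getRSNetworkLayout(data, app_data, cutoff = 100, layout = "nice")
+# htmlwidgets::saveWidget(net, "domain-network.html")
+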
+# Function to generate the data table
+getDataTable <- function(data) {
+    if (nrow(data@df) == 0) {
+        stop("No data available.
+             Please ensure you have uploaded your data correctly.")
+    }
+
+    # Define the columns to be shown
+    viewing_cols <- c(
+        "AccNum", "QueryName", "Name", "Lineage", "Species",
+        "Length", "PcPositive", "DomArch.Pfam"
+    )
+
+    # Extract the data
+    d <- data@df
+
+    # Identify columns to hide (those not in viewing_cols)
+    hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+    # Add hyperlinks to the "AccNum" column; the NCBI protein link target is
+    # an assumption -- the original anchor markup was lost from this file
+    d$AccNum <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                       d$AccNum, "' target='_blank'>", d$AccNum, "</a>")
+
+    # Create the DataTable
+    dt <- DT::datatable(d,
+        rownames = FALSE,
+        filter = "top", # Enable filtering
+        extensions = c("Buttons"), # Enable buttons (e.g., CSV export)
+        escape = FALSE, # Allow HTML rendering (for the hyperlinks)
+        options = list(
+            dom = "frlBtip", # Layout for the table (filters, buttons, etc.)
+            pageLength = 10, # Number of rows per page
+            paging = TRUE, # Enable pagination
+            # Enable regex search
+            search = list(caseInsensitive = TRUE, regex = TRUE),
+            language = list(
+                searchPlaceholder = "Regex or Text...",
+                filterPlaceholder = "Test"
+            ),
+            buttons = list(
+                list(
+                    extend = "colvis",
+                    text = "Add/remove column(s)"
+                ),
+                list(
+                    extend = "csv",
+                    text = "Download",
+                    filename = "molevolvr-data",
+                    exportOptions = list(
+                        # Export all data, not just visible page
+                        modifier = list(page = "all")
+                    )
+                )
+            ),
+            scrollX = FALSE, # Disable horizontal scrolling
+            fixedHeader = FALSE, # Disable fixed header
+            columnDefs = list(
+                # Hide columns not in 'viewing_cols'
+                list(visible = FALSE, targets = hide_cols)
+            )
+        )
+    )
+
+    return(dt)
+}
+
+# Function to generate query data table
+getQueryDataTable <- function(query_data, query_select = NULL) {
+
+    # Check if analysis is loaded and data is available
+    if (nrow(query_data@df) == 0) {
+        stop("No data available. Please ensure you have uploaded your data
+             correctly.")
+    }
+
+    # Define the columns to be shown
+    viewing_cols <- c("QueryName", "Species", "Lineage", "DomArch.Pfam")
+
+    # If no specific queries are selected, display all data
+    if (is.null(query_select)) {
+        d <- query_data@df
+    } else {
+        # Filter the data for selected queries
+        d <- query_data@df %>% filter(QueryName %in% query_select)
+        d <- droplevels(d)
+    }
+
+    # Identify columns to hide (those not in viewing_cols)
+    hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+    # Add hyperlinks to the "Query" column (replace 'Query' with the actual
+    # column name if different); as above, the NCBI link target is assumed
+    d$Query <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                      d$Query, "' target='_blank'>", d$Query, "</a>")
+
+    # Create and return the DataTable
+    dt <- DT::datatable(d,
+        rownames = FALSE,
+        filter = "top", # Enable filtering
+        extensions = c("Buttons"), # Enable buttons (e.g., CSV export)
+        callback = DT::JS(c(
+            '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+        )),
+        escape = FALSE, # Allow HTML rendering (for the hyperlinks)
+        options = list(
+            dom = "frlBtip", # Layout for the table (filters, buttons, etc.)
+            pageLength = 25, # Number of rows per page
+            paging = TRUE, # Enable pagination
+            # Enable regex search
+            search = list(caseInsensitive = TRUE, regex = TRUE),
+            language = list(
+                searchPlaceholder = "Regex or Text..."
+ ), + buttons = list( + list( + extend = "colvis", + text = "Add/remove column(s)" + ), + list( + extend = "csv", + text = "Download", + filename = "molevolvr-query", + exportOptions = list( + # Export all data, not just visible page + modifier = list(page = "all") + ) + ) + ), + scrollX = FALSE, # Disable horizontal scrolling + fixedHeader = FALSE, # Disable fixed header + columnDefs = list( + # Hide columns not in 'viewing_cols' + list(visible = FALSE, targets = hide_cols) + ) + ) + ) + + return(dt) +} + +# Function to read and return the FASTA file contents +readFastaData <- function(fasta_path) { + + # Check if analysis is loaded and the file path is not empty + if (fasta_path == "" || !file.exists(fasta_path)) { + stop("FASTA file path is invalid or the file does not exist.") + } + + # Read the content of the FASTA file + fasta_content <- read_file(fasta_path) + + return(fasta_content) +} + +getFastaData <- function(fasta_path) { + if (is.null(fasta_path) || fasta_path == "") { + stop("Error: FASTA path is not provided.") + } + return(read_file(fasta_path)) # Read and return the FASTA data +} + +# Function to get domain sequences (assumes `data` is a predefined object) +getDomData <- function(data) { + return(data@domainSeqs) # Return domain sequences +} + +# Function to get MSA data from a given path +getMSAData <- function(msa_path) { + if (is.null(msa_path) || msa_path == "") { + stop("Error: MSA path is not provided.") + } + # Attempt to read the file and handle potential errors + if (file.exists(msa_path)) { + return(read_file(msa_path)) + } else { + warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path)) + return(NULL) # Return NULL if the file cannot be read + } +} + +# Function to generate a heatmap +getQueryHeatmap <- function(query_data_df, + heatmap_select = "All", + heatmap_color = "blue") { + + # Check if analysis is loaded and query data exists + if (nrow(query_data_df) == 0) { + stop("No query data available.") + } + + # Filter queries based on user selection + if (heatmap_select == "All") { + queries <- unique(query_data_df$QueryName) + prot <- query_data_df %>% + filter(Lineage != "") %>% + tidyr::drop_na(Lineage) + } else { + queries <- heatmap_select + prot <- query_data_df %>% + filter(grepl(heatmap_select, QueryName, ignore.case = TRUE)) %>% + filter(Lineage != "") %>% + tidyr::drop_na(Lineage) + } + + # Validate that the Lineage column has values + if (all(unique(prot$Lineage) == "")) { + stop("Lineage column not found for selected proteins. 
+             See the FAQ for possible reasons/solutions.")
+    }
+
+    plotLineageQuery(prot, queries = queries, colname = "QueryName",
+                     cutoff = 100, color = heatmap_color)
+}
+
+# Function to retrieve domain architecture columns
+getQueryDomArchCols <- function(query_data_df) {
+    # Check if query data exists
+    if (nrow(query_data_df) == 0) {
+        stop("No query data available.")
+    }
+
+    # Get column names
+    cols <- colnames(query_data_df)
+
+    # Filter domain architecture columns, excluding repeats
+    domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+    # Identify columns that are completely NA
+    na_cols <- names(query_data_df)[apply(query_data_df, 2,
+                                          function(x) all(is.na(x)))]
+
+    # Remove NA columns from domain architecture columns
+    domarch_cols <- setdiff(domarch_cols, na_cols)
+
+    # Move SignalP_GRAM_POSITIVE to the end if present
+    if ("SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+        domarch_cols <- setdiff(domarch_cols, "SignalP_GRAM_POSITIVE") # Remove first
+        domarch_cols <- append(domarch_cols, "SignalP_GRAM_POSITIVE") # Append it last
+    }
+
+    # Strip the "DomArch." prefix from column names
+    domarch_cols <- substring(domarch_cols, first = 9)
+
+    return(domarch_cols)
+}
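+
+# Example (not run): listing the populated domain-architecture columns for a
+# query table and drawing the query-by-lineage heatmap (inputs illustrative).
+# getQueryDomArchCols(query_data@df)
+# getQueryHeatmap(query_data@df, heatmap_select = "All",
+#                 heatmap_color = "blue")
+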
+# Function to generate main data table
+getMainTable <- function(data_df, main_select = NULL) {
+    # Validate input data
+    if (nrow(data_df) == 0) {
+        stop("No data available. Please ensure you have uploaded your data
+             correctly. See Help documentation or contact JRaviLab
+             (janani.ravi[AT]cuanschutz[DOT]edu).")
+    }
+
+    # Define columns to view
+    viewing_cols <- c("AccNum", "QueryName", "Name", "Lineage",
+                      "Species", "Length", "PcPositive",
+                      "DomArch.Pfam")
+
+    # Filter data based on selection
+    if (!is.null(main_select)) {
+        d <- data_df %>%
+            filter(QueryName %in% main_select) %>%
+            droplevels()
+    } else {
+        d <- data_df
+    }
+
+    # Identify columns to hide
+    hide_cols <- which(!colnames(d) %in% viewing_cols)
+    hide_cols <- hide_cols - 1
+
+    # Create hyperlinks for AccNum; the NCBI protein link target is an
+    # assumption -- the original anchor markup was lost from this file
+    d$AccNum <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                       d$AccNum, "' target='_blank'>", d$AccNum, "</a>")
+
+    # Generate DataTable
+    datatable_output <- DT::datatable(d,
+        rownames = FALSE,
+        filter = "top",
+        callback = DT::JS(c(
+            '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+        )),
+        extensions = c("Buttons"),
+        escape = FALSE,
+        options = list(
+            dom = "frlBtip",
+            pageLength = 25,
+            paging = TRUE,
+            search = list(caseInsensitive = TRUE, regex = TRUE),
+            language = list(searchPlaceholder = "Regex or Text...",
+                            filterPlaceholder = "Test"),
+            buttons = list(
+                list(extend = "colvis", text = "Add/remove column(s)"),
+                list(
+                    extend = "csv",
+                    text = "Download",
+                    filename = "molevolvr-homolog",
+                    exportOptions = list(modifier = list(page = "all"))
+                )
+            ),
+            scrollX = FALSE,
+            fixedHeader = FALSE,
+            columnDefs = list(list(visible = FALSE, targets = hide_cols))
+        )
+    )
+
+    return(datatable_output)
+}
+
+totalCounts <- function(prot, column = "DomArch", lineage_col = "Lineage",
+                        cutoff = 90, RowsCutoff = FALSE, digits = 2
+                        # type = "GC"
+) {
+    # column <- sym(column)
+
+    prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
+        filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
+        filter({{ column }} != "")
+
+    prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
+    col_count <- prot %>%
+        group_by(!!sym(column)) %>%
+        summarise(totalcount = sum(count))
+
+    total <- left_join(prot, col_count, by = column)
+
+    sum_count <- sum(total$count)
+    total <- total %>%
+        mutate("IndividualCountPercent" = totalcount / sum_count * 100) %>%
+        arrange(-totalcount, -count)
+
+    cumm_percent <- total %>%
+        select({{ column }}, totalcount) %>%
+        distinct() %>%
+        mutate("CumulativePercent" = 0)
+    total_counter <- 0
+    for (x in length(cumm_percent$totalcount):1) {
+        total_counter <- total_counter + cumm_percent$totalcount[x]
+        cumm_percent$CumulativePercent[x] <- total_counter / sum_count * 100
+    }
+
+    cumm_percent <- cumm_percent %>% select(CumulativePercent, {{ column }})
+
+    total <- total %>% left_join(cumm_percent, by = as_string(column))
+
+    # Round the percentage columns
+    total$CumulativePercent <- total$CumulativePercent %>% round(digits = digits)
+    total$IndividualCountPercent <- total$IndividualCountPercent %>% round(digits = digits)
+
+    if (RowsCutoff) {
+        # If total counts is being used for plotting based on number of rows,
+        # don't include other observations that fall below the cumulative
+        # percent cutoff, but that have the same 'totalcount' number as the
+        # cutoff observation
+        total <- total %>% filter(CumulativePercent >= 100 - cutoff - .0001)
+        return(total)
+    }
+
+    # Include observations that fall below the cumulative percent cutoff,
+    # but that have the same 'totalcount' as the cutoff observation
+    t <- total %>% filter(CumulativePercent >= 100 - cutoff)
+    # nrow(), not length(): length() of a data frame is its column count
+    if (nrow(t) == 0) {
+        cutoff_count <- 0
+    } else {
+        cutoff_count <- t$totalcount[nrow(t)]
+    }
+
+    total <- total %>%
+        filter(totalcount >= cutoff_count) %>%
+        ungroup()
+
+    return(total)
+}
+
+getDomArchTotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
+    # Check if ipr_path is not empty
+    if (app_data@ipr_path == "") {
+        stop("ipr_path is missing.")
+    }
+
+    # Calculate total counts with the specified cutoff and column
+    prot_tc <- totalCounts(DA_Prot, cutoff = DACutoff, column = DA_col)
+
+    # Replace all instances of ">" in the Lineage column with "_"
+    prot_tc$Lineage <- map(prot_tc$Lineage, ~ str_replace_all(.x, ">", "_")) %>%
+        unlist()
+
+    # Return the processed data
+    prot_tc
+}
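+
+# Example (not run): a sketch of summarising domain-architecture counts per
+# lineage, keeping rows that cover the top 90% of observations; `prot` is an
+# illustrative homolog table with DomArch.Pfam and Lineage columns.
+# counts <- totalCounts(prot, column = "DomArch.Pfam",
+#                       lineage_col = "Lineage", cutoff = 90)
+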
+ getDomArchTotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
+     # Check that ipr_path is not empty
+     if (app_data@ipr_path == "") {
+         stop("ipr_path is missing.")
+     }
+
+     # Calculate total counts with the specified cutoff and column
+     prot_tc <- totalCounts(DA_Prot, cutoff = DACutoff, column = DA_col)
+
+     # Replace all instances of ">" in the Lineage column with "_"
+     prot_tc$Lineage <- str_replace_all(prot_tc$Lineage, ">", "_")
+
+     # Return the processed data
+     prot_tc
+ }
+
+ # Function to generate the Domain Architecture Linear Table
+ getDomArchLinearTable <- function(DA_col, ipr_path, DA_TotalCounts_value) {
+     # Check that ipr_path is valid
+     if (ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Check that DA_col is provided
+     if (is.null(DA_col)) {
+         stop("DA_col input is required.")
+     }
+
+     da_col <- sym(DA_col)
+
+     # Summarize lineage counts per domain architecture; the result must be
+     # assigned so the table below renders the summary
+     DA_counts <- DA_TotalCounts_value %>%
+         group_by({{ da_col }}, totalcount, CumulativePercent) %>%
+         summarize(LineageCount = n()) %>%
+         select({{ da_col }}, LineageCount, totalcount, CumulativePercent) %>%
+         arrange(-totalcount)
+
+     # Generate the DA-lineage count table
+     DAlin_table <- DT::datatable(
+         DA_counts,
+         selection = "single",
+         extensions = c("Buttons"),
+         options = list(
+             pageLength = 25,
+             dom = "frlBtip",
+             buttons = list(
+                 list(
+                     extend = "csv",
+                     text = "Download",
+                     filename = "MolEvolvR_domarch",
+                     exportOptions = list(
+                         modifier = list(page = "all")
+                     )
+                 )
+             ),
+             scrollX = FALSE,
+             paging = TRUE,
+             fixedHeader = FALSE,
+             fixedColumns = list(leftColumns = 0, rightColumns = 0)
+         )
+     )
+
+     return(DAlin_table)
+ }
+
+ # Function to generate the Domain Architecture Lineage Plot
+ getDomArchHeatmapPlot <- function(DA_col, DACutoff,
+                                   DA_Prot, DA_lin_color,
+                                   ipr_path) {
+     # Check that ipr_path is valid
+     if (ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Filter the protein data for plotting
+     prot <- DA_Prot %>%
+         filter(Lineage != "") %>%
+         drop_na(Lineage)
+
+     # Create the plot
+     plot <- plotLineageDA(
+         prot,
+         colname = DA_col,
+         cutoff = DACutoff,
+         RowsCutoff = FALSE,
+         color = DA_lin_color
+     )
+
+     # Convert the ggplot to an interactive plotly object
+     plot <- plotly::ggplotly(plot)
+     plot <- plot %>% plotly::layout(xaxis = list(side = "top"))
+
+     return(plot)
+ }
+
+ # Function to generate the Domain Architecture Network
+ getDomNetwork <- function(DA_col, DACutoff, DA_Prot,
+                           networkLayout, ipr_path) {
+     # Check that ipr_path is valid
+     if (ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Prepare the selected protein data; `[[` takes the column name as a
+     # string, while the dplyr verbs below take it as a symbol
+     dn_data <- DA_Prot
+     col <- sym(DA_col)
+     dn_data[[DA_col]] <- str_replace_all(dn_data[[DA_col]], " ", "_")
+
+     # Filter data based on the specified column
+     dn_data <- dn_data %>%
+         drop_na(!!col) %>%
+         filter(!!col != "")
+
+     # Generate the domain network
+     res <- createDomainNetwork(
+         prot = dn_data,
+         column = DA_col,
+         domains_of_interest = ".*",
+         cutoff = DACutoff,
+         layout = networkLayout
+     )
+
+     # Validate the result
+     if (any(res == "error")) {
+         stop("Not enough nodes to construct a network.
+             Try increasing 'Total Count Cutoff'.")
+     }
+
+     # Add a PNG export button to the network widget
+     res <- res %>% visExport(
+         type = "png",
+         name = "domain_architecture-network",
+         float = "left",
+         label = "Download plot",
+         style = "
+             background-color: #ffffff;
+             color: #333333;
+             border-color: #cccccc;
+             padding: 6px 12px;
+             font-size: 14px;
+             line-height: 1.42857143;
+             border-radius: 4px;
+             cursor: pointer;
+             display: inline-block;
+             margin-bottom: 0;
+             font-weight: normal;
+             text-align: center;
+             vertical-align: middle;
+             touch-action: manipulation;
+             user-select: none;
+             background-image: none;
+             text-decoration: none;
+         "
+     )
+
+     return(res)
+ }
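+
+ # Illustrative usage of getDomNetwork (a sketch; `networkLayout` is passed
+ # straight through to createDomainNetwork(), and `prot`/`app_data` stand in
+ # for the app's objects):
+ #   net <- getDomNetwork(DA_col = "DomArch.Pfam", DACutoff = 90,
+ #                        DA_Prot = prot, networkLayout = "nice",
+ #                        ipr_path = app_data@ipr_path)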
+ # Function to retrieve and clean Domain Architecture data
+ getDomArchProt <- function(app_data, DASelect) {
+     # Check that the InterProScan results path is set
+     if (app_data@ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Check that the data frame has rows
+     if (nrow(app_data@df) <= 0) {
+         stop("No data available. Please ensure you have uploaded your data correctly.
+ See Help documentation or contact JRaviLab (janani.ravi[AT]cuanschutz[DOT]edu).")
+     }
+
+     # Retrieve app data and clean domain architecture columns
+     df_app_data <- app_data@df
+
+     # Domain architecture column cleanup
+     df_app_data <- df_app_data %>%
+         mutate(across(starts_with("DomArch"), cleanString))
+
+     # Filter based on user selection
+     if (DASelect == "All") {
+         return(df_app_data)
+     } else {
+         return(df_app_data %>% filter(grepl(DASelect, QueryName,
+                                             ignore.case = TRUE)))
+     }
+ }
+
+ # Function to retrieve domain architecture columns
+ getDomArchCols <- function(app_data, DASelect) {
+     # Check that the app data frame is not empty
+     if (nrow(app_data@df) <= 0) {
+         stop("No data available in app data.")
+     }
+
+     # Check that ipr_path is valid
+     if (app_data@ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Get column names
+     cols <- colnames(app_data@df)
+
+     # Filter domain architecture columns
+     domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+     # Retrieve the data frame
+     domarch_data <- app_data@df
+
+     # Filter the data frame based on user selection
+     if (DASelect != "All") {
+         domarch_data <- domarch_data %>% filter(QueryName == DASelect)
+     }
+
+     # Identify columns that are completely NA or empty
+     na_cols <- apply(domarch_data, 2, function(x) {
+         all(is.na(x)) || all(x == "")
+     })
+     na_cols <- names(domarch_data)[na_cols]
+
+     # Remove all-NA/empty columns from the domain architecture columns
+     domarch_cols <- setdiff(domarch_cols, na_cols)
+
+     # Ensure SignalP_GRAM_POSITIVE is ordered last if it exists; the columns
+     # still carry the "DomArch." prefix here
+     if ("DomArch.SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+         domarch_cols <- append(domarch_cols[!domarch_cols %in%
+                                             "DomArch.SignalP_GRAM_POSITIVE"],
+                                "DomArch.SignalP_GRAM_POSITIVE")
+     }
+
+     return(domarch_cols)
+ }
+
+ # Function to generate the IPR genes plot
+ getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases,
+                                    query_iprVisType, DASelect) {
+     if (app_data@ipr_path == "") {
+         stop("IPR path is not set.")
+     }
+
+     # Validate the data frame
+     df <- app_data@df
+     if (nrow(df) == 0) {
+         stop("No data available. Please ensure you have uploaded
+             your data correctly.")
+     }
+
+     if (is.null(query_iprDatabases) || length(query_iprVisType) == 0) {
+         stop("Please select an analysis.")
+     }
+
+     if (length(app_data@cln_path) == 0 || is.null(df$PcPositive)) {
+         # Label sequences by the "Name" column
+         name_column <- "Name"
+
+         # Generate the plot using the web version
+         plot <- plotIPR2VizWeb(
+             infile_ipr = app_data@ipr_path,
+             accessions = df$Name,
+             analysis = query_iprDatabases,
+             group_by = query_iprVisType,
+             name = name_column
+         )
+     } else {
+         # Generate the plot using the local version
+         plot <- plotIPR2Viz(
+             infile_ipr = app_data@ipr_path,
+             infile_full = df,
+             accessions = unique(df$Name),
+             analysis = query_iprDatabases,
+             group_by = query_iprVisType,
+             topn = 20,
+             query = DASelect
+         )
+     }
+
+     return(plot)
+ }
+
+ # Function to filter proteins for phylogeny
+ filterPhylogenyProteins <- function(app_data, phylo_select) {
+     # Get the data frame from app_data
+     df <- app_data@df
+
+     # Check if the data frame is empty
+     if (nrow(df) == 0) {
+         stop("No data available in app_data.
+ Please ensure it has been loaded correctly.") + } + + # Filter the data based on user selection + if (phylo_select == "All") { + filtered_df <- df + } else { + filtered_df <- df %>% + filter(grepl(phylo_select, QueryName, ignore.case = TRUE)) %>% + filter(Lineage != "") + } + + return(filtered_df) +} + +# Function to retrieve representative accession numbers +getRepAccNum <- function(app_data, phylo_select, + msa_reduce_by, msa_rep_num, + rval_phylo) { + + if (rval_phylo()) { + return(app_data@df$AccNum) + } else { + switch(msa_reduce_by, + "Species" = rep_acc_species(), + "Lineage" = rep_acc_lineage(), + "DomArch" = { + if (app_data@ipr_path == "") { + stop("IPR path is empty. Please ensure it is set correctly.") + } + seqs <- find_top_acc(infile_full = app_data@df, + n = msa_rep_num, + query = phylo_select) + if (is.null(seqs)) { + stop("No sequences found.") + } + return(seqs) + }, + "Full" = { + tmp <- app_data@df %>% filter(QueryName == phylo_select) + return(tmp$AccNum) + }, + stop("Invalid selection for msa_reduce_by.") + ) + } +} + +getDomArchPlot <- function(ipr_path, query_names, + analysis_type, group_by) { + # Check if the input path is provided + if (is.null(ipr_path) || ipr_path == "") { + stop("Error: Input path for IPR data is not provided.") + } + + # Validate that at least one domain is found + if (length(query_domarch_cols()) < 1) { + stop("Error: No domains found in the input sequences.") + } + + n <- "Name" + plot <- plotIPR2VizWeb( + infile_ipr = ipr_path, + accessions = query_names, + analysis = analysis_type, + group_by = group_by, + name = n + ) + + return(plot) # Return the generated plot +} + +# Function to convert accessions to names +acc2Name <- function(app_data) { + # Check if "AccNum" is a column in the data + if (!("AccNum" %in% colnames(app_data@df))) { + stop("Column 'AccNum' not found in data.") + } + + # Check if "Name" column exists and create the output data frame + if ("Name" %in% colnames(app_data@df)) { + # Select "AccNum" and "Name" + df <- select(app_data@df, "AccNum", "Name") + } else { + # Create a data frame with "AccNum" and assign "AccNum" to "Name" + df <- app_data@df %>% + select("AccNum") %>% + mutate(Name = AccNum) + } + + return(df) +} + +repAccNums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) { + # If `phylo` is true, return all `AccNum` values from `app_data` + if (phylo) { + return(app_data@df$AccNum) + } else { + # Switch based on `msa_reduce_by` value + tmp <- filter(app_data@df) + rep_acc_species <- createRepresentativeAccNum(tmp, reduced = "Species", accnum_col = "AccNum") + + # Get representative accession numbers by "Lineage" + rep_acc_lineage <- createRepresentativeAccNum(tmp, reduced = "Lineage", accnum_col = "AccNum") + switch(msa_reduce_by, + "Species" = rep_acc_species, + "Lineage" = rep_acc_lineage, + "DomArch" = { + # Check if `ipr_path` is not empty + if (app_data@ipr_path == "") { + stop("ipr_path is missing.") + } + # Find top sequences by accession number based on criteria + seqs <- getTopAccByLinDomArch(infile_full = app_data@df, n = msa_rep_num, query = PhyloSelect) + if (is.null(seqs)) { + stop("No sequences found.") + } + return(seqs) + }, + "Full" = { + # Filter data for matching `QueryName` and return `AccNum` + tmp <- app_data@df %>% filter(QueryName == PhyloSelect) + return(tmp$AccNum) + } + ) + } +} + +seqTree <- function(fasta_filepath){ + my_seqs <- readAAStringSet(fasta_filepath) #, format="fasta", seek.first.rec=T) + my_seqs_msa <- msa(my_seqs) + my_seqs_msa_aln <- 
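+     # Convert the msa alignment into seqinr's "alignment" class so that
+     # seqinr::dist.alignment() below can compute pairwise identity distances;
+     # ape::nj() then builds a neighbor-joining tree from that matrix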
msaConvert(my_seqs_msa, type="seqinr::alignment") + + #below was commented out, does it need to change as one of the parameters? the bottom keeps + d <- dist.alignment(my_seqs_msa_aln, "identity") + #as.matrix(d)[2:5, "HBA1_Homo_sapiens", drop=FALSE] + + ## Phylogenetic tree + ## using package ape + ## build neighbor-joining tree + seqTree <- nj(d) + #plot(seqTree, main="Phylogenetic Tree of MSA") + groupInfo <- split(seqTree$tip.label, + gsub("_\\w+", "", seqTree$tip.label)) + seqTree <- groupOTU(seqTree, groupInfo) + # ggtree(seqTree, aes(color=group), + # layout='circular') + + # geom_tiplab(size=1, aes(angle=angle)) + #https://yulab-smu.top/treedata-book/chapter4.html + #offs <- 0 + tree <- ggtree(seqTree, branch.length = "dN_vs_dS") + theme_tree2(axis.line.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) + msaplot(tree, fasta=fasta_filepath, offset=0.5, bg_line = TRUE) + geom_tiplab(align=TRUE, linesize=0.5, size=3) + +} diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd new file mode 100644 index 00000000..ab02bdeb --- /dev/null +++ b/inst/report/report_template.Rmd @@ -0,0 +1,497 @@ +--- +title: MolEvolvR +output: + html_document: + self_contained: yes +params: + rs_interproscan_visualization: default + proximity_network: default + sunburst: default + data: default + queryDataTable: default + fastaDataText: default + heatmap: default + query_data: default + query_domarch_cols: default + query_iprDatabases: default + query_iprVisType: default + mainTable: default + DALinTable: default + DALinPlot: default + DANetwork: default + DA_Prot: default + domarch_cols: default + DA_Col: default + DACutoff: default + da_interproscan_visualization: default + phylo_sunburst_levels: default + phylo_sunburst: default + lineage_table: default + ## Tree + tree_msa_tool: default ##input + ## MSA + repAccNums: default + msa_rep_num: default ##input + app_data: default + PhyloSelect: default ##input + acc2Name: default + rval_phylo: default + query_pin: default + msa_reduce_by: default +--- + +--- +subtitle: "`r glue::glue('Retrieval Code: {params$query_pin}')`" +--- + + + + + +```{r mine,results="asis",engine="js", echo=FALSE} +// stumble through the tabset(s) +setTimeout(function(){ + document.querySelector('a#FullResultButton1').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#domain-architecture-2"]').click(); //Second(-2) Domain Architecture Section (reuse at different heading levels) + } +}, 100); // slow your roll + +setTimeout(function(){ + document.querySelector('a#FullResultButton2').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#phylogeny-1"]').click(); //Second Phylogeny(-1) Section (reuse at different heading levels) + } +}, 100); + +setTimeout(function(){ + document.querySelector('a#FullResultButton3').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#homolog-data"]').click(); //Homolog Data Section + } +}, 100); +``` + +```{r setup, include = FALSE} +## Globally Suppress warnings and messaging +knitr::opts_chunk$set(warning = FALSE, message = FALSE) +## Create Sunburst Report Function +cpcols <- c( + "#AFEEEE", "#DDA0DD", "#EE2C2C", "#CDBE70", "#B0B099", + "#8B2323", "#EE7600", "#EEC900", "chartreuse3", "#0000FF", + "#FFD900", "#32CD32", "maroon4", "cornflowerblue", "darkslateblue", + "#AB82FF", "#CD6889", "#FFA07A", "#FFFF00", "#228B22", + "#FFFFE0", "#FFEC8B", "peru", "#668B8B", "honeydew", + "#A020F0", 
"grey", "#8B4513", "#191970", "#00FF7F", + "lemonchiffon", "#66CDAA", "#5F9EA0", "#A2CD5A", "#556B2F", + "#EEAEEE", "thistle4", "#473C8B", "#FFB6C1", "#8B1C62", + "#FFE4B5", "black", "#FF7F50", "#FFB90F", "#FF69B4", "#836FFF", + "#757575", "#CD3333", "#EE7600", "#CDAD00", "#556B2F", "#7AC5CD" + ) +lineage_sunburst_report <- function(prot, lineage_column = "Lineage", + type = "sunburst", + levels = 2, colors = NULL, legendOrder = NULL, showLegend = TRUE, maxLevels = 5) { + lin_col <- sym(lineage_column) + + # ensure they don't exceed maxLevels, although this should + if (!is.null(maxLevels) && levels > maxLevels) { + levels <- maxLevels + } + + levels_vec <- c() + for (i in 1:levels) + { + levels_vec <- append(levels_vec, paste0("level", i)) + } + + # Take lineage column and break into the first to levels + prot <- prot %>% + select({{ lin_col }}) %>% + arrange(desc({{ lin_col }})) %>% + drop_na({{ lin_col }}) + protLevels <- prot %>% separate({{ lin_col }}, into = levels_vec, sep = ">") + # Count the occurrance of each group of levels + protLevels <- protLevels %>% + group_by_at(levels_vec) %>% + summarise(size = n()) + protLevels <- protLevels %>% arrange() + tree <- d3_nest(protLevels, value_cols = "size") + + # Plot sunburst + if (type == "sunburst") { + result <- sunburst(tree, legend = list(w = 225, h = 15, r = 5, s = 5), colors = cpcols, legendOrder = legendOrder) + } else if (type == "sund2b") { + result <- sund2b(tree) + } + + if (showLegend) { + return( + htmlwidgets::onRender( + result, + "function(el, x) { + jQuery('.sunburst-togglelegend', el) + .attr('checked', 'true') + .attr('data-html2canvas-ignore', 'true'); + jQuery('.sunburst-legend', el).css('visibility', ''); + + // create a button to download the sunburst + // (relies on html2canvas being included in the page) + + // FIXME: consider pulling this all out into a js library + // so that we can apply it to other components. + + const downloadBtn = jQuery('') + .css({'position': 'absolute', 'right': '5px', 'top': '5px'}) + .attr('data-html2canvas-ignore', 'true') + .appendTo(el); + + saveAs = (blob, fileName) => { + const link = document.createElement('a'); + link.download = fileName + link.href = URL.createObjectURL(blob); + link.click(); + URL.revokeObjectURL(link.href); + } + + downloadBtn.click(() => { + html2canvas(el, { scale: 4, logging: false }).then(canvas => { + canvas.toBlob(function(blob) { + saveAs(blob, 'sunburst.png'); + }); + }); + }); + }" + ) + ) + } + + return(result) +} +``` + +# {.tabset} + +## Results Summary + +An overview of the protein analysis. To view the full results, explore the additional tabs. + +### Domain Architecture + +Visualizations and summaries for protein domains. + +View full results + + +#### Interproscan Visualization + +```{r, echo=FALSE} +params$rs_interproscan_visualization +``` + +#### Proximity Network + +```{r, echo=FALSE} +params$proximity_network +``` + +### Phylogeny + +Visualizations for protein evolution. + +View full results + +#### Sunburst + +```{r, echo=FALSE} +library(dplyr) +library(tidyr) +lineage_sunburst_report(params$sunburst, "Lineage", levels = 2) +``` + +### Data + +Summary table of proteins including domain architectures, phylogeny, and homologs, when applicable. + +View full results + +```{r, echo=FALSE} +params$data +``` + + +## Query Data {.tabset} + +Input data, additional metadata, and preliminary analyses of query protein(s). + +### Data Table + +The data table provides a summary of the sequences submitted, or \"queried\", for analysis. 
+ ```
+
+ # {.tabset}
+
+ ## Results Summary
+
+ An overview of the protein analysis. To view the full results, explore the additional tabs.
+
+ ### Domain Architecture
+
+ Visualizations and summaries for protein domains.
+
+ <a href="#" id="FullResultButton1">View full results</a>
+
+ #### Interproscan Visualization
+
+ ```{r, echo=FALSE}
+ params$rs_interproscan_visualization
+ ```
+
+ #### Proximity Network
+
+ ```{r, echo=FALSE}
+ params$proximity_network
+ ```
+
+ ### Phylogeny
+
+ Visualizations for protein evolution.
+
+ <a href="#" id="FullResultButton2">View full results</a>
+
+ #### Sunburst
+
+ ```{r, echo=FALSE}
+ library(dplyr)
+ library(tidyr)
+ lineage_sunburst_report(params$sunburst, "Lineage", levels = 2)
+ ```
+
+ ### Data
+
+ Summary table of proteins including domain architectures, phylogeny, and homologs, when applicable.
+
+ <a href="#" id="FullResultButton3">View full results</a>
+
+ ```{r, echo=FALSE}
+ params$data
+ ```
+
+
+ ## Query Data {.tabset}
+
+ Input data, additional metadata, and preliminary analyses of query protein(s).
+
+ ### Data Table
+
+ The data table provides a summary of the sequences submitted, or \"queried\", for analysis. The preview shown can be extended by using \"Add/remove column(s)\" to see info about other taxonomic classes as well as domain architecture codes from databases other than the default (Pfam).
+
+ ```{r, echo=FALSE}
+ params$queryDataTable
+ ```
+
+ ### FASTA
+
+ Uploaded amino acid FASTA sequence(s).
+
+ `r glue::glue(params$fastaDataText)`
+
+ ### Query Heatmap
+
+ A heatmap of submitted sequences and their respective taxonomic lineages.
+
+ ```{r, echo=FALSE}
+ params$heatmap
+ ```
+
+ ### Domain Architecture
+
+ Visualizations and analyses of all query and homologous protein domains, structural or functional subunits, and their architectures.
+
+ ```{r, echo=FALSE}
+ if (is.null(params$query_iprDatabases)) {
+     choices <- params$query_domarch_cols
+     if ("PfamA" %in% choices && "Phobius" %in% choices) {
+         analysis_type <- c("PfamA", "Phobius")
+     } else {
+         analysis_type <- choices[1]
+     }
+ } else {
+     analysis_type <- params$query_iprDatabases
+ }
+
+ if (is.null(params$query_iprVisType)) {
+     analysis_group <- 'Analysis'
+ } else {
+     analysis_group <- params$query_iprVisType
+ }
+
+ if (length(params$query_domarch_cols) >= 1) {
+     DomArchPlot <- plotIPR2VizWeb(
+         infile_ipr = params$query_data@ipr_path,
+         accessions = params$query_data@df$QueryName,
+         analysis = analysis_type, group_by = analysis_group, name = "Name"
+     )
+     DomArchPlot
+ } else {
+     "No domains found in the input sequences."
+ }
+ ```
+
+ ## Homolog Data
+
+ Full set of homologs of query sequences, including their lineage and domain architecture info.
+
+ ```{r, echo=FALSE}
+ params$mainTable
+ ```
+
+ ## Domain Architecture {.tabset}
+
+ Summary and visualizations of protein motifs/subunits (domains) and their configurations within the query protein(s) (domain architectures).
+
+ ### Table
+
+ ```{r, echo=FALSE}
+ params$DALinTable
+ ```
+
+ ### Heatmap
+
+ ```{r, echo=FALSE}
+ params$DALinPlot
+ ```
+
+ ### Network
+
+ ```{r, echo=FALSE}
+ params$DANetwork
+ ```
+
+ ```{r, echo=FALSE}
+ if (length(params$domarch_cols) == 0) {
+     choices <- c("None")
+     vals <- "None"
+     selected <- "None"
+ } else {
+     vals <- params$domarch_cols
+     choices <- substring(vals, first = 9)
+     selected <- if (length(choices) >= 2) choices[1:2] else choices[1]
+ }
+
+ if (is.null(params$DA_Col)) {
+     da_col <- vals[[1]]
+ } else {
+     da_col <- params$DA_Col
+ }
+
+ plot_data <- params$DA_Prot
+ plot_data[[da_col]] <- str_replace_all(plot_data[[da_col]], " ", "_")
+ createWordCloudElement(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F)
+ ```
+
+ ### Interproscan Visualization
+
+ ```{r, echo=FALSE}
+ params$da_interproscan_visualization
+ ```
+
+ ### UpSet Plot
+
+ ```{r, echo=FALSE}
+ upset_plot_data <- params$DA_Prot
+ upset_plot_data[[da_col]] <- str_replace_all(upset_plot_data[[da_col]], " ", "_")
+ final_plot <- plotUpSet(upset_plot_data, colname = da_col, cutoff = params$DACutoff)
+ domains <- upset_plot_data %>%
+     dplyr::pull(da_col)
+ n_unique_domains <- domains %>%
+     unique() %>%
+     length()
+
+ if (n_unique_domains > 1) {
+     final_plot
+ } else {
+     stringr::str_glue("UpSet plot requires more than 1 unique domain (only {n_unique_domains} present). Try selecting more proteins.")
+ }
+ ```
+
+ ## Phylogeny {.tabset}
+
+ Visualizations of phyletic patterns, sequence similarity, and evolution of related proteins.
+ +### Sunburst + +```{r, echo=FALSE} +lineage_sunburst_report(params$phylo_sunburst, lineage_column = "Lineage_long_na", type = "sunburst", levels = params$phylo_sunburst_levels) +``` + +### Tree + +```{r, include=FALSE} +if (is.null(params$tree_msa_tool)) { + tree_msa_tool <- 'ClustalO' ## initialize with default value + } else { + tree_msa_tool <- params$tree_msa_tool + } + +if (is.null(params$msa_rep_num)) { + msa_rep_num <- 3 ## initialize with default value + } else { + msa_rep_num <- params$msa_rep_num + } +``` + +```{r, include=FALSE} +if(length(params$repAccNums) >= 3 ) { + rep <- params$repAccNums[1:msa_rep_num] + if (!params$rval_phylo) { + seqs <- readAAStringSet(params$app_data@fasta_path) + names(seqs) <- sub(" .*", "", names(seqs)) + query_accession <- params$app_data@df %>% filter(!duplicated(QueryName)) + query_accession <- unique(query_accession$Query) + query <- seqs[query_accession] + names(query) <- unique(params$app_data@df$QueryName) + query <- AAStringSet(query) + } + # Generate Fasta File + rep_fasta_path <- tempfile() + + acc2FA(rep, outpath = rep_fasta_path, "sequential") + rename_fasta(rep_fasta_path, rep_fasta_path, + replacement_function = mapAcc2Name, + acc2name = params$acc2Name + ) + if (!params$rval_phylo) { + writeXStringSet(query, rep_fasta_path, append = TRUE) + } + rep_msa_path <- tempfile() + alignFasta(rep_fasta_path, tree_msa_tool, rep_msa_path) +} +``` + +```{r, echo=FALSE} +library(msa) +library(ape) +library(tidytree) +library(ggtree) +if(length(params$repAccNums) >= 3 ) { + seqTree(fasta_filepath = rep_msa_path) + } else {"Not enough representative sequences: try changing the 'Reduce By' field."} +``` + +### MSA + +```{r, include=FALSE} +if(length(params$repAccNums) >= 3 ) { + if (!params$rval_phylo) { + seqs <- readAAStringSet(params$app_data@fasta_path) + names(seqs) <- sub(" .*", "", names(seqs)) + query_accession <- params$app_data@df %>% filter(!duplicated(QueryName)) + query_accession <- unique(query_accession$Query) + query <- seqs[query_accession] + # names(query) <- params$PhyloSelect + query <- AAStringSet(query) + } + + # Generate Fasta File + rep_fasta_path <- tempfile() + acc2FA(rep, outpath = rep_fasta_path, "sequential") + rename_fasta(rep_fasta_path, rep_fasta_path, + replacement_function = mapAcc2Name, + acc2name = params$acc2Name + ) + if (!params$rval_phylo) { + writeXStringSet(query, rep_fasta_path, append = TRUE) + } + + # Call MSA2PDF + msa_pdf_path <- tempfile() + msa_parent_dir <- dirname(params$app_data@fasta_path) + msa_subdir_path <- file.path(msa_parent_dir, "msa_figs") + if (!dir.exists(msa_subdir_path)) { + dir.create(msa_subdir_path, recursive = TRUE) + message("Subdirectory 'msa_figs' created at: ", msa_subdir_path) + } else { + message("Subdirectory 'msa_figs' already exists at: ", msa_subdir_path) + } + msa_prefix <- paste0(msa_subdir_path, "/") + post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_") + + msa_pdf_path <- paste0(msa_prefix, post_fix) + + createMSA_PDF(fasta_path = rep_fasta_path, msa_pdf_path) + } +``` + +```{r msa_pdf, echo = FALSE, out.width = "95%", out.height = "800px"} +# apt-get install texlive +if(length(params$repAccNums) >= 3 ) { + knitr::include_graphics(msa_pdf_path) + } else {"Not enough representative sequences: try changing the 'Reduce By' field."} +``` diff --git a/man/combineFiles.Rd b/man/combineFiles.Rd index 81464fa6..afbc6c5f 100644 --- a/man/combineFiles.Rd +++ b/man/combineFiles.Rd @@ -20,7 +20,7 @@ The search is recursive 
(i.e., it will look in subdirectories as well).} Default is "*full_analysis.tsv".} \item{delim}{Character. The delimiter used in the input files. -Default is tab ("\t").} +Default is tab \code{"\t"}.} \item{skip}{Integer. The number of lines to skip at the beginning of each file. Default is 0.} diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd index 0eeb0610..9e91d606 100644 --- a/man/getTopAccByLinDomArch.Rd +++ b/man/getTopAccByLinDomArch.Rd @@ -6,7 +6,7 @@ \usage{ getTopAccByLinDomArch( infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", lin_col = "Lineage_short", n = 20, query