diff --git a/.gitignore b/.gitignore
index ef11006e..74dbc757 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
.Rproj.user
docs
+common_data
.Rhistory
-.DS_Store
\ No newline at end of file
+.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
index 0fa9a949..a30025de 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -40,6 +40,7 @@ Imports:
DBI,
doParallel,
dplyr,
+ DT,
forcats,
fs,
furrr,
@@ -51,6 +52,8 @@ Imports:
here,
htmltools,
htmlwidgets,
+ httr,
+ httr2,
igraph,
latexpdf,
magrittr,
@@ -73,6 +76,7 @@ Imports:
tibble,
tidyr,
tinytex,
+ tidyverse,
tools,
UpSetR,
viridis,
@@ -82,4 +86,5 @@ Imports:
XVector,
yaml
Suggests:
- knitr
+ knitr,
+ iprscanr
diff --git a/NAMESPACE b/NAMESPACE
index 6ae464f6..a5a44a79 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -49,6 +49,7 @@ export(findParalogs)
export(formatJobArgumentsHTML)
export(generateAllAlignments2FA)
export(getAccNumFromFA)
+export(getCaseStudyReport)
export(getProcessRuntimeWeights)
export(getTopAccByLinDomArch)
export(mapAcc2Name)
@@ -98,6 +99,18 @@ export(wordcloud3)
export(writeMSA_AA2FA)
export(writeProcessRuntime2TSV)
export(writeProcessRuntime2YML)
+import(Biostrings)
+import(DT)
+import(data.table)
+import(dplyr)
+import(httr)
+import(iprscanr)
+import(plotly)
+import(readr)
+import(rentrez)
+import(stringr)
+import(tidyverse)
+import(visNetwork)
importFrom(Biostrings,AAStringSet)
importFrom(Biostrings,readAAStringSet)
importFrom(Biostrings,toString)
diff --git a/R/MolEvolData_class.R b/R/MolEvolData_class.R
new file mode 100644
index 00000000..ab496048
--- /dev/null
+++ b/R/MolEvolData_class.R
@@ -0,0 +1,719 @@
+# Author(s): Samuel Chen
+# Last modified: 2020
+
+#' @import Biostrings tidyverse
+
+setClass("MolEvolData",
+ slots = list(
+ df = "data.frame",
+ fasta_path = "character",
+ msa_path = "character",
+ ipr_path = "character",
+ queries = "character",
+ cln_path = "character",
+ domainSeqs = "character",
+ domain_ipr_path = "character"
+ )
+)
+setClass("queryData",
+ slots = list(
+ df = "data.frame",
+ fasta_path = "character",
+ msa_path = "character",
+ ipr_path = "character",
+ queries = "character",
+ cln_path = "character"
+ )
+)
+setClass("blastUpload",
+ slots = list(
+ df = "character",
+ seqs = "character"
+ )
+)
+setClass("iprUpload",
+ slots = list(
+ df = "character",
+ seqs = "character"
+ )
+)
+setClass("seqUpload",
+ slots = list(
+ seqs = "character"
+ )
+)
+
+combineFilesNopmap <- function(inpath, pattern, outpath,
+ delim = "\t", skip = 0,
+ col_names) {
+ source_files <- dir(path = inpath, pattern = pattern, recursive = T)
+
+ source_files_path <- paste0(inpath, source_files)
+
+ dt_list <- map(source_files_path, function(x) {
+ if (!grepl("query", x)) {
+ fread(x, sep = delim, skip = skip, fill = T) # , col.names = col_names)
+ }
+ })
+
+ combined <- rbindlist(dt_list, fill = T)
+
+ fwrite(combined, outpath, sep = "\t")
+
+ return(combined)
+}
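+# Usage sketch (hypothetical paths): combine all per-query full-analysis TSVs
+# under one results directory into a single table:
+#   combined <- combineFilesNopmap(inpath = "results/", pattern = "*.full_analysis.tsv",
+#       outpath = "results/blast_combined.tsv", col_names = c())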
+
+
+full_analysis_colnames <- c("AccNum", "QueryName", "STitle", "Species.x",
+ "TaxID.x", "Lineage", "PcPositive", "PcIdentity",
+ "AlnLength", "SAccNum", "SAllSeqID", "Mismatch",
+ "GapOpen", "QStart", "QEnd", "QLength", "SStart",
+ "SEnd", "SLength", "EValue",
+ "BitScore", "PcPosOrig",
+ "Name", "ClusterID", "DomArch.Pfam",
+ "DomArch.SMART",
+ "DomArch.CDD", "DomArch.TIGRFAM",
+ "DomArch.Phobius",
+ "DomArch.Gene3D", "DomArch.TMHMM",
+ "DomArch.SignalP_EUK",
+ "DomArch.SignalP_GRAM_NEGATIVE",
+ "DomArch.SignalP_GRAM_POSITIVE",
+ "Description", "Length", "TaxID.y",
+ "Species.y", "SourceDB", "Completeness")
+
+
+processWrapperDir <- function(path, pinName, type = "full") {
+ if (type == "full") {
+ if (file.exists(paste0(path, "/cln_combined.tsv")) && file.exists(paste0(path, "/ipr_combined.tsv"))) {
+ query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+ query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+ query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+ query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+ query_data$Query <- query_data$AccNum
+ queries <- unique(query_data$Query)
+ cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+ query_data <- query_data %>%
+ mutate(QueryName = Name) %>%
+ select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+ distinct(QueryName, .keep_all = TRUE)
+ query_wrapper_data <- new("queryData",
+ df = query_data, fasta_path = query_sequence_path,
+ msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+ )
+            com_blast_path <- paste0(path, "/blast_combined.tsv")
+            com_blast_data <- read_tsv(com_blast_path) %>% arrange(desc(PcPositive))
+            ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+ queries <- unique(com_blast_data$Query)
+ com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+ com_blast_data <- com_blast_data %>% select(QueryName, everything())
+ wrapper_data <- new("MolEvolData",
+ df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
+ fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+ )
+
+ return(list(wrapper_data, query_wrapper_data))
+ } else {
+
+ # Get the query data
+ query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+ query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+ query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+ query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+ query_data$Query <- query_data$AccNum
+ # alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+ queries <- unique(query_data$Query)
+ cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+ query_data <- query_data %>%
+ mutate(QueryName = Name) %>%
+ select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+ distinct(QueryName, .keep_all = TRUE)
+ query_wrapper_data <- new("queryData",
+ df = query_data, fasta_path = query_sequence_path,
+ msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+ )
+
+ query_data <- query_data %>% select(Query, QueryName)
+
+ com_blast_path <- paste0(path, "/blast_combined.tsv")
+ ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+ com_blast_data <- combineFilesNopmap(paste0(path, "/"),
+ pattern = "*.full_analysis.tsv", skip = 0,
+ col_names = c(), outpath = com_blast_path, delim = "\t"
+ )
+
+ ipr_blast_data <- combineFilesNopmap(paste0(path, "/"),
+ pattern = "*.iprscan_cln.tsv", skip = 0,
+ col_names = ipr_colnames, outpath = ipr_blast_path, delim = "\t"
+ )
+ com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+ com_blast_data <- com_blast_data %>% arrange(desc(PcPositive))
+ fwrite(com_blast_data, com_blast_path, sep = "\t")
+
+ queries <- unique(com_blast_data$Query)
+
+ com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+ com_blast_data <- com_blast_data %>% select(QueryName, everything())
+
+ wrapper_data <- new("MolEvolData",
+ df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
+ fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+ )
+
+ return(list(wrapper_data, query_wrapper_data))
+ }
+ } else if (type == "dblast") {
+ query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+ query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+ query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+ query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+ alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+ cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+ query_data <- query_data %>% mutate(Query = AccNum)
+ query_data <- query_data %>%
+ mutate(QueryName = Name) %>%
+ select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+ Query, Name, AccNum) %>%
+ distinct(QueryName, .keep_all = TRUE)
+ queries <- unique(query_data$Query)
+ query_wrapper_data <- new("queryData",
+ df = query_data,
+ fasta_path = query_sequence_path,
+ msa_path = query_msa_path,
+ ipr_path = query_ipr_path,
+ queries = queries, cln_path = cln_path
+ )
+ query_data <- query_data %>% select(Query, QueryName)
+ com_blast_path <- paste0(path, "/blast_combined.tsv")
+ com_blast_data <- combineFilesNopmap(paste0(path, "/"),
+ pattern = "*.blast.cln.tsv",
+ skip = 0,
+ col_names = c(),
+ outpath = com_blast_path,
+ delim = "\t"
+ )
+ com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+ queries <- unique(com_blast_data$Query)
+ com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+ com_blast_data <- com_blast_data %>%
+ select(QueryName, everything()) %>%
+ arrange(desc(PcPositive))
+ wrapper_data <- new("MolEvolData",
+ df = com_blast_data, queries = queries,
+ ipr_path = "", cln_path = com_blast_path,
+ fasta_path = query_sequence_path,
+ msa_path = query_msa_path, domainSeqs = "",
+ domain_ipr_path = ""
+ )
+ return(list(wrapper_data, query_wrapper_data))
+ } else if (type == "phylo") {
+ query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+ query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+ query_data <- query_data %>% mutate(Query = AccNum)
+ query_data <- query_data %>%
+ mutate(QueryName = Name) %>%
+ select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+ Query, Name, AccNum) %>%
+ distinct(QueryName, .keep_all = TRUE)
+ queries <- unique(query_data$Query)
+ query_wrapper_data <- new("queryData",
+ df = query_data,
+ fasta_path = query_sequence_path,
+ msa_path = query_msa_path,
+ ipr_path = query_ipr_path,
+ queries = queries, cln_path = cln_path
+ )
+ wrapper_data <- new("MolEvolData",
+ df = query_data,
+ fasta_path = query_sequence_path,
+ msa_path = query_msa_path,
+ ipr_path = query_ipr_path,
+ queries = queries, cln_path = cln_path,
+ domainSeqs = "", domain_ipr_path = ""
+ )
+ return(list(wrapper_data, query_wrapper_data))
+ } else if (type == "da") {
+ query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+ query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+ query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+ query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+ alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+ cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+ query_data <- query_data %>%
+ mutate(QueryName = Name) %>%
+ select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+ Query, Name) %>%
+ distinct(QueryName, .keep_all = TRUE)
+ queries <- unique(query_data$Query)
+ query_wrapper_data <- new("queryData",
+ df = query_data,
+ fasta_path = query_sequence_path,
+ msa_path = query_msa_path,
+ ipr_path = query_ipr_path,
+ queries = queries, cln_path = cln_path
+ )
+ com_blast_path <- paste0(path, "/blast_combined.tsv")
+ com_blast_data <- combineFilesNopmap(paste0(path, "/"),
+ pattern = "*.full_analysis.tsv",
+ skip = 0,
+ col_names = c(),
+ outpath = com_blast_path,
+ delim = "\t"
+ )
+ com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+ com_blast_data <- com_blast_data %>%
+ select(QueryName, everything()) %>%
+ arrange(desc(PcPositive))
+ wrapper_data <- new("MolEvolData",
+ df = com_blast_data, queries = queries,
+ ipr_path = "", cln_path = com_blast_path,
+ fasta_path = query_sequence_path,
+ msa_path = query_msa_path,
+ domainSeqs = "", domain_ipr_path = ""
+ )
+ return(list(wrapper_data, query_wrapper_data))
+ } else {
+ stop("Unrecognized type. Please use 'full', 'dblast', 'phylo', or 'da'.")
+ }
+}
+
+drop_empty <- function(df) {
+ for (c in colnames(df)) {
+ if (all(is.na(df[, ..c])) || all(df[, ..c] == "")) {
+ df <- df %>% select(-contains(c))
+ }
+ }
+ return(df)
+}
+
+
+clean_fetched <- function(df) {
+ df <- as.data.frame(df)
+ # df = df %>% cleanup_species()
+
+ # Cleanup domarchs
+ cols <- colnames(df)
+ for (c in cols) {
+ if (grepl("^DomArch", c)) {
+ # Repeats
+ old <- c
+ new <- paste0(c, ".repeats")
+ df <- df %>% cleanup_domarch(
+ old = old, new = new,
+ domains_rename = NULL, # domains_rename,
+ domains_keep = NULL, # filter applied to only ClustName for now.
+ domains_ignore = NULL, # !! should check and remove soon!
+ repeat2s = FALSE,
+ remove_tails = F, # new! check below if it works!
+ remove_empty = F
+ )
+
+ new <- c
+ df <- df %>% cleanup_domarch(
+ old = old, new = new,
+ domains_rename = NULL, # domains_rename,
+ domains_keep = NULL, # filter applied to only ClustName for now.
+ domains_ignore = NULL, # !! should check and remove soon!
+ repeat2s = TRUE,
+ remove_tails = F, # new! check below if it works!
+ remove_empty = F
+ )
+ }
+ }
+
+
+ return(df)
+}
+
+# Description
+# validation functions for accession numbers/headers for FASTA (validateAccNumFasta)
+# & accession number input (validateAccNum) submission types.
+
+# NOTE: no library() calls in package code; these dependencies (Biostrings,
+# httr, httr2, rentrez, shiny) are declared in DESCRIPTION and NAMESPACE.
+
+getAccNumFromFasta <- function(biostrings_aa_string_set, verbose = FALSE) {
+ # parsing/cleaning accession numbers using the same methods as
+ # `upstream_scripts/00_submit_full.R`'s `get_sequences()` function
+ accnums <- c()
+ for (header in names(biostrings_aa_string_set)) {
+        # case 1: UniProtKB-formatted FASTA
+ # header: https://www.uniprot.org/help/fasta-headers
+ if (grepl("\\|", header)) {
+ accnum <- unlist(strsplit(header, "\\|"))[2]
+ # case 2 NCBI formatted FASTA
+ # header: https://www.ncbi.nlm.nih.gov/genbank/fastaformat
+ } else if (grepl(" ", header)) {
+ accnum <- unlist(strsplit(header, " "))[1]
+ # case 3 neither delimiter present; use the whole header
+ } else {
+ accnum <- header
+ }
+ # print debug info
+ if (verbose == TRUE) {
+ cat("header:", header, "\n")
+ cat("accnum_parsed:", accnum, "\n")
+ }
+ accnums <- append(accnums, accnum)
+ }
+    if (verbose) print(accnums)
+ return(accnums)
+}
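+# Example covering the three header cases (hypothetical accessions):
+#   aa <- Biostrings::AAStringSet(rep("MSTN", 3))
+#   names(aa) <- c("sp|P12345|NAME_HUMAN desc", "WP_000000001.1 protein", "MyCustomID")
+#   getAccNumFromFasta(aa) # -> "P12345", "WP_000000001.1", "MyCustomID"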
+
+validateAccNumFasta <- function(text) {
+ # INPUT: text from input object that houses FASTA data
+ # validate the headers/accnums for FASTA submission
+ # Return:
+ # T: when no duplicate accnums after parsing
+ # F: when duplicate accnums are present after parsing
+
+ # TRY write tmp file for biostrings parsing
+ path <- tryCatch(
+ expr = {
+ path <- tempfile()
+ write(text, path)
+ path
+ },
+ error = function(e) {
+ NULL
+ }
+ )
+ if (is.null(path)) {
+ cat("failed to write temp fasta during accnum validation\n", file = stderr())
+ return(FALSE) # validation fail
+ }
+
+ # TRY read seq
+ fasta <- tryCatch(
+ expr = {
+ fasta <- Biostrings::readAAStringSet(path)
+ fasta
+ },
+ error = function(e) {
+ NULL
+ },
+ finally = {
+ unlink(path)
+ }
+ )
+ if (is.null(fasta)) {
+ cat("failed to read sequence during accnum validation\n", file = stderr())
+ return(FALSE) # validation fail
+ }
+
+ # parse accnums from fasta
+ accnums <- getAccNumFromFasta(fasta)
+ tb_accnum_counts <- tibble("frequencies" = table(accnums))
+ # check for duplicates
+ if (any(tb_accnum_counts$frequencies > 1)) {
+ cat("duplicate headers found during fasta validation\n", file = stderr())
+ return(FALSE) # validation fail
+ } else {
+ return(TRUE)
+ }
+}
+
+
+# Test whether a single accession returns a valid protein from Entrez
+isAccNumValidEntrez <- function(accnum, verbose = FALSE) {
+    # an empty accnum will not raise an error from efetch, so test for this first
+ if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+
+    # try fetching the protein record for this accession;
+    # rentrez::entrez_fetch() raises an error when there is no matching
+    # protein, so tryCatch() converts that error into FALSE
+ result <- tryCatch(
+ expr = {
+ result <- rentrez::entrez_fetch(db = "protein", id = accnum, rettype = "fasta")
+ if (verbose) {print(result)}
+ TRUE
+ },
+ error = function(e) {if (verbose) {print(e)}; FALSE}
+ )
+ return(result)
+}
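+# e.g. isAccNumValidEntrez("WP_000000001.1") returns TRUE only if NCBI can
+# fetch a protein record for it (hypothetical accession shown)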
+
+# Test whether a single accession returns a valid protein from EBI
+isAccNumValidEbi <- function(accnum, verbose = FALSE) {
+ # validation: ensure there's some text to parse
+ if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+    # construct an httr2 GET request for the accession's FASTA record
+ url_base <- "https://www.ebi.ac.uk/proteins/api/proteins?accession="
+ url_protein <- paste0(url_base, accnum)
+ req <- httr2::request(url_protein) |>
+ httr2::req_headers("Accept" = "text/x-fasta", "Content-type" = "application/x-www-form-urlencoded")
+
+    # wrap in try() since a failed HTTP request raises an R error
+    try(httr2::req_perform(req), silent = TRUE)
+    # get the last HTTP response (NULL if the request never completed)
+    resp <- httr2::last_response()
+    if (is.null(resp)) {
+        return(FALSE)
+    }
+    # a 200 status code indicates success
+    result <- resp$status == 200
+ if (verbose) {
+ msg <- stringr::str_glue(
+ "EBI protein query results:\n",
+ "\taccnum: {accnum}\t validation result: {result}\n"
+ ) |> print()
+ }
+ return(result)
+}
+
+# Perform a series of API reqs using NCBI entrez to validate accession numbers
+performEntrezReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
+    # API guidelines docs
+    # entrez: https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
+    # entrez recommends no more than 3 requests per second
+    # simple method: sleep for 1 second after every 3rd request
+ i <- 0
+ results <- vapply(
+ X = accnums,
+ FUN = function(accnum) {
+            result <- isAccNumValidEntrez(accnum, verbose = verbose)
+ i <<- i + 1L
+            if (i >= 3L) {Sys.sleep(1); if (verbose) message("sleeping for entrez API reqs"); i <<- 0L}
+ if (track_progress) {incProgress(1)}
+ result
+ },
+ FUN.VALUE = logical(1)
+ )
+ return(results)
+}
+
+performEbiReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) {
+ # API guidelines docs
+ # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
+    # EBI allows 200 requests per second
+    # simple method: sleep for 1 second after every 200th request
+ i <- 0
+ results <- vapply(
+ X = accnums,
+ FUN = function(accnum) {
+            result <- isAccNumValidEbi(accnum, verbose = verbose)
+ i <<- i + 1L
+ if (i >= 200L) {Sys.sleep(1); i <<- 0L}
+ if (track_progress) incProgress(1)
+ result
+ },
+ FUN.VALUE = logical(1)
+ )
+ return(results)
+}
+
+# Validate accession numbers from MolEvolvR user input
+validateAccNum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) {
+ # API guidelines docs
+ # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
+ # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html
+
+    # split accnums on any of three possible delimiters:
+    # 1. commas with any amount of surrounding space
+    # 2. one or more spaces
+    # 3. "\n" (newline)
+    accnums <- unlist(strsplit(text, "\\s*,\\s*|\\s+"))
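+    # e.g. "P12345, Q67890\nA0A000" -> c("P12345", "Q67890", "A0A000")
+    # (hypothetical accession numbers, for illustration only)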
+
+ if (track_progress) {incProgress(1 / n_steps,
+ message = "Testing for duplicate accession numbers . . .")}
+ # fail fast for duplicates
+ if (any(duplicated(accnums))) {
+        warning("duplicate accession numbers found")
+ if (track_progress) {setProgress(n_steps,
+ message = "Duplicate accession numbers found")}
+ return(FALSE)
+ }
+ if (track_progress) {incProgress(1 / n_steps,
+ message = "Please wait (searching NCBI's protein database) . . .")}
+ # entrez API
+ entrez_results <- performEntrezReqs(accnums, verbose = verbose)
+    if (verbose) {
+        stringr::str_glue("rentrez results:\n\t{paste0(entrez_results, collapse = ', ')}\n") |>
+            print()
+    }
+
+ # EBI API
+ # if all of entrez resulted in success, then skip EBI. Else, try EBI
+    if (!all(entrez_results)) {
+        if (track_progress) {incProgress(1 / n_steps,
+            message = "Please wait (searching EBI's protein database) . . .")}
+        ebi_results <- performEbiReqs(accnums, verbose = verbose)
+        if (verbose) {
+            stringr::str_glue("ebi results:\n\t{paste0(ebi_results, collapse = ', ')}\n") |>
+                print()
+        }
+        # OR each accnum's result across both databases
+        final_results <- ebi_results | entrez_results
+    } else {
+        if (track_progress) {setProgress(n_steps,
+            message = "All accession numbers validated")}
+        final_results <- entrez_results
+    }
+    failed_accnums <- accnums[which(!final_results)]
+    if (length(failed_accnums) >= 1) {
+        msg <- stringr::str_glue(
+            "The following accnums failed validation:\n",
+            "\t{paste0(failed_accnums, collapse = ', ')}\n"
+        )
+        warning(msg)
+        if (track_progress) {setProgress(n_steps,
+            message = "Accession number validation failed")}
+    }
+    if (track_progress) {setProgress(n_steps,
+        message = "Accession number validation finished")}
+ return(
+ list(
+ "validation_result" = all(final_results),
+ "failed_accnums" = failed_accnums
+ )
+ )
+}
+
+#===============================================================================
+# Description
+# validation function for evalue input
+#===============================================================================
+validateEvalue <- function(input_value) {
+    # an E-value threshold must be a positive number
+    is_valid_evalue <- is.numeric(input_value) && input_value > 0
+    return(is_valid_evalue)
+}
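+# e.g. validateEvalue(1e-5) -> TRUE; validateEvalue(0) -> FALSE;
+# validateEvalue("1e-5") -> FALSE (character input, not numeric)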
+
+#===============================================================================
+# Author(s): JK
+# Last modified: 2023_06
+#===============================================================================
+#-------------------------------------------------------------------------------
+guessSeqType <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) {
+ tb <- as_tibble(alphabetFrequency(single_fasta))
+ n_other <- if ("other" %in% colnames(tb)) sum(unlist(tb["other"])) else 0
+
+ aa_cols <- setdiff(Biostrings::AA_ALPHABET, c("*", "."))
+ tb_dna <- tb %>% select(all_of(Biostrings::DNA_ALPHABET))
+ tb_aa <- tb %>% select(all_of(aa_cols))
+
+ total <- nchar(single_fasta)
+ est_dna_prop <- sum(unlist(tb_dna)) / total
+ est_aa_prop <- sum(unlist(tb_aa)) / total
+ other_prop <- n_other / total
+ cat(
+ names(single_fasta), "\n",
+ "estimated DNA alphabet proportion:", est_dna_prop, "\n",
+ "estimated AA alphabet proportion:", est_aa_prop, "\n",
+ "'other' alphabet proportion:", other_prop, "\n"
+ )
+
+ if (est_dna_prop >= dna_guess_cutoff) {
+ guess <- "DNA"
+ } else if (other_prop >= other_guess_cutoff) {
+ guess <- NA
+ } else {
+ guess <- "AA"
+ }
+ cat("Guess:", guess, "\n")
+ return(guess)
+}
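+# e.g. guessSeqType(Biostrings::AAStringSet("ATGCATGCAT")) reports a DNA
+# alphabet proportion of 1.0 and guesses "DNA"; a typical protein sequence
+# guesses "AA"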
+
+validateSeqBody <- function(text) {
+ # convert string to single letter character vector
+ individual_chars <- unlist(strsplit(text, ""))
+ # return the characters from input that are NOT in the AA_ALPHABET
+ invalid_chars <- setdiff(individual_chars, Biostrings::AA_ALPHABET)
+ n_invalid_chars <- length(invalid_chars)
+
+ # 0 length character vector means the sequence contains
+ # characters that are all present in the AA_ALPHABET
+ # >0 length character vector means there are some characters
+ # which are NOT found in the AA_ALPHABET
+ if (n_invalid_chars == 0) {
+ return(TRUE)
+ } else {
+ warning("A sequence contains characters not found in the AA_ALPHABET")
+ return(FALSE)
+ }
+}
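+# e.g. validateSeqBody("MSTNQ") -> TRUE; validateSeqBody("MST1Q") -> FALSE,
+# since digits are not in Biostrings::AA_ALPHABET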
+
+#-------------------------------------------------------------------------------
+validateFasta <- function(fasta_path, .type = "AA") {
+ # handle case of fasta being unreadable/unrecognized format
+ fasta <- tryCatch(
+ expr = Biostrings::readAAStringSet(fasta_path),
+ error = function(e) {
+ warning(paste0("error: failed to read sequence from: ", fasta_path))
+ return(NULL)
+ }
+ )
+ # failure to read yields quick return
+ if (is.null(fasta)) {
+ return(FALSE) # validation fail
+ }
+
+ # require at least some characters for the sequence
+ if (!(sum(nchar(unlist(fasta))) >= 1)) {
+ warning(paste0("fasta does not have any sequence characters"))
+ return(FALSE) # validation fail
+ }
+
+ ### validate sequence BODY
+ # iteratively return boolean value for valid seq body (FALSE = invalid)
+ seq_body_validations <- c()
+    for (i in seq_len(length(fasta))) {
+ seq_body <- as.character(fasta[i])
+ seq_body_validations <- c(validateSeqBody(seq_body), seq_body_validations)
+ }
+ # require all sequences to have chars only in AA_ALPHABET
+ if (!(all(seq_body_validations))) {
+ warning("At least one sequence has non-IUPAC AA characters")
+ return(FALSE) # validation fail
+ }
+
+ ### validate sequence TYPE
+ # iteratively guess the type (DNA, AA, or other of each seq in the FASTA file)
+ seq_types <- c()
+    for (idx in seq_len(length(fasta))) {
+ seq_types <- c(guessSeqType(fasta[idx]), seq_types)
+ }
+
+    anti_type <- switch(.type,
+        "DNA" = "AA",
+        "AA" = "DNA",
+        stop("Unrecognized .type; use 'AA' or 'DNA'.")
+    )
+
+ # if all seqs are of desired .type, only then TRUE
+ if (sum(is.na(seq_types)) != 0) {
+ return(FALSE) # validation fail
+ } else if (anti_type %in% seq_types) {
+ return(FALSE) # validation fail
+ } else {
+ return(TRUE) # validation success
+ }
+}
+
+# Author(s): Samuel Chen, Lo Sosinski, Joe Burke
+# Last modified: 2021
+
+alpha_numeric <- c(0:9, letters, LETTERS)
+
+rand_string <- function(length, characters = alpha_numeric, post_fix = "", ignorelist = c()) {
+ str <- NULL
+
+ while (is.null(str) || (str %in% ignorelist)) {
+ chars <- sample(characters, size = length, replace = TRUE)
+ str <- paste0(paste(chars, collapse = ""), post_fix)
+ }
+
+ return(str)
+}
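+# e.g. rand_string(6) might return "a1B2c3"; pass ignorelist to avoid
+# colliding with existing job IDs (see its use in runAnalysis())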
+strsplit_vect <- function(strings, pattern = "_", pos = 1) {
+    # split each string on `pattern` and keep the `pos`-th piece
+    split_strings <- map(strsplit(strings, pattern), function(x) x[pos]) %>% unlist()
+
+    return(split_strings)
+}
diff --git a/R/combine_files.R b/R/combine_files.R
index 4f03b1d2..1c5ac6a2 100755
--- a/R/combine_files.R
+++ b/R/combine_files.R
@@ -24,22 +24,22 @@
#'
#' @author Janani Ravi
#'
-#' @param inpath Character. The master directory path where the files reside.
+#' @param inpath Character. The master directory path where the files reside.
#' The search is recursive (i.e., it will look in subdirectories as well).
-#' @param pattern Character. A search pattern to identify files to be combined.
+#' @param pattern Character. A search pattern to identify files to be combined.
#' Default is "*full_analysis.tsv".
-#' @param delim Character. The delimiter used in the input files.
-#' Default is tab ("\t").
-#' @param skip Integer. The number of lines to skip at the beginning of each file.
+#' @param delim Character. The delimiter used in the input files.
+#' Default is tab \code{"\t"}.
+#' @param skip Integer. The number of lines to skip at the beginning of each file.
#' Default is 0.
-#' @param col_names Logical or character vector. If TRUE, the first row of each file
-#' is treated as column names. Alternatively, a character vector can
+#' @param col_names Logical or character vector. If TRUE, the first row of each file
+#' is treated as column names. Alternatively, a character vector can
#' be provided to specify custom column names.
#'
#' @importFrom purrr pmap_dfr
#' @importFrom readr cols
#'
-#' @return A data frame containing the combined contents of all matched files.
+#' @return A data frame containing the combined contents of all matched files.
#' Each row will include a new column "ByFile" indicating the source file of the data.
#'
#' @export
diff --git a/R/generate_report.R b/R/generate_report.R
new file mode 100644
index 00000000..65b8364d
--- /dev/null
+++ b/R/generate_report.R
@@ -0,0 +1,740 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# import libs
+#' @import readr stringr
+
+# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams")
+#' @export
+getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) {
+ cat("\n")
+ cat("\033[1;36m") # Cyan for the "MolEvolvR" header
+ cat("=========================================\n")
+ cat(" M o l E v o l R\n")
+ cat("=========================================\n")
+ cat("\033[0m")
+
+ cat("\033[1;32m")
+ cat("\n>>>>> Case Study Report Generation started... <<<<<\n")
+ cat("\033[0m")
+
+ # Step 1: Fetch the FASTA sequences of the given pathogen and drug
+ getCardData(pathogen, drug)
+
+ # Check if the FASTA file exists after fetching
+ fasta_file <- "filtered_proteins.fasta"
+    if (!file.exists(fasta_file)) {
+        stop("Failed to retrieve FASTA sequences. ",
+            "The file 'filtered_proteins.fasta' does not exist.")
+    }
+ # Step 2: Once the FASTA file is ready, call runAnalysis to run pipeline
+ message("FASTA file downloaded, running analysis...")
+
+ # Pass the file to runAnalysis function
+ runAnalysis(file_paths = list(fasta = fasta_file), ...)
+
+ message("Case study report completed")
+}
+
+# get fasta of pathogen and/or drug
+getCardData <- function(pathogen = NULL, drug = NULL) {
+ destination_dir <- "CARD_data"
+ # Check if CARD data exists
+ if (!dir.exists(destination_dir)) {
+ dir.create(destination_dir)
+ }
+ if (!file.exists("CARD_data/aro_index.tsv")) {
+ # Step 1: Download CARD Data
+        download.file("https://card.mcmaster.ca/latest/data", "card_data.tar.bz2")
+        # untar() handles .tar.bz2 directly and avoids shelling out to tar
+        untar("card_data.tar.bz2", exdir = destination_dir)
+ }
+
+ # Step 2: Open ARO_index.tsv
+ aro_data <- read.delim("CARD_data/aro_index.tsv", sep = "\t", header = TRUE)
+
+ # Step 3: Map CARD Short Name
+ antibiotics <- read.delim("CARD_data/shortname_antibiotics.tsv",
+ sep = "\t", header = TRUE)
+ pathogens <- read.delim("CARD_data/shortname_pathogens.tsv",
+ sep = "\t", header = TRUE)
+
+ aro_data <- aro_data %>%
+ mutate(
+ pathogen = str_extract(CARD.Short.Name, "^[^_]+"),
+ gene = str_extract(CARD.Short.Name, "(?<=_)[^_]+"),
+ drug = str_extract(CARD.Short.Name, "(?<=_)[^_]+$")
+ ) %>%
+ left_join(pathogens, by = c("pathogen" = "Abbreviation")) %>%
+ left_join(antibiotics, by = c("drug" = "AAC.Abbreviation"))
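+    # e.g. a hypothetical CARD.Short.Name "Abau_adeF_FLQ" parses to
+    # pathogen = "Abau", gene = "adeF", drug = "FLQ"; the joins above then
+    # expand the pathogen/drug abbreviations to full names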
+
+ # Sort names
+ aro_data <- aro_data %>%
+ arrange(Pathogen, Molecule) %>%
+ group_by(Pathogen, Molecule)
+
+ # Filter data based on user input
+ filtered_data <- aro_data
+
+ if (!is.null(pathogen)) {
+ filtered_data <- filtered_data %>% filter(Pathogen == !!pathogen)
+ }
+
+ if (!is.null(drug)) {
+ filtered_data <- filtered_data %>% filter(Molecule == !!drug)
+ }
+
+ # Check if filtered data is empty
+ if (nrow(filtered_data) == 0) {
+ stop("No data found for the specified pathogen and drug.")
+ }
+
+ # Extract protein accessions
+ accessions <- filtered_data$Protein.Accession
+
+    # Function to fetch a FASTA sequence from NCBI, throttled to stay under
+    # Entrez's ~3 requests/second guideline
+    get_fasta <- function(accession) {
+        Sys.sleep(0.34)
+        entrez_fetch(db = "protein", id = accession, rettype = "fasta")
+    }
+
+ # Download sequences
+ fasta_sequences <- lapply(accessions, get_fasta)
+
+ # Write sequences to a file
+ writeLines(unlist(fasta_sequences), "filtered_proteins.fasta")
+
+ return("FASTA sequences downloaded to 'filtered_proteins.fasta'.")
+}
+
+# Run analysis
+runAnalysis <- function(
+ upload_type = "Fasta",
+ evalue = 0.00001,
+ accnum_fasta_input = "",
+ file_paths = list(accnum = NULL, fasta = tempfile(), msa = tempfile(),
+ blast = NULL, iprscan = NULL),
+ blast_db = "refseq_protein",
+ blast_hits = 100,
+ blast_eval = 0.0001,
+ acc_homology_analysis = TRUE,
+ acc_da_analysis = TRUE,
+ acc_phylogeny_analysis = FALSE,
+ report_template_path = system.file("report", "report_template.Rmd", package = "MolEvolvR", mustWork = TRUE),
+ output_file = file.path(getwd(), "report.html"),
+ DASelect = "All",
+ mainSelect = NULL,
+ PhyloSelect = "All",
+ q_heatmap_select = "All",
+ DACutoff = 95,
+ GCCutoff = 0.5,
+ query_select = NULL,
+ query_iprDatabases = c(
+ "PfamA", "SMART", "Phobius",
+ "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE",
+ "SuperFamily", "MobiDBLite", "Panther", "Coils"
+ ),
+ query_iprVisType = "Analysis", tree_msa_tool = "ClustalO",
+ levels = 2,
+ DA_Col = "DomArch.PfamA",
+ msa_rep_num = 10,
+ msa_reduce_by = "Species",
+ rval_phylo = FALSE,
+ DA_lin_color = c("default", "viridis", "inferno", "magma", "plasma", "cividis"),
+        networkLayout = c("nice", "grid", "circle", "random"),
+        phylo_select = "All",
+        # flags/metadata formerly read from the Shiny session:
+        blast_ncbi_check = FALSE,
+        ipr_ncbi_check = FALSE,
+        notify_email = NULL,
+        advanced_options = NULL,
+        ...
+
+) {
+
+    ##### Initialize Variables and Classes #####
+    domain_split <- FALSE # toggled by resetSettings()/updateUploadType()
+    blast_upload_data <- new("blastUpload", df = "", seqs = "")
+ ipr_upload_data <- new("iprUpload", df = "", seqs = "")
+ sequence_upload_data <- new("seqUpload", seqs = "")
+ data <- new("MolEvolData", msa_path = tempfile(),
+ fasta_path = file_paths$fasta)
+ app_data <- new("MolEvolData", msa_path = tempfile(),
+ fasta_path = file_paths$fasta)
+ query_data <- new("queryData")
+
+ if (length(upload_type) != 1) {
+ stop("upload_type must be a single value.")
+ }
+
+ # Reset default analysis function
+ resetSettings <- function() {
+ acc_homology_analysis <<- FALSE
+ acc_da_analysis <<- FALSE
+ acc_phylogeny_analysis <<- FALSE
+ domain_split <<- FALSE
+ }
+
+ fasta_set <- c("Fasta", "AccNum", "MSA")
+
+ # Update settings based on upload type
+ updateUploadType <- function(upload_type) {
+ switch(upload_type,
+ "Fasta" = {
+ resetSettings()
+ acc_homology_analysis <<- TRUE
+ acc_da_analysis <<- TRUE
+ },
+ "AccNum" = {
+ resetSettings()
+ acc_homology_analysis <<- TRUE
+ acc_da_analysis <<- TRUE
+ },
+ "MSA" = {
+ resetSettings()
+ acc_homology_analysis <<- TRUE
+ acc_da_analysis <<- TRUE
+ },
+ "BLAST Output" = {
+ resetSettings()
+ acc_phylogeny_analysis <<- TRUE
+ },
+ "InterProScan Output" = {
+ resetSettings()
+ acc_da_analysis <<- TRUE
+ }
+ )
+ }
+
+ updateUploadType(upload_type)
+
+ ####### File Upload Functions ########
+
+ # Update function to return modified object
+ uploadAccNumFile <- function(file_path, seq_obj) {
+ accnum_data <- read_file(file_path)
+ seq_obj@seqs <- accnum_data
+ return(seq_obj)
+ }
+
+ uploadFastaFile <- function(file_path, seq_obj) {
+ fasta_data <- read_file(file_path)
+ seq_obj@seqs <- fasta_data
+ return(seq_obj)
+ }
+
+ uploadMSAFile <- function(file_path, seq_obj) {
+ msa_data <- read_file(file_path)
+ seq_obj@seqs <- msa_data
+ return(seq_obj)
+ }
+
+ uploadBlastFile <- function(file_path, blast_obj) {
+ blast_obj@df <- file_path
+ return(blast_obj)
+ }
+
+ uploadIPRScanFile <- function(file_path, ipr_obj) {
+ ipr_obj@df <- file_path
+ return(ipr_obj)
+ }
+
+ # Now modify the calling section to reassign the modified objects
+ if (!is.null(file_paths$accnum)) {
+ sequence_upload_data <- uploadAccNumFile(file_paths$accnum,
+ sequence_upload_data)
+ }
+ if (!is.null(file_paths$fasta)) {
+ sequence_upload_data <- uploadFastaFile(file_paths$fasta,
+ sequence_upload_data)
+ }
+ if (!is.null(file_paths$msa)) {
+ sequence_upload_data <- uploadMSAFile(file_paths$msa,
+ sequence_upload_data)
+ }
+ if (!is.null(file_paths$blast)) {
+ blast_upload_data <- uploadBlastFile(file_paths$blast,
+ blast_upload_data)
+ }
+ if (!is.null(file_paths$iprscan)) {
+ ipr_upload_data <- uploadIPRScanFile(file_paths$iprscan,
+ ipr_upload_data)
+ }
+
+ # Validation of inputs
+ fasta_data <- read_file(file_paths$fasta)
+ validate_and_process_inputs <- function(evalue, fasta_data) {
+        if (!validateEvalue(evalue)) {
+            return("Error: A positive numeric E-value is required (e.g., 0.0001).")
+        }
+
+ if (!validateAccNumFasta(fasta_data)) {
+ return("Error: Input for AccNum/Fasta cannot be empty or invalid.")
+ }
+
+ sequence_vector <- unlist(strsplit(fasta_data, "\n"))
+ return(list(message = "Inputs are valid!", sequences = sequence_vector))
+ }
+
+ validation_result <- validate_and_process_inputs(evalue, fasta_data)
+ if (is.character(validation_result)) {
+ return(validation_result)
+ }
+
+ # Phylogenetic Analysis Validation
+ phylo <- acc_phylogeny_analysis
+ if (phylo) {
+ # Validate phylogenetic analysis based on upload type
+ is_valid_phylo <- switch(
+ upload_type,
+ "Fasta" = {
+ str_count(string = sequence_upload_data@seqs, ">") > 1
+ },
+ "AccNum" = {
+ accnums <- sequence_upload_data@seqs |>
+ strsplit("\\s*,\\s*|\\s+|\\n|,") |>
+ unlist()
+ length(accnums) > 1
+ },
+ "MSA" = {
+ str_count(string = sequence_upload_data@seqs, ">") > 1
+ },
+ FALSE # Default to FALSE if no valid type is found
+ )
+
+        if (!is_valid_phylo) {
+            return(paste0("Error: At least two sequences/identifiers are ",
+                "required for phylogenetic analysis."))
+        }
+ }
+ # Fasta-like submissions
+ if (upload_type %in% fasta_set) {
+ # Can have any combination of select options
+ type <- ""
+ script <- ""
+ postfix <- ""
+
+ if (acc_homology_analysis && !phylo && acc_da_analysis) {
+ type <- "full"
+ postfix <- "full"
+ }
+ # Phylogenetic analysis, do full script but skip blast
+ else if (phylo && acc_da_analysis) {
+ type <- "phylo"
+ postfix <- "phylo"
+ }
+ else if (acc_da_analysis) {
+ type <- "da"
+ postfix <- "da"
+ }
+ else if (acc_homology_analysis) {
+ # Only run BLAST
+ type <- "dblast"
+ postfix <- "dblast"
+ }
+        else {
+            # Something went wrong, throw error
+            stop("Please select one of the analyses (full, phylo, da, ",
+                "dblast) to run.")
+        }
+ }
+
+    # After uploading, confirm that sequence data is actually present
+ if (sequence_upload_data@seqs == "") {
+ stop("Error: Please upload a protein sequence.")
+ }
+ OUT_PATH <- getwd()
+ unavailable_pins <- list.files(OUT_PATH)
+ unavailable_pins <- strsplit_vect(unavailable_pins, pattern = "_")
+ pinName <- rand_string(length = 6, post_fix = "", ignorelist = unavailable_pins)
+ pin_id <- strtrim(pinName, 6)
+
+    dir <- paste0(OUT_PATH, "/", pin_id, "_", postfix)
+ path <- paste0(dir, "/", pin_id, ".fa")
+    # Validation and setup based on upload type
+    dir.create(dir, recursive = TRUE)
+ switch(upload_type,
+ "Fasta" = {
+ # Validate sequence limit
+            if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                stop("Error: Only submissions with fewer than 200 proteins ",
+                    "are accepted at this time. For analyses with more than ",
+                    "200 proteins please contact janani.ravi@cuanschutz.edu.")
+            }
+
+ # Validate accession numbers for FASTA submission
+ if (!(validateAccNumFasta(sequence_upload_data@seqs))) {
+ stop("Error: Please adjust the FASTA headers. Ensure a header
+ line for each sequence, no duplicate header names, and
+ no duplicate protein accession numbers.")
+ }
+
+ # TRY load fasta for fasta validation
+ is_valid_aa_fasta <- tryCatch(
+ expr = {
+ tmp_file <- tempfile()
+ writeLines(sequence_upload_data@seqs, tmp_file)
+ validateFasta(tmp_file)
+ },
+ error = function(e) {
+ warning("Error: Failed to run input FASTA verification.")
+ return(FALSE) # Return FALSE if an error occurs
+ },
+ finally = {
+ unlink(tmp_file)
+ }
+ )
+
+ # Validate AA fasta
+            if (!(is_valid_aa_fasta)) {
+                stop("Error: The FASTA input could not be recognized as valid ",
+                    "Amino Acid (AA) sequence(s). MolEvolvR only accepts ",
+                    "FASTA-formatted AA/protein sequences as input (not DNA/RNA). ",
+                    "For a short reference on FASTA format, review this NCBI ",
+                    "guide: https://blast.ncbi.nlm.nih.gov/doc/blast-topics/short")
+            }
+
+ write(sequence_upload_data@seqs, path)
+ },
+ "AccNum" = {
+ # Convert Acc to FASTA
+ accnum_vect <- unlist(strsplit(sequence_upload_data@seqs, "\\s*,\\s*|\\s+|\\n|,"))
+            if (length(accnum_vect) > 200) {
+                stop("Error: Only submissions with fewer than 200 proteins are ",
+                    "accepted at this time. For analyses with more than 200 ",
+                    "proteins please contact janani.ravi@cuanschutz.edu.")
+            }
+
+ # if an error is raised, validation fails
+ validation_results <- purrr::map_lgl(
+ accnum_vect,
+ function(accnum) {
+ tryCatch(
+ expr = {
+ tmp <- tempfile(
+                            pattern = paste0("molevolvr_accnum_validation-", accnum, "-", "XXXXX"),
+ fileext = ".fa"
+ )
+ acc2FA(accnum, tmp)
+ readAAStringSet(tmp)
+ TRUE
+ },
+ error = function(e) {
+ FALSE
+ },
+ finally = {
+ suppressWarnings(sink())
+ unlink(tmp)
+ }
+ )
+ }
+ )
+
+ # If any accession numbers fail, halt submission
+ if (!all(validation_results)) {
+ stop("Error: MolEvolvR could not locate sequences for the
+ following accession numbers:
+ [ {paste0(accnum_vect[which(!validation_results)], collapse = ', ')} ]
+ Please try submitting FASTA sequences instead.")
+ } else {
+ # Write a multifasta
+ acc2FA(accnum_vect, outpath = path)
+ }
+ },
+ "MSA" = {
+ # Validate sequence limit
+            if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                stop("Error: Only submissions with fewer than 200 proteins ",
+                    "are accepted at this time. For analyses with more than ",
+                    "200 proteins please contact janani.ravi@cuanschutz.edu.")
+            }
+
+ msa_temp <- str_replace_all(sequence_upload_data@seqs, "-", "")
+ write(msa_temp, path)
+ }
+ )
+
+ phylo <- if_else(phylo, "TRUE", "FALSE")
+
+ # Clean FASTA headers
+ fasta <- Biostrings::readAAStringSet(filepath = path)
+ headers_original <- names(fasta)
+ headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x))
+ fasta <- cleanFAHeaders(fasta)
+ headers_cleaned <- names(fasta)
+
+ # Write a table to map original accnums to their cleaned version
+ readr::write_tsv(
+ x = tibble::tibble(
+ "header_original" = headers_original,
+ "header_accnum" = headers_accnum,
+ "header_clean" = headers_cleaned
+ ),
+ file = file.path(dir, "query-fasta_header-map.tsv")
+ )
+
+ # Remove '*' characters which are incompatible with interproscan
+ fasta <- gsub(pattern = "\\*", replacement = "X",
+ x = fasta) |> Biostrings::AAStringSet()
+ Biostrings::writeXStringSet(x = fasta, filepath = path)
+
+ if (domain_split) {
+ submit_split_by_domain(
+ dir = dir,
+ sequences = path,
+ DB = blast_db,
+ NHITS = blast_hits,
+ EVAL = blast_eval,
+ phylo = phylo,
+ type = type,
+            job_code = pin_id
+        )
+ } else {
+ runFull(
+ dir = dir,
+ sequences = path,
+ DB = blast_db,
+ NHITS = blast_hits,
+ EVAL = blast_eval,
+ phylo = phylo,
+ type = type,
+ job_code = pin_id,
+ APPLICATION = query_iprDatabases
+ )
+ }
+
+ # Additional handling for different upload types
+ if (upload_type == "BLAST Output") {
+        dir <- paste0(OUT_PATH, "/", pin_id, "_blast")
+ if (blast_upload_data@df == "") {
+ stop("Error: Please upload a BLAST file.")
+ }
+        if (blast_upload_data@seqs == "" && !blast_ncbi_check) {
+            stop("Error: Please provide a file containing sequences, or set ",
+                "blast_ncbi_check = TRUE to fetch sequences by NCBI accession number.")
+        }
+
+        if (tools::file_ext(blast_upload_data@df) == "tsv") {
+ data <- read_tsv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+ } else {
+ data <- read_csv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+ }
+
+        if (nrow(data) > 5000) {
+            stop("Error: BLAST output submissions are limited to 5000 proteins ",
+                "at the moment. For analyses with more than 5000 proteins please ",
+                "contact janani.ravi@cuanschutz.edu.")
+        }
+
+        dir.create(dir, recursive = TRUE)
+ blast_path <- paste0(dir, "/", pin_id, ".wblast.tsv")
+ write_tsv(data, blast_path, col_names = FALSE)
+
+ queries <- data$Query %>% unique()
+ writeLines(queries, paste0(dir, "/", "accs.txt"))
+
+ if (blast_ncbi_check) {
+ submit_blast(
+ dir = dir,
+ blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+ seqs = paste0(dir, "/", "seqs.fa"),
+ ncbi = TRUE,
+ job_code = pin_id,
+ submitter_email = notify_email,
+                advanced_options = advanced_options
+ )
+ } else {
+ seqs <- read_file(blast_upload_data@seqs)
+ writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+ submit_blast(
+ dir = dir,
+ blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+ seqs = paste0(dir, "/", "seqs.fa"),
+ ncbi = FALSE,
+ job_code = pin_id,
+ submitter_email = notify_email,
+                advanced_options = advanced_options
+ )
+ }
+
+ } else if (upload_type == "InterProScan Output") {
+        dir <- paste0(OUT_PATH, "/", pin_id, "_ipr")
+ path <- paste0(dir, "/", pin_id, "_ipr.tsv")
+        if (ipr_upload_data@df == "") {
+            stop("Error: Please upload an InterProScan file.")
+        }
+        if (ipr_upload_data@seqs == "" && !ipr_ncbi_check) {
+            stop("Error: Please provide a file containing sequences, or set ",
+                "ipr_ncbi_check = TRUE to fetch sequences by NCBI accession number.")
+        }
+
+        ipr <- read_tsv(ipr_upload_data@df, col_names = FALSE)
+        if (nrow(ipr) > 200) {
+            stop("Error: Only submissions with fewer than 200 proteins are ",
+                "accepted at this time. For analyses with more than 200 proteins ",
+                "please contact janani.ravi@cuanschutz.edu.")
+        }
+
+        dir.create(dir, recursive = TRUE)
+        writeLines("AccNum\tSeqMD5Digest\tSLength\tAnalysis\tDB.ID\tSignDesc\tStartLoc\tStopLoc\tScore\tStatus\tRunDate\tIPRAcc\tIPRDesc\tGOTerms\tExtra", path)
+ write_tsv(ipr, path, col_names = FALSE, append = TRUE)
+
+        ncbi <- ipr_ncbi_check
+        blast <- acc_homology_analysis
+
+ if (ncbi) {
+ submit_ipr(
+ dir = dir,
+ ipr = path,
+ blast = acc_homology_analysis,
+ seqs = paste0(dir, "/", "seqs.fa"),
+ ncbi = TRUE,
+ DB = blast_db,
+ NHITS = blast_hits,
+                EVAL = blast_eval
+            )
+ } else {
+            seqs <- read_file(ipr_upload_data@seqs)
+ writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+ submit_ipr(
+ dir = dir,
+ ncbi = FALSE,
+ ipr = path,
+ blast = acc_homology_analysis,
+ seqs = paste0(dir, "/", "seqs.fa"),
+ DB = blast_db,
+ NHITS = blast_hits,
+                EVAL = blast_eval
+            )
+ }
+
+ }
+
+    # Process results for the report
+ fetched <- processWrapperDir(dir, pinName = pinName, type = type)
+
+ # Assign fetched data to objects if the fetched list has the expected length
+ if (length(fetched) == 2) {
+ data <- fetched[[1]] # Assign the first element to 'data'
+ app_data <- fetched[[1]] # Assign the same to 'app_data'
+ query_data <- fetched[[2]] # Assign the second element to 'query_data'
+
+ r_nrow_initial <- nrow(fetched[[1]]@df) # Initialize row count
+ }
+
+ domarch_cols_value <- getDomArchCols(app_data, DASelect)
+
+ query_domarch_cols_value <- getQueryDomArchCols(query_data@df)
+
+ mainTable_value <- getDataTable(data)
+
+ queryDataTable_value <- getQueryDataTable(query_data, query_select)
+
+ fastaDataText_value <- getFastaData(query_data@fasta_path)
+
+ domainDataText_value <- getDomData(data)
+
+ msaDataText_value <- getMSAData(query_data@msa_path)
+
+ rs_IprGenes_value <- getIPRGenesVisualization(data,
+ app_data,
+ query_iprDatabases,
+ query_iprVisType)
+
+ rs_network_layout_value <- getRSNetworkLayout(data,
+ app_data,
+ cutoff = 100,
+ layout = "nice")
+
+ rs_data_table_value <- getDataTable(data)
+
+ da_IprGenes_value <- getDomArchIPRGenesPlot(app_data,
+ query_iprDatabases,
+ query_iprVisType,
+ DASelect)
+
+ query_heatmap_value <- getQueryHeatmap(query_data@df,
+ heatmap_select = "All",
+ heatmap_color = "blue")
+
+ DA_Prot_value <- getDomArchProt(app_data, DASelect)
+
+    DALinPlot_value <- getDomArchHeatmapPlot(DA_col = DA_Col,
+ DACutoff,
+ DA_Prot_value,
+ DA_lin_color = "viridis",
+ app_data@ipr_path)
+
+ DALin_TotalCounts_value <- getDomArchTotalCounts(DA_Prot_value,
+ DACutoff,
+                                                    DA_col = DA_Col,
+ app_data)
+
+    DALinTable_value <- getDomArchLinearTable(DA_col = DA_Col,
+ app_data@ipr_path,
+ DALin_TotalCounts_value)
+
+    DANetwork_value <- getDomNetwork(DA_col = DA_Col,
+ DACutoff,
+ DA_Prot_value,
+ networkLayout = "nice",
+ app_data@ipr_path)
+
+ rep_accnums_value <- repAccNums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data)
+
+ phylogeny_prot_value <- filterPhylogenyProteins(app_data, phylo_select)
+
+ acc_to_name_value <- acc2Name(app_data)
+
+ ####### Report Generation ########
+
+ tryCatch({
+ tempReport <- file.path(dir, "report.Rmd")
+ file.copy(report_template_path, tempReport, overwrite = TRUE)
+
+ # List of graphics to include in report
+ params <- list(
+ rs_interproscan_visualization = rs_IprGenes_value,
+ proximity_network = rs_network_layout_value,
+ sunburst = data@df,
+ data = rs_data_table_value,
+ queryDataTable = queryDataTable_value,
+ fastaDataText = fastaDataText_value,
+ heatmap = query_heatmap_value,
+ query_data = query_data,
+ query_domarch_cols = query_domarch_cols_value,
+ query_iprDatabases = query_iprDatabases,
+ query_iprVisType = query_iprVisType,
+ mainTable = mainTable_value,
+ DALinTable = DALinTable_value,
+ DALinPlot = DALinPlot_value,
+ DANetwork = DANetwork_value,
+ DA_Prot = DA_Prot_value,
+ domarch_cols = domarch_cols_value,
+ DA_Col = DA_Col,
+ DACutoff = DACutoff,
+ da_interproscan_visualization = da_IprGenes_value,
+ phylo_sunburst_levels = levels,
+ phylo_sunburst = phylogeny_prot_value,
+ tree_msa_tool = tree_msa_tool,
+ repAccNums = rep_accnums_value,
+            msa_rep_num = msa_rep_num,
+ app_data = app_data,
+ PhyloSelect = PhyloSelect,
+ acc2Name = acc_to_name_value,
+ rval_phylo = rval_phylo,
+ msa_reduce_by = msa_reduce_by
+ )
+
+ # Render RMarkdown report
+ rmarkdown::render(tempReport,
+ output_file = output_file,
+ params = params,
+ envir = new.env(parent = globalenv()))
+ browseURL(output_file)
+ }, error = function(e) {
+ return(paste("Error in report generation:", e$message))
+ })
+
+    return("Initialization, upload, and report generation completed successfully.")
+}
+
diff --git a/R/ipr2viz.R b/R/ipr2viz.R
index e582ab09..51650761 100644
--- a/R/ipr2viz.R
+++ b/R/ipr2viz.R
@@ -79,7 +79,7 @@ themeGenes2 <- function() {
#' n = 20, query = "specific_query_name")
#' }
getTopAccByLinDomArch <- function(infile_full,
- DA_col = "DomArch.Pfam",
+ DA_col = "DomArch.PfamA",
lin_col = "Lineage_short",
n = 20,
query) {
@@ -94,14 +94,14 @@ getTopAccByLinDomArch <- function(infile_full,
cln_domarch <- cln %>% select(domarch_cols)
col_counts <- colSums(is.na(cln_domarch))
DA_sym <- sym(names(which.min(col_counts)))
- showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations"))
+ # showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations"))
## Group by Lineage, DomArch and reverse sort by group counts
grouped <- cln %>%
group_by({{ DA_sym }}, {{ lin_sym }}) %>%
arrange(desc(PcPositive)) %>%
summarise(count = n(), AccNum = dplyr::first(AccNum)) %>%
arrange(-count) %>%
- filter({{ lin_sym }} != "" && {{ DA_sym }} != "")
+ filter({{ lin_sym }} != "" & {{ DA_sym }} != "")
top_acc <- grouped$AccNum[1:n]
top_acc <- na.omit(top_acc)
return(top_acc)
@@ -180,14 +180,14 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = MolEvolvR::iprscan_cols)
ipr_out <- ipr_out %>% filter(.data$Name %in% accessions)
analysis_cols <- paste0("DomArch.", analysis)
- infile_full <- infile_full %>% select(.data$analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum)
+ infile_full <- infile_full %>% select(analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum)
## To filter by Analysis
analysis <- paste(analysis, collapse = "|")
## @SAM: This can't be set in stone since the analysis may change!
## Getting top n accession numbers using getTopAccByLinDomArch()
top_acc <- getTopAccByLinDomArch(
infile_full = infile_full,
- DA_col = "DomArch.Pfam",
+ DA_col = "DomArch.PfamA",
## @SAM, you could pick by the Analysis w/ max rows!
lin_col = "Lineage_short",
n = topn, query = query
@@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
analysis_labeler <- analyses %>%
pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)
- lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv"
+ lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc
@@ -360,7 +360,7 @@ plotIPR2VizWeb <- function(infile_ipr,
## @SAM, colnames, merges, everything neeeds to be done now based on the
## combined lookup table from "common_data"
- lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv"
+ lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE)
lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols)
## Read IPR file and subset by Accessions
diff --git a/R/networks_domarch.R b/R/networks_domarch.R
index ae6fe8be..b171b3a8 100755
--- a/R/networks_domarch.R
+++ b/R/networks_domarch.R
@@ -24,20 +24,20 @@
#' A network of domains is returned based on shared domain architectures.
#'
#' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes
+#' @param column Name of column containing Domain architecture from which nodes
#' and edges are generated.
#' @param domains_of_interest Character vector specifying domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
#' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
#' Lineage.
#' @param layout Character. Layout type to be used for the network. Options are:
#' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Character. Color to represent the queried domain in the
+#' @param query_color Character. Color to represent the queried domain in the
#' network.
#'
#' @importFrom dplyr across add_row all_of distinct filter mutate pull select
-#' @importFrom igraph delete_vertices graph_from_edgelist vertex
+#' @importFrom igraph delete_vertices graph_from_edgelist vertex V
#' @importFrom purrr map
#' @importFrom rlang sym
#' @importFrom shiny showNotification
@@ -174,21 +174,21 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
if ("X" %in% V(g)$name) {
g <- delete_vertices(g, "X")
}
-    V(g)$size <- as.numeric(wc[V(g)$name])
-    V(g)$size <- (V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 20 + 10 # scaled by degree
+    igraph::V(g)$size <- as.numeric(wc[igraph::V(g)$name])
+    igraph::V(g)$size <- (igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 20 + 10 # scaled by degree
# setting vertex color by size
- V(g)$color <- rainbow(5, alpha = .5)[round((V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 4 + 1)]
- V(g)$frame.color <- V(g)$color
+ igraph::V(g)$color <- rainbow(5, alpha = .5)[round((igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 4 + 1)]
+ igraph::V(g)$frame.color <- igraph::V(g)$color
# scaling edge width
- E(g)$width <- e.sz
- E(g)$width <- ifelse(log(E(g)$width) == 0, .3, log(E(g)$width))
+ igraph::E(g)$width <- e.sz
+ igraph::E(g)$width <- ifelse(log(igraph::E(g)$width) == 0, .3, log(igraph::E(g)$width))
# coloring edges by width
ew <- c(2.7, 4.5)
- E(g)$color <- sapply(
- E(g)$width,
- function(x) if (x >= ew[1] && x <= ew[2]) E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) E(g)$color <- adjustcolor("maroon", alpha.f = .5) else E(g)$color <- "gray55"
+ igraph::E(g)$color <- sapply(
+ igraph::E(g)$width,
+ function(x) if (x >= ew[1] && x <= ew[2]) igraph::E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) igraph::E(g)$color <- adjustcolor("maroon", alpha.f = .5) else igraph::E(g)$color <- "gray55"
)
vis_g <- visIgraph(g, type = "full")
}
@@ -211,7 +211,7 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
visOptions(highlightNearest = TRUE)
},
error = function(e) {
- showNotification(toString(e))
+ # showNotification(toString(e))
vis_g <- "error"
},
finally = {
@@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c
#'
#'
#' @param prot A data frame that contains the column 'DomArch'.
-#' @param column Name of column containing Domain architecture from which nodes
+#' @param column Name of column containing Domain architecture from which nodes
#' and edges are generated.
#' @param domains_of_interest Character vector specifying the domains of interest.
-#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
+#' @param cutoff Integer. Only use domains that occur at or above the cutoff for
#' total counts if cutoff_type is "Total Count".
-#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
+#' Only use domains that appear in cutoff or greater lineages if cutoff_type is
#' Lineage.
#' @param layout Character. Layout type to be used for the network. Options are:
#' \itemize{\item "grid" \item "circle" \item "random" \item "auto"}
-#' @param query_color Color that the nodes of the domains in the
+#' @param query_color Color that the nodes of the domains in the
#' domains_of_interest vector are colored
-#' @param partner_color Color that the nodes that are not part of the
+#' @param partner_color Color that the nodes that are not part of the
#' domains_of_interest vector are colored
#' @param border_color Color for the borders of the nodes.
#' @param IsDirected Is the network directed? Set to false to eliminate arrows
diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R
index 75cc375d..6e2e8fd7 100644
--- a/R/pre-msa-tree.R
+++ b/R/pre-msa-tree.R
@@ -676,7 +676,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") {
id = accessions_partitioned[[x]],
db = "protein",
rettype = "fasta",
- api_key = Sys.getenv("ENTREZ_API_KEY")
+ # api_key = Sys.getenv("ENTREZ_API_KEY")
)
)
})
@@ -809,9 +809,9 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) {
abort("Error: The FASTA file does not exist: ", fasta_file)
}
- if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
- abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
- }
+ # if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") {
+ # abort("Error: The specified file is not a valid FASTA file: ", fasta_file)
+ # }
fasta <- readAAStringSet(fasta_file)
aligned <- switch(tool,
@@ -869,11 +869,11 @@ writeMSA_AA2FA <- function(alignment, outpath) {
abort("Error: The output directory does not exist: ", outdir)
}
- l <- length(rownames(alignment))
+    l <- length(names(unmasked(alignment)))
fasta <- ""
for (i in 1:l)
{
- fasta <- paste0(fasta, paste(">", rownames(alignment)[i]), "\n")
+ fasta <- paste0(fasta, paste(">", names(unmasked((alignment)))[i]), "\n")
seq <- toString(unmasked(alignment)[[i]])
fasta <- paste0(fasta, seq, "\n")
}
@@ -955,4 +955,4 @@ getAccNumFromFA <- function(fasta_file) {
# cfile <- read_delim("data/alignments/pspc.gismo.aln", delim=" ")
# cfile <- as.data.frame(map(cfile,function(x) gsub("\\s+", "",x)))
# colnames(cfile) <- c("AccNum", "Alignment")
-# }
+# }
\ No newline at end of file
diff --git a/R/run_pipeline.R b/R/run_pipeline.R
new file mode 100644
index 00000000..fe1fa4c9
--- /dev/null
+++ b/R/run_pipeline.R
@@ -0,0 +1,1343 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# import libs
+#' @import readr data.table httr rentrez iprscanr
+
+getSeqs <- function(sequences,
+ acc_file_path = "accs.txt",
+ dir_path = "~",
+ separate = TRUE) {
+ seqs <- readAAStringSet(sequences)
+ cln_names <- c()
+ for (accnum in names(seqs)) {
+ if (grepl("\\|", accnum)) {
+ accnum_cln <- strsplit(accnum, "\\|")[[1]][2]
+ accnum_cln <- strsplit(accnum_cln, " ")[[1]]
+ } else {
+ accnum_cln <- strsplit(accnum, " ")[[1]][1]
+ }
+ cln_names <- append(cln_names, accnum_cln)
+ write(accnum_cln, file = acc_file_path, append = TRUE)
+ if (separate) {
+ write(paste0(dir_path, "/", accnum_cln, ".faa"),
+ file = "input.txt", append = TRUE)
+ write(paste0(">", accnum_cln),
+ file = paste0(accnum_cln, ".faa"), append = TRUE)
+ write(toString(seqs[accnum]),
+ file = paste0(accnum_cln, ".faa"), append = TRUE)
+ }
+ }
+ names(seqs) <- cln_names
+ writeXStringSet(seqs, sequences, format = "fasta")
+ return(length(seqs))
+}
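+
+# Usage sketch (hedged; the paths are illustrative, not fixtures shipped with
+# the package): clean the headers in ~/test.fa down to accessions, write them
+# to accs.txt, and emit one <accession>.faa per sequence plus an input.txt
+# manifest of those file paths.
+# n_seqs <- getSeqs("~/test.fa", acc_file_path = "accs.txt",
+#                   dir_path = "~", separate = TRUE)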
+runFull <- function(
+ dir = "/data/scratch",
+ DB = Sys.getenv("BLAST_DB", unset = "refseq"),
+ NHITS = Sys.getenv("BLAST_HITS", unset = 100),
+ EVAL = Sys.getenv("BLAST_EVALUE", unset = 0.00001),
+ sequences = "~/test.fa",
+ phylo = "FALSE",
+ by_domain = "FALSE",
+ domain_starting = "~/domain_seqs.fa",
+ type = "full",
+    job_code = NULL,
+    submitter_email = NULL,
+    advanced_options = NULL,
+    get_slurm_mails = FALSE,
+ APPLICATION
+) {
+ # Set working directory
+ setwd(dir)
+
+ advanced_options_names <- names(advanced_options[advanced_options == TRUE])
+
+ # Write job submission params to file
+ job_args <- list(
+ submission_type = type,
+ database = ifelse(phylo == FALSE, DB, NA),
+ nhits = ifelse(phylo == FALSE, NHITS, NA),
+ evalue = ifelse(phylo == FALSE, EVAL, NA),
+ submitter_email = submitter_email,
+ advanced_options = advanced_options_names,
+ job_code = job_code
+ )
+ yml <- yaml::as.yaml(job_args)
+ write(yml, "job_args.yml")
+
+ # Create a log file
+ write("START_DT\tSTOP_DT\tquery\tdblast\tacc2info\tdblast_cleanup\tacc2fa
+ \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2DomArch\tduration",
+ "logfile.tsv")
+
+ # Process sequences (local handling)
+ if (phylo == "FALSE") {
+ # Split the sequences if needed, store them locally
+ num_seqs <- getSeqs(sequences, dir_path = dir, separate = TRUE)
+
+ fasta <- Biostrings::readAAStringSet(sequences)
+ headers_original <- names(fasta)
+ headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x))
+
+ # Execute BLAST locally (instead of submitting jobs to the cluster)
+ for (i in 1:num_seqs) {
+ # Assume each sequence is saved separately
+ input_file <- paste0(dir, "/", headers_accnum[i], ".faa")
+ # output_file <- paste0(dir, "/blast_output_", i, ".txt")
+
+ # Construct the local BLAST command (make sure 'blastn' is available locally)
+ runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i, APPLICATION = APPLICATION)
+
+ #cmd <- sprintf(
+ # "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote",
+ # input_file, DB, output_file, NHITS, EVAL
+ #)
+
+ # Execute BLAST locally
+ #system(cmd)
+
+ cat(sprintf("BLAST for sequence %d completed.\n", i), file=stderr())
+ }
+ } else {
+ # Handle phylogenetic analysis if needed
+ cat("Phylogenetic analysis is not supported in the local version yet.\n")
+ }
+
+ # Simulate query run locally
+ runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type, APPLICATION = APPLICATION)
+ # cmd_query <- sprintf(
+ # "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote",
+ # sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL
+ # )
+ # system(cmd_query)
+
+ cat("Query analysis completed.\n")
+
+ # Status update
+ num_runs <- num_seqs + 1
+ write(paste0("0/", num_runs, " analyses completed"), "status.txt")
+
+ cat("All analyses completed locally.\n")
+}
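+
+# Example invocation (hedged sketch mirroring the defaults above, not a
+# tested run); APPLICATION is forwarded to InterProScan as the set of
+# member-database analyses:
+# runFull(
+#   dir = "/data/scratch",
+#   sequences = "~/test.fa",
+#   type = "full",
+#   job_code = "demo01",
+#   APPLICATION = c("Pfam", "Phobius", "Gene3D")
+# )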
+
+# Define the main pipeline function
+runMolevolvrPipeline <- function(input_paths, db, nhits, eval,
+ is_query, type, i, APPLICATION) {
+
+ # Start time
+ start <- Sys.time()
+ OUTPATH <- getwd() # Set output directory to the current working directory
+
+ # If IS_QUERY is True, handle query data
+ if (is_query == TRUE) {
+
+ FILE <- input_paths
+ PREFIX <- "query_data"
+ OUTDIR <- file.path(OUTPATH, PREFIX)
+ dir.create(OUTDIR, showWarnings = FALSE) # Create output directory
+
+ all_accnums_file <- file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa"))
+ file.copy(FILE, all_accnums_file)
+
+ # Parse accession numbers
+ accnums_file <- file.path(OUTPATH, "query-fasta_header-map.tsv")
+ parsed_accnums_file <- file.path(OUTDIR, "parsed_accnums.txt")
+
+ if (file.exists(accnums_file)) {
+ parsed_accnums <- read.table(accnums_file,
+ header = FALSE, sep = "\t",
+ stringsAsFactors = FALSE)
+ writeLines(parsed_accnums$V2[-1], parsed_accnums_file) # Skip header
+ }
+
+ # Copy starting_accs.txt if exists, or fallback to accs.txt
+ if (file.exists("starting_accs.txt")) {
+ file.copy("starting_accs.txt",
+ file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")))
+ } else {
+ file.copy("accs.txt",
+ file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")))
+ file.copy(parsed_accnums_file,
+ file.path(OUTDIR, paste0(PREFIX, ".parsed_accnums.txt")))
+ }
+
+ # setwd(OUTDIR)
+
+ # Run acc2info
+ runAcc2Info(parsed_accnums_file, PREFIX, OUTDIR)
+
+ replaceAccNums(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
+ file.path(OUTPATH, "query-fasta_header-map.tsv"),
+ file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")))
+
+ file.copy(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
+ file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv")))
+
+ } else {
+ # Handle homolog data
+ FILE <- readLines(input_paths)[1] # Assume single file for local usage
+ F_value <- basename(FILE)
+ PREFIX <- sub("\\.faa$", "", basename(FILE))
+ PREFIX <- gsub(">", "", PREFIX) # Extract prefix from file name
+ OUTDIR <- file.path(OUTPATH, paste0(PREFIX))
+ dir.create(OUTDIR, showWarnings = FALSE) # Create output directory
+ # setwd(OUTDIR)
+
+ # Run DELTABLAST
+ runDELTABLAST(input_paths, PREFIX, OUTDIR, db, nhits, eval)
+
+ # Run ACC2FA
+ convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
+ PREFIX, OUTDIR)
+
+ # Run ACC2INFO
+ runAcc2Info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")),
+ PREFIX, OUTDIR)
+
+ # Clean up BLAST results
+ cleanupBlast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")),
+ file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")),
+ PREFIX, F)
+
+ }
+
+ # Sys.sleep(30)
+
+ # Run BLASTCLUST
+ runCDHIT(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+ PREFIX, OUTDIR )
+
+ # Convert clusters to table
+ clust2Table(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")),
+ file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv")))
+
+ # Run INTERPROSCAN
+ runIPRScan2(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")),
+ prefix = PREFIX, outdir = OUTDIR, appl = APPLICATION)
+
+ # Run IPR2LIN
+ ipr2Linear(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")),
+ file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX)
+
+
+ # Optionally run ipr2DomArch if IS_QUERY is true
+ if (is_query == TRUE) {
+ ## perform ipr2DomArch on iprscan results
+ da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
+ PREFIX, "NA")
+
+        ## if iprscan results exist, copy them over as the full analysis
+        if (!file.exists(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) ||
+            is.na(PREFIX)) {
+            print("No iprscan results found, moving on.")
+ } else {
+ file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")),
+ file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv")))
+ }
+ } else {
+ # perform ipr2DomArch on iprscan results
+ da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")),
+ PREFIX)
+
+        ## if iprscan results exist, append them to the blast results
+        if (!file.exists(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) ||
+            is.na(PREFIX)) {
+            print("No iprscan results found, moving on.")
+ } else {
+ appendIPR(ipr_da = da,
+ blast = file.path(OUTDIR, paste0(PREFIX, ".cln.clust.tsv")),
+ prefix = PREFIX)
+ }
+ }
+
+ # Copy the input fasta file to the output directory
+ # file.copy(input_paths, OUTDIR)
+
+ # Total run time
+ dur <- difftime(Sys.time(), start, units = "secs")
+ cat("\nTotal run time:", dur, "seconds\n")
+
+ # Log the run times
+ logfile_path <- file.path(OUTPATH, "logfile.tsv")
+ log_data <- data.frame(
+ START_DT = format(start, "%d/%m/%Y-%H:%M:%S"),
+ STOP_DT = format(Sys.time(), "%d/%m/%Y-%H:%M:%S"),
+ PREFIX = PREFIX,
+ dur = dur
+ )
+
+ write.table(log_data, file = logfile_path, sep = "\t",
+ row.names = FALSE, col.names = FALSE,
+ append = TRUE, quote = FALSE)
+}
+
+# Define the acc2info function
+acc2info <- function(infile, prefix, outdir) {
+ # Ensure output directory exists
+ if (!dir.exists(outdir)) {
+ dir.create(outdir, recursive = TRUE)
+ }
+
+ outfile <- file.path(outdir, paste0(prefix, ".acc2info.tsv"))
+
+ # Read the input file of accession numbers
+ acc_nums <- readLines(infile)
+
+ # Random sleep to avoid overloading the server
+ # Sys.sleep(sample(1:10, 1))
+
+ # EPOST to get WebEnv and QueryKey
+ epost_response <- entrez_post(db = "protein", id = acc_nums)
+ webenv <- epost_response$WebEnv
+ query_key <- epost_response$QueryKey
+ # EFETCH to get the document summaries
+ docsums <- entrez_summary(db = "protein", web_history = epost_response)
+
+ # Check if any atomic values exist in docsums
+ any_atomic <- FALSE
+ for (docsum in docsums) {
+ # Check if any value in the current docsum is atomic
+ if (is.atomic(docsum)) {
+ any_atomic <- TRUE
+ break # Exit loop early if an atomic value is found
+ }
+ }
+
+ if (any_atomic) {
+ parsed_data <- data.frame(
+ AccNum = docsums$oslt$value,
+ AccNum.noV = docsums$caption,
+ FullAccNum = docsums$extra,
+ Description = docsums$title,
+ Length = docsums$slen,
+ TaxID = docsums$taxid,
+ Species = docsums$organism,
+ SourceDB = ifelse(is.null(docsums$sourcedb), NA,
+ docsums$sourcedb),
+ Completeness = ifelse(is.null(docsums$completeness), NA,
+ docsums$completeness),
+ stringsAsFactors = FALSE # Avoid factors in data frame
+ )
+ } else {
+ # If no atomic values exist, use lapply to parse each docsum
+ parsed_data <- do.call(rbind, lapply(docsums, function(docsum) {
+ tryCatch({
+ data.frame(
+ AccNum = docsum$oslt$value,
+ AccNum.noV = docsum$caption,
+ FullAccNum = docsum$extra,
+ Description = docsum$title,
+ Length = docsum$slen,
+ TaxID = docsum$taxid,
+ Species = docsum$organism,
+ SourceDB = ifelse(is.null(docsum$sourcedb), NA,
+ docsum$sourcedb),
+ Completeness = ifelse(is.null(docsum$completeness), NA,
+ docsum$completeness),
+ stringsAsFactors = FALSE # Avoid factors in data frame
+ )
+ }, error = function(e) {
+ # Return a data frame filled with NA values in case of an error
+ return(data.frame(
+ AccNum = NA,
+ AccNum.noV = NA,
+ FullAccNum = NA,
+ Description = NA,
+ Length = NA,
+ TaxID = NA,
+ Species = NA,
+ SourceDB = NA,
+ Completeness = NA,
+ stringsAsFactors = FALSE
+ ))
+ })
+ }))
+ }
+
+  # Check whether NCBI returned any data
+ if (nrow(parsed_data) == 0) {
+ cat("No data found for accession numbers in NCBI. Trying UniProt...\n")
+
+ # Split input file into smaller chunks
+ split_files <- split(acc_nums, ceiling(seq_along(acc_nums) / 100))
+
+ for (i in seq_along(split_files)) {
+ accnum <- paste(split_files[[i]], collapse = ",") # Join with comma
+ url <- paste0("https://www.ebi.ac.uk/proteins/api/proteins?accession=",
+ accnum)
+
+ response <- GET(url, accept("application/xml"))
+
+ if (status_code(response) == 200) {
+ xml_content <- content(response, as = "text")
+
+ # Parse the XML response
+ parsed_uniprot <- xml2::read_xml(xml_content)
+
+ # Extract required elements (customize based on XML structure)
+ entries <- xml2::xml_find_all(parsed_uniprot, "//entry")
+
+ for (entry in entries) {
+ accession <- xml2::xml_text(xml2::xml_find_first(entry,
+ ".//accession"))
+ full_name <- xml2::xml_text(xml2::xml_find_first(entry,
+ ".//fullName"))
+ length_seq <- xml2::xml_attr(xml2::xml_find_first(entry,
+ ".//sequence"),
+ "length")
+ db_reference <- xml2::xml_attr(xml2::xml_find_first(entry,
+ ".//dbReference"),
+ "id")
+ dataset <- xml2::xml_attr(entry, "dataset")
+ name <- xml2::xml_text(xml2::xml_find_first(entry, ".//name"))
+
+ # Create a row of data
+ new_row <- data.frame(AccNum = accession,
+ AccNum.noV = gsub("\\|", "", accession),
+ FullAccNum = accession,
+ Description = full_name,
+ Length = as.integer(length_seq),
+ TaxID = db_reference,
+ Species = name,
+ SourceDB = dataset,
+ Completeness = "NA", # Adjust as needed
+ stringsAsFactors = FALSE)
+
+ # Append to the outfile
+ write.table(new_row, file = outfile, sep = "\t", col.names = TRUE,
+ row.names = FALSE, quote = FALSE, append = TRUE)
+ }
+ }
+ }
+ } else {
+ # Append NCBI data to outfile
+ write.table(parsed_data, file = outfile, sep = "\t", col.names = TRUE,
+ row.names = FALSE, quote = FALSE, append = TRUE)
+ }
+
+ cat("Data saved to:", outfile, "\n")
+}
+
+acc2InfoPhylo <- function(infile, outdir) {
+ # Ensure output directory exists
+ if (!dir.exists(outdir)) {
+ dir.create(outdir, recursive = TRUE)
+ }
+
+ outfile <- file.path(outdir, "acc2info.tsv")
+
+ # Create a data frame to store results
+ results <- data.frame(
+ Caption = character(),
+ Extra = character(),
+ Title = character(),
+ Length = character(),
+ TaxID = character(),
+ Organism = character(),
+ SourceDB = character(),
+ Completeness = character(),
+ stringsAsFactors = FALSE
+ )
+
+  # Read the accession numbers from the input file
+  acc_nums <- readLines(infile)
+
+  # Fetch records one accession at a time to avoid overloading the server
+  for (acc in acc_nums) {
+ # Fetch data using epost and efetch
+ response <- GET(sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&format=xml&id=%s",
+ acc))
+
+ # Check if the request was successful
+ if (response$status_code == 200) {
+ # Parse the XML response
+ parsed_xml <- read_xml(content(response, "text"))
+
+ # Extract relevant information
+      doc_summary <- xml2::xml_find_all(parsed_xml, ".//DocumentSummary")
+
+ for (summary in doc_summary) {
+ results <- rbind(results, data.frame(
+          Caption = xml2::xml_text(xml2::xml_find_first(summary, ".//Caption")),
+          Extra = xml2::xml_text(xml2::xml_find_first(summary, ".//Extra")),
+          Title = xml2::xml_text(xml2::xml_find_first(summary, ".//Title")),
+          Length = xml2::xml_text(xml2::xml_find_first(summary, ".//Slen")),
+          TaxID = xml2::xml_text(xml2::xml_find_first(summary, ".//TaxId")),
+          Organism = xml2::xml_text(xml2::xml_find_first(summary, ".//Organism")),
+          SourceDB = xml2::xml_text(xml2::xml_find_first(summary, ".//SourceDb")),
+          Completeness = xml2::xml_text(xml2::xml_find_first(summary, ".//Completeness")),
+ stringsAsFactors = FALSE
+ ))
+ }
+ } else {
+ warning(sprintf("Failed to fetch data for accession %s: %s",
+ acc, response$status_code))
+ }
+ }
+
+ # Write results to the output file
+ write.table(results, file = outfile, sep = "\t", row.names = FALSE,
+ quote = FALSE)
+}
+
+# Main function to run based on the prefix
+runAcc2Info <- function(infile, prefix, outdir) {
+ if (prefix == "NA") {
+ acc2InfoPhylo(infile, outdir)
+ } else {
+ acc2info(infile, prefix, outdir)
+ }
+}
+
+subsAccnum4cc2Info <- function(df_acc2info, df_header_map) {
+ df_result <- df_header_map |>
+ # set column name in header map to match accnum col in acc2info
+ dplyr::rename(AccNum = header_accnum) |>
+ # join onto header map
+ dplyr::left_join(df_acc2info, by = "AccNum") |>
+ # deselect accnum
+ dplyr::select(-AccNum) |>
+ # set the accnum col to the cleaned form
+ dplyr::rename(AccNum = header_clean) |>
+ # rm excess columns from header map file
+ dplyr::select(-header_original)
+ return(df_result)
+}
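+
+# Based on the columns referenced above, the header map is assumed to be a
+# three-column table along these lines (illustrative values):
+#   header_original                  header_accnum    header_clean
+#   >sp|P0A7G6|RECA_ECOLI RecA ...   sp|P0A7G6|...    P0A7G6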
+
+replaceAccNums <- function(path_acc2info,
+ path_query_header_map, path_out) {
+
+ # Read the input files
+ df_acc2info <- read_tsv(path_acc2info)
+ df_query_header_map <- read_tsv(path_query_header_map)
+
+ # Substitute accession numbers
+ df_acc2info_substituted <- subsAccnum4cc2Info(df_acc2info,
+ df_query_header_map)
+
+ # Print the substituted dataframe
+ print("### df_acc2info_substituted")
+ print(df_acc2info_substituted)
+
+ # Write the substituted dataframe to the output file
+ write_tsv(df_acc2info_substituted, file = path_out, col_names = TRUE)
+}
+
+runDELTABLAST <- function(infile, prefix, outdir,
+ db = "refseq_protein",
+ nhits = 5000, evalue = 1e-5,
+ threads = 10) {
+
+  # Prepare output file path
+  outfile <- file.path(outdir, paste0(prefix, ".dblast.tsv"))
+
+ # Print I/O messages
+ cat("\nNow processing:", infile, "\n")
+ cat("Running against:", db, "\n")
+ cat("E-value: ≤", evalue, "\n")
+ cat("Top", nhits, "hits/alignments\n")
+ cat("Output filepath:", outfile, "\n")
+
+ # Designating database based on user input
+ dblast_db <- switch(db,
+ nr = "nr",
+ refseq_protein = "refseq_protein",
+ stop("Invalid database specified.
+ Choose either 'nr' or 'refseq'."))
+
+ # Core command for running DELTABLAST
+ command <- sprintf(
+ "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %s -outfmt '6 qacc sacc sseqid sallseqid stitle sscinames staxids pident length mismatch gapopen qstart qend qlen sstart send slen evalue bitscore ppos' -remote",
+ infile, dblast_db, outfile, nhits, evalue
+ )
+ # Run the DELTABLAST command
+ system(command, intern = TRUE)
+
+ cat("DELTABLAST completed.\n")
+}
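+
+# Note: the 20 fields requested via -outfmt above (qacc sacc sseqid ... ppos)
+# appear to correspond one-to-one to the pre-cleanup cl_blast_colnames vector
+# defined near the bottom of this file, which cleanupBlast() uses to read the
+# table back in.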
+
+# This function converts accession numbers to FASTA via NCBI E-utilities,
+# with EBI's Proteins API as a fallback
+convertAccNum2Fasta <- function(infile, prefix, outdir) {
+
+ # Create the output file path
+ outfile <- file.path(outdir, paste0(prefix, ".all_accnums.fa"))
+
+ # Print status
+ cat("\n####################\n")
+ cat("BEGIN EDIRECT SEARCH\n")
+ cat("####################\n")
+ cat("Processing input file:", infile, "\n")
+
+ # Create temporary files based on the number of columns in input file
+ temp_acc_file <- file.path(outdir, paste0(prefix, ".all_accnums.txt"))
+ cols <- length(strsplit(readLines(infile, n = 1), "\t")[[1]])
+
+ if (cols > 1) {
+ # Extract unique homolog accession numbers from the 2nd column
+ cat("Creating temp file with 2nd column unique accessions...\n")
+ system(sprintf("awk -F '\\t' '{ print $2 }' %s | sort -u > %s", infile, temp_acc_file))
+ } else {
+ # Extract unique homolog accession numbers from the 1st column
+ cat("Creating temp file with 1st column unique accessions...\n")
+ system(sprintf("awk -F '\\t' '{ print $1 }' %s | sort -u > %s", infile, temp_acc_file))
+ }
+
+ # Split accessions into chunks of 1000
+ system(sprintf("split -l 1000 -e %s %s/acc", temp_acc_file, outdir))
+
+ # Fetch FASTA sequences
+ cat("\nObtaining FASTA files\n")
+ acc_files <- list.files(path = outdir, pattern = "^acc", full.names = TRUE)
+
+ for (x in acc_files) {
+ # Accession numbers in a comma-separated format
+ accnum <- paste(readLines(x), collapse = ",")
+ fasta_sequence <- entrez_fetch(db = "protein", id = accnum,
+ rettype = "fasta")
+    # Append the fetched sequences; overwriting here would keep only the
+    # last chunk of accessions
+    write(fasta_sequence, file = outfile, append = TRUE)
+ }
+
+ # Check if any sequences were retrieved
+ num_seqs <- as.numeric(system(sprintf("grep '>' %s | wc -l", outfile),
+ intern = TRUE))
+
+ # If no sequences found, try fetching from EBI API
+ if (num_seqs < 1) {
+ cat("\nNo sequences retrieved. Trying EBI API...\n")
+ for (x in acc_files) {
+ accnum <- paste(readLines(x), collapse = ",")
+ system(sprintf(
+ "curl -X GET --header 'Accept:text/x-fasta' 'https://www.ebi.ac.uk/proteins/api/proteins?accession=%s' >> %s",
+ accnum, outfile
+ ))
+
+ }
+ }
+
+ # Remove temporary files
+ cat("\nRemoving temporary files\n")
+ system(sprintf("rm %s/acc*", outdir))
+
+ # Print end message
+ cat("#####################\n")
+ cat("END OF EDIRECT SEARCH\n")
+ cat("#####################\n")
+}
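+
+# Accessions are fetched in chunks of 1000 because very long id lists make
+# E-utilities requests unwieldy; the EBI Proteins API fallback reuses the
+# same chunk files.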
+
+cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) {
+
+ outdir <- dirname(infile_blast)
+ # Load and clean acc2info file
+ acc2info_out <- fread(input = acc2info, sep = "\t",
+ header = TRUE, fill = TRUE) %>%
+ mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>%
+ mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
+
+ # If using web-blast output
+ if (wblast == TRUE) {
+ blast_out <- fread(input = infile_blast, sep = "\t", header = FALSE,
+ col.names = web_blastp_hit_colnames, fill = TRUE)
+ cleanedup_blast <- blast_out %>%
+ mutate(AccNum = gsub("\\|", "", AccNum)) %>%
+ mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>%
+ mutate(PcIdentity = round(as.double(PcIdentity), 2))
+
+ # Merge cleaned blast output with acc2info
+ cleanedup_blast <- merge(cleanedup_blast, acc2info_out,
+ by.x = "AccNum",
+ by.y = "FullAccNum",
+ all.x = TRUE)
+ names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species"
+
+ # Additional calculations for PcPositive
+ cleanedup_blast <- cleanedup_blast %>%
+ mutate(PcPosOrig = as.numeric(PcPosOrig)) %>%
+ mutate(AlnLength = as.numeric(AlnLength)) %>%
+ mutate(PcPositive = PcPosOrig)
+
+ } else if (wblast == FALSE) {
+ # If using classic-blast output
+ blast_out <- read_tsv(file = infile_blast, col_names = cl_blast_colnames)
+ cleanedup_blast <- blast_out %>%
+ mutate(AccNum = gsub("\\|", "", AccNum)) %>%
+ mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>%
+ mutate(Species = gsub(";.*$", "", Species)) %>%
+ mutate(PcIdentity = round(PcIdentity, 2)) %>%
+ mutate(PcPositive = round((PcPosOrig * AlnLength / QLength), digits = 2))
+
+ # Merge cleaned blast output with acc2info
+ cleanedup_blast <- merge(cleanedup_blast, acc2info_out,
+ by.x = "AccNum",
+ by.y = "FullAccNum",
+ all.x = TRUE) %>%
+ select(-Species.x, -TaxID.x)
+ names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species"
+ }
+
+ # TaxID to lineage mapping
+ cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID)
+ lineage_map <- fread(
+ system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+ header = TRUE, fill = TRUE, colClasses = lineage_map_cols)
+
+ # Merge with lineage map and clean up columns
+ mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID",
+ all.x = TRUE) %>%
+ mutate(Species = Species.y, Spp.blast = Species.x) %>%
+ select(any_of(cl_blast_postcln_cols))
+
+ # Add names and prepare the output
+ blast_names <- addName(mergedLins)
+
+ # Create output file name
+ file_name <- file.path(outdir, paste0(prefix, ".blast.cln.tsv"))
+
+ # Write cleaned data to file
+ write_tsv(blast_names, file_name, col_names = TRUE)
+}
+
+# Function to cluster sequences with CD-HIT (drop-in replacement for BLASTCLUST)
+runCDHIT <- function(infile, suffix, outdir) {
+
+  # Prepare output file paths; cd-hit writes representative sequences to
+  # 'outfile' and cluster membership to the companion '.clstr' file
+  outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv"))
+  input_file <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv.clstr"))
+
+
+ # Print process messages
+ cat("\n#####################################\n")
+ cat("## Now running BLASTCLUST on file(s):", infile, "\n")
+ cat("#####################################\n")
+
+ # Cluster sequences w/ BLASTCLUST/CD-HIT
+ # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8")
+ cdhit_command <- sprintf(
+ "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8",
+ infile, outfile
+ )
+ # clean_cdhit_format <- sprintf(
+ # "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s",
+ # input_file,
+ # outfile
+ # )
+
+ cat("\nPerforming BLASTCLUST analysis on", infile, "\n")
+ # system(blastclust_cmd)
+
+ system(cdhit_command)
+ # system(clean_cdhit_format)
+ cleanCDHIT(input_file, outfile)
+ cat("Cleaning CDHIT results completed")
+}
+
+# extract_sequences("input_file.tsv.clstr", "output_file.tsv")
+cleanCDHIT <- function(input_file, output_file) {
+ # Read the input file line by line
+ lines <- readLines(input_file)
+
+ # Initialize an empty vector to store extracted identifiers
+ extracted <- c()
+
+ # Loop through lines and extract content between ">" and "..."
+ for (line in lines) {
+ # Only process lines containing valid sequences (skip cluster headers)
+ if (grepl(">", line) && grepl("\\.\\.\\.", line)) {
+ # Extract the sequence identifier between ">" and "..."
+ match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line))
+ if (length(match) > 0) {
+ sequence <- gsub("^>", "", match) # Remove the leading ">"
+ sequence <- gsub("\\.\\.\\.$", "", sequence) # Remove trailing dots
+ extracted <- c(extracted, sequence)
+ }
+ }
+ }
+
+ # Write the extracted sequences to the output file
+ writeLines(extracted, output_file)
+
+ cat("Sequences extracted and written to:", output_file, "\n")
+}
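+
+# For reference, a CD-HIT .clstr file looks roughly like this (illustrative
+# accessions):
+#   >Cluster 0
+#   0   353aa, >WP_000123456.1... *
+#   1   350aa, >WP_000654321.1... at 92.31%
+# cleanCDHIT() keeps the identifier between ">" and "..." on each member
+# line, one per output line.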
+
+# Function to format blastclust output
+clust2Table <- function(clust, blast) {
+ clust_out <- read_tsv(file = clust, col_names = F)
+ blast_out <- read_tsv(file = blast, col_names = T)
+ ## Count the number of accession numbers in a cluster
+ # Counting number of spaces between acc. no. +1
+ clust_out$NumAccs <- map(.x = clust_out$X1, function(x) {
+ (str_count(string = x, pattern = " ") + 1)
+ })
+ ## Create empty vectors to store information
+ empty_vec <- c("ClusterID")
+ empty_vec2 <- c("RowNum")
+ ## Adding empty vectors to dataframe
+ clust_out[, empty_vec] <- NA
+ clust_out[, empty_vec2] <- NA
+ ## Counting number of rows to add to the RowNum column -- used for creating cluster name
+ rows <- as.numeric(rownames(clust_out)) %>%
+ str_pad(width = 4, pad = 0)
+ ## Add row number info to dataframe
+ clust_out[, "RowNum"] <- rows
+ # Name columns
+ colnam <- list("AccNum", "NoAccs", "ClusterID", "NumRows")
+ ## Create cluster name from row number (num of clusters) and num of accessions
+ clust_out <- clust_out %>%
+ `colnames<-`(colnam) %>%
+ mutate(ClusterID = paste0(NumRows, ".", NoAccs))
+
+ # Store data frame column as vector
+ myvar <- c("AccNum", "ClusterID")
+  ## Keep only the two columns wanted in a new variable
+ clusters <- clust_out[myvar]
+ # Initialize empty data frame
+ new_clust <- data.frame(ClusterID = character(0),
+ AccNum = character(0),
+ stringsAsFactors = F)
+
+ ## Assigning each sseqid to a ClusterID
+ # Iterate over dataframe
+ for (i in 1:nrow(clusters)) {
+ # Extract the cluster name for this row
+ cname <- clusters$ClusterID[i]
+ # Split accession numbers by a space
+    # vals is a vector of all accession numbers in a row
+ vals <- clusters$AccNum[i] %>%
+ strsplit(split = " ") %>%
+ unlist()
+
+ # Iterate over each element in vals
+ for (v in vals) {
+ # add each accession num w/ corresponding cluster ID to new df
+ new_clust[nrow(new_clust) + 1, ] <- c(cname, v)
+ }
+ }
+ # blast_out$AccNum <- gsub("^>", "", blast_out$AccNum)
+ # blast_out$AccNum <- paste0(">", blast_out$AccNum)
+ blast_clustnames <- merge(blast_out, new_clust, by = "AccNum")
+
+ for (i in 1:nrow(blast_clustnames)) {
+ blast_clustnames$ClusterID[i] <- str_c(blast_clustnames$ClusterID[i])
+ }
+
+  # first protein (first word of AccNum) in every cluster, with its ClusterID;
+  # as.data.frame() would have treated the second argument as row.names
+  first_prot <- data.frame(word(clust_out$AccNum), clust_out$ClusterID)
+
+ ## write the new file as a TSV file
+ newarg <- gsub(".bclust.L[0-9][0-9]S[0-9][0-9].tsv", "", clust)
+ # accnum + clusterID
+ write_tsv(new_clust, file = paste0(newarg, ".clustIDs"), append = F)
+ # first protein from every cluster
+ write_tsv(first_prot, file = paste0(newarg, ".clust_reps"),
+ col_names = F, append = F)
+ # cleaned up blast file + clusterIDs
+ write_tsv(blast_clustnames, file = paste0(newarg, ".cln.clust.tsv"),
+ col_names = T, append = F)
+}
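+
+# Sketch of the transformation (hedged): a one-line-per-cluster input row
+# such as "WP_A WP_B WP_C" in row 1 gets ClusterID "0001.3" (row number
+# padded to 4 digits + "." + member count), and new_clust receives one
+# (ClusterID, AccNum) row per member.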
+
+# Function to run InterProScan
+runIPRScan2 <- function(query_file, prefix, outdir, appl) {
+
+ # Start InterProScan run
+ cat("\n######################\n")
+ cat("BEGIN INTERPROSCAN RUN\n")
+ cat("Input file:", query_file, "\n")
+ cat("######################\n")
+
+ # Output file path
+ outfile <- file.path(outdir, paste0(prefix, ".iprscan"))
+
+ # Process the input query file with InterProScan
+ cat("Now processing", query_file, "\n")
+
+ # Run InterProScan command
+ # Construct the command
+
+ # get the path to the interproscan.sh script from the environment
+ # variable INTERPROSCAN_CMD, or assume it's on the path if unspecified
+ # iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh")
+ #
+ # command <- paste(
+ # iprscan_cmd, "-i",
+ # shQuote(query_file),
+ # "-b", shQuote(outfile),
+ # "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
+ # "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,",
+ # "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK"
+ # )
+
+ # Run the command
+ # system(command)
+ submit_ipr(path2seq = query_file,
+ outfolder = outdir,
+ appl = appl,
+ email = "jravilab.msu@gmail.com")
+
+ new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
+ "SignDesc", "StartLoc", "StopLoc", "Score",
+ "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+ temp_data <- read_tsv(file.path(outdir, paste0("ipr_joined.tsv")))
+
+ # Drop the last two columns
+ temp_data <- temp_data[, -((ncol(temp_data) - 1):ncol(temp_data))]
+
+  # strip the last 12 characters (per-sequence file suffix) from file_name
+ temp_data$file_name <- substr(temp_data$file_name, 1,
+ nchar(temp_data$file_name) - 12)
+
+ colnames(temp_data) <- new_header
+
+ write.table(temp_data, file.path(outdir, paste0(prefix, ".iprscan.tsv")),
+ sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+
+
+ cat("##################\n")
+ cat("END OF IPRSCAN RUN\n")
+ cat("##################\n")
+}
+
+ipr2Linear <- function(ipr, acc2info, prefix) {
+ # read in iprscan results
+ # duplicate rows in iprscan file
+ ipr_in <- read_tsv(ipr, col_names = TRUE) %>%
+ mutate(DB.ID = gsub("G3DSA:", "", DB.ID))
+
+ if (prefix == "query_data") {
+ ipr_in$AccNum <- sub("(.*)([0-9])([0-9])$", "\\1.\\2_\\3", ipr_in$AccNum)
+ }
+
+ acc2info_out <- fread(input = acc2info, sep = "\t", header = T, fill = T) %>%
+ mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>%
+ mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
+
+ # merge ipr file with acc2info file
+ ipr_in <- ipr_in %>%
+    # remove version number and any other suffixes
+ mutate(AccNum.noV = gsub("\\.[0-9].*", "", AccNum))
+
+ ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
+
+ # read in lineage map
+ lineage_map <- fread(
+ system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+ header = TRUE, fill = TRUE)
+
+ # merge ipr+info w/ lineage
+ # both tables have a species column, but only
+ # the lineage_map (y) species column is kept
+ ipr_tax <- ipr_tax %>%
+ mutate(TaxID = as.numeric(TaxID))
+
+ ipr_lin <- left_join(ipr_tax, lineage_map, by = "TaxID") |>
+ mutate(Species = Species.y) %>%
+ select(-Species.x, -Species.y)
+
+ # add lookup table to iprscan file
+ lookup_tbl <- fread(
+ system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE),
+ sep = "\t", header = TRUE, fill = TRUE) %>%
+ distinct()
+ if ("AccNum.x" %in% names(ipr_lin)) {
+ ipr_lin <- ipr_lin %>%
+ # deselect the AccNum.y from the lineage table (y) and set
+ # the AccNum.x (x) from the ipr/acc2info tables to simply 'AccNum'
+ mutate(AccNum = AccNum.x) %>%
+ select(-AccNum.x, -AccNum.y)
+ }
+ # run add_name f(x) on ipr+lineage dataframe
+ ipr_lin <- ipr_lin %>%
+ addName() %>%
+ mutate(Name = gsub("^_", "", Name))
+
+ # add domarch info to iprscan + lineage df, only keep what's in x
+ ipr_cln <- left_join(ipr_lin, lookup_tbl, by = "DB.ID")
+
+ # populate empty description/short name columns
+ for (i in 1:nrow(ipr_cln)) {
+ if ((is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") &&
+ (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-")) {
+
+ ipr_cln$SignDesc[i] <- ipr_cln$IPRDesc[i]
+ if (length(ipr_cln$LookupTblDesc[i]) != 0) {
+ ipr_cln$ShortName[i] <- ipr_cln$LookupTblDesc[i]
+ }
+ }
+ if (is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") {
+ ipr_cln$ShortName[i] <- ipr_cln$SignDesc[i]
+ }
+ if (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-") {
+ ipr_cln$SignDesc[i] <- ipr_cln$ShortName[i]
+ }
+ }
+ # rename unclear/duplicated columns
+ names(ipr_cln)[names(ipr_cln) == "Description.x"] <- "ProteinName"
+ names(ipr_cln)[names(ipr_cln) == "Description.y"] <- "LookupTblDesc"
+ # deselect the AccNum.noV from the lineage table (y) and set
+ # the AccNum.noV.x (x) from the ipr/acc2info tables to simply 'AccNum.noV'
+ ipr_cln <- ipr_cln |> dplyr::select(-AccNum.noV.y) |>
+ dplyr::mutate(AccNum.noV = AccNum.noV.x) |>
+ dplyr::select(-AccNum.noV.x)
+ # create label column to use in ipr2viz
+ ipr_cln <- ipr_cln %>%
+ mutate(Label = strtrim(ShortName, 30)) %>%
+ mutate(Label = gsub(", .*", "", Label)) %>%
+ mutate(Label = gsub("C-terminal region of a signal",
+ "C-term signal peptide", Label)) %>%
+ mutate(Label = gsub("N-terminal region of a signal",
+ "N-term signal peptide", Label)) %>%
+ mutate(Label = gsub("Twin arginine translocation \\(T",
+ "Tat signal profile", Label)) %>%
+ mutate(Label = gsub("GLUTATHIONE HYDROLASE PROENZYM",
+ "GLUTATHIONE HYDROLASE PROENZYME", Label)) %>%
+ mutate(Label = gsub("N-terminal nucleophile aminohy",
+ "Ntn hydrolases", Label)) %>%
+ mutate(Label = gsub("Region of a membrane-bound pro",
+ "cytoplasmic reg of mem-bound prot",
+ Label)) %>%
+ mutate(ShortName = gsub("Region of a membrane-bound protein predicted to be
+ outside the membrane, in the cytoplasm.",
+ "cytoplasmic reg of mem-bound prot", ShortName))
+ outdir <- dirname(ipr)
+ # write results to file
+  write_tsv(ipr_cln, file.path(outdir, paste0(prefix, ".iprscan_cln.tsv")))
+}
+
+ipr2DomArch <- function(infile_ipr, prefix,
+ analysis = c(
+ "PfamA", "SMART", "Phobius",
+ "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE",
+ "SuperFamily", "MobiDBLite", "Panther", "Coils"
+ )) {
+ # read in cleaned up iprscan results
+ ipr_in <- read_tsv(infile_ipr, col_names = T, col_types = ipr_cln_cols)
+
+ # split dataframe into unique proteins
+ x <- split(x = ipr_in, f = ipr_in$AccNum)
+
+ # plan(strategy = "multicore", .skip = T)
+
+ # within each data.table
+ domarch <- map(x, function(y) {
+ # domarch <- future_map(x, function(y) {
+ acc_row <- data.frame(AccNum = y$AccNum[1], stringsAsFactors = F)
+ DAs <- data.frame(matrix(nrow = 1, ncol = length(analysis)))
+ DA <- y %>%
+ group_by(Analysis) %>%
+ arrange(StartLoc)
+ i <- 1
+ analysis_var <- c(
+ "Pfam", "SMART", "Phobius",
+ "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE",
+ "SUPERFAMILY", "MobiDBLite", "PANTHER", "Coils"
+ )
+ for (a in analysis_var) {
+ a_da <- DA %>% filter(Analysis == a)
+ if (a == "SignalP_EUK" || a == "SignalP_GRAM_NEGATIVE" ||
+ a == "SignalP_GRAM_POSITIVE") {
+ var_shortname <- "DB.ID"
+ } else {
+ var_shortname <- "ShortName"
+ }
+ var_shortname_sym <- sym(var_shortname)
+ a_da <- a_da %>%
+ ungroup() %>%
+ select({{ var_shortname_sym }}) %>%
+ filter(!is.na({{ var_shortname_sym }})) %>%
+ filter(!is.null({{ var_shortname_sym }})) %>%
+ pull(var_shortname) %>%
+ paste(collapse = "+")
+ DAs[1, i] <- a_da
+ i <- (i + 1)
+ }
+
+ colnames(DAs) <- paste("DomArch", analysis, sep = ".")
+ return(cbind(acc_row, DAs))
+ })
+
+ # select relevant rows from ipr input to add to domarch
+ ipr_select <- ipr_in %>%
+ select(Name, AccNum, Species, TaxID, Lineage, Lineage_long_na,
+ Lineage_long, Lineage_med, Lineage_short, ProteinName,
+ SourceDB, Completeness, AccNum.noV) %>%
+ distinct()
+
+ # combine domarchs to one data frame, merge w/ acc2info
+ domarch2 <- do.call(rbind.data.frame, domarch)
+
+ domarch_lins <- domarch2 %>%
+ merge(ipr_select, by = "AccNum", all.x = T)
+
+ # save domarch_lins file
+ write_tsv(domarch_lins,
+ file = paste0(prefix, "/", prefix, ".ipr_domarch.tsv"),
+ append = F, na = "NA"
+ )
+
+ # return domarch2 dataframe to append to blast results if given
+ return(domarch2)
+}
+
+## function to add results from ipr2DomArch to blast results
+appendIPR <- function(ipr_da, blast, prefix) {
+ # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining !
+ blast_out <- read_tsv(blast, col_names = T)
+ if ("AccNum.noV" %in% colnames(blast_out)) {
+ ipr_da <- read_tsv(paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), col_names = T)
+ blast_ipr <- merge(blast_out, ipr_da, by = "AccNum.noV", all.x = T)
+ } else {
+ blast_ipr <- merge(blast_out, ipr_da, by = "AccNum", all.x = T)
+ }
+
+ write_tsv(blast_ipr, file = paste0(prefix, "/", prefix, ".full_analysis.tsv"), na = "NA")
+}
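+
+# Usage sketch (hedged), matching the call made in runMolevolvrPipeline():
+# da <- ipr2DomArch("prefix/prefix.iprscan_cln.tsv", "prefix")
+# appendIPR(ipr_da = da, blast = "prefix/prefix.cln.clust.tsv",
+#           prefix = "prefix")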
+
+# NOTE: several vectors below (cl_blast_colnames, ipr_colnames, rps_colnames,
+# ipg_colnames, combo_colnames) are redefined in the "COLUMN names" section
+# further down; the later definitions are the ones that take effect.
+# Web BLAST output
+web_blast_colnames <- c("Query", "AccNum",
+ "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+ "QStart", "QEnd", "SStart", "SEnd",
+ "EValue", "BitScore", "PcPosOrig",
+ "QSFrames") # specific to "blastx"
+
+# BLAST Command line
+cl_blast_colnames <- c("Query", "SAccNum", "AccNum",
+ "SAllSeqID", "STitle", "Species", "TaxID",
+ "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+ "QStart", "QEnd", "QLength",
+ "SStart", "SEnd", "SLength",
+ "EValue", "BitScore", "PcPosOrig",
+ "PcPositive", "ClusterID") # post-cleanup
+
+# IPRSCAN (web+command-line)
+ipr_colnames <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis",
+ "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score",
+ "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+# RPSBLAST
+rps_colnames <- c("AccNum", "DBID", "DBSeqID",
+ "PcIdentity", "PcPosOrig", # Ppos missing
+ "AlnLength", "Mismatch",
+ # Q here is Subject; S here is the matching domain; rename!
+ "SStart", "SEnd", "DStart", "DEnd",
+ "EValue", "BitScore", "TaxID") # TaxID missing (NA); remove?
+
+# IPG
+ipg_colnames <- c("IPG.ID", "Source", "NucAccNum",
+ "NucStart", "NucStop", "Strand",
+ "AccNum", "ProtDesc",
+ "Species", "SppStrain", "AssemblyID")
+# Final ColNames
+combo_colnames <- c("Query", "AccNum", "Species", "TaxID", "Lineage",
+ "PcPositive", "ClusterID",
+ # "Leaf", # MISSING (useful for all dataviz)
+ # "AssemblyID", "GeneName", "ProtDesc", # MISSING NOW!?!
+ "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D",
+ "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP")
+
+
+## ############ ##
+## COLUMN names ##
+## ############ ##
+## BLAST
+############
+## Web-BLAST
+############
+## Downloaded as HIT-TABLE csv
+# BLASTP and related protein BLASTs
+web_blastp_hit_colnames <- c(
+ "Query", "AccNum",
+ "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+ "QStart", "QEnd", "SStart", "SEnd",
+ "EValue", "BitScore", "PcPosOrig"
+)
+# BLASTX
+web_blastx_colnames <- c(
+ "Query", "AccNum",
+ "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+ "QStart", "QEnd", "SStart", "SEnd",
+ "EValue", "BitScore", "PcPosOrig",
+ "QSFrames"
+) # specific to "blastx"
+
+## Downloaded as Descriptions csv
+# BLASTP and related protein BLASTs
+web_blastp_desc_colnames <- c(
+ "Description", "Species", "CommonName", "TaxID",
+ "BitScore", "TotalScore",
+ "PcQCover", "EValue", "PcIdentity",
+ "SLen", "AccNum"
+)
+
+#####################
+## Command line BLAST
+#####################
+
+# pre-cleanup
+cl_blast_colnames <- c(
+ "Query", "SAccNum", "AccNum",
+ "SAllSeqID", "STitle", "Species", "TaxID",
+ "PcIdentity", "AlnLength", "Mismatch", "GapOpen",
+ "QStart", "QEnd", "QLength",
+ "SStart", "SEnd", "SLength",
+ "EValue", "BitScore", "PcPosOrig"
+)
+
+# post-cleanup
+cl_blast_postcln_cols <- c(
+ "Query", "AccNum",
+ "STitle", "Species", "TaxID", "Lineage", "Lineage_long",
+ "Lineage_long_na", "Lineage_med", "Lineage_short",
+ "PcPositive", "PcIdentity", "AlnLength",
+ "SAccNum", "SAllSeqID",
+ "Mismatch", "GapOpen",
+ "QStart", "QEnd", "QLength",
+ "SStart", "SEnd", "SLength",
+ "EValue", "BitScore", "PcPosOrig", "QueryName"
+)
+
+##########
+## IPRSCAN
+##########
+# ipr_colnames_orig <- c("AccNum", "Seq_MD5_digest", "SeqLen", "Analysis",
+# "DB_ID", "SignDesc", "StartLoc", "StopLoc", "Score",
+# "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+ipr_colnames <- c(
+ "AccNum", "SeqMD5Digest", "SLength", "Analysis",
+ "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score",
+ "Status", "RunDate", "IPRAcc", "IPRDesc"
+)
+
+# post cleanup
+##########################
+## NEED TO BE REORDERED ##
+##########################
+ipr_cln_colnames <- c(
+ "DB.ID", "TaxID", "AccNum.noV", "AccNum",
+ "SeqMD5Digest", "SLength", "Analysis", "SignDesc",
+ "StartLoc", "StopLoc", "Score", "Status", "RunDate",
+ "IPRAcc", "IPRDesc", "FullAccNum", "ProteinName",
+ "Length", "SourceDB", "Completeness", "Lineage",
+ "Species", "Name", "ShortName", "LookupTblDesc",
+ "ID", "Label"
+)
+
+###########
+## RPSBLAST
+###########
+rps_colnames <- c(
+ "AccNum", "DB.ID", "DBSeqID",
+ "PcIdentity.Dom", "PcPosOrig.Dom", # "PcPos.Dom", # Ppos missing
+ "AlnLength", "Mismatch",
+ "SStart", "SEnd", "DStart", "DEnd",
+ "EValue", "BitScore", "TaxID"
+) # TaxID missing (NA); remove?
+
+#######################
+## IPG and Lineage maps
+#######################
+ipg_colnames <- c(
+ "IPG.ID", "Source", "NucAccNum",
+ "NucStart", "NucStop", "Strand",
+ "AccNum", "Description",
+ "Species", "Spp.Strain", "AssemblyID"
+)
+
+##################
+## Assembly files
+## Genbank, Refseq
+##################
+assembly_colnames <- c(
+ "AssemblyID",
+ "bioproject", "biosample", "wgs_master", # not used
+ "RefseqCategory", "TaxID", "Spp.TaxID",
+ "Species", "Spp.Strain",
+ "isolate", "version_status", # not used
+ "assembly_level", "release_type", # not used
+ "GenomeStatus",
+ "seq_rel_date", "asm_name", "submitter", # not used
+ "AssemblyID.GBRS",
+ "paired_asm_comp", "ftp_path", # not used
+ "excluded_from_refseq", "relation_to_type_material"
+) # not used
+assembly_sub_colnames <- c(
+ "TaxID", "Spp.TaxID", "Species", "Spp.Strain",
+ "RefseqCategory", "GenomeStatus",
+ "AssemblyID", "AssemblyID.GBRS"
+)
+
+#################
+## Lookup tables
+## in common_data
+#################
+lineage_lookup_colnames <- c("TaxID", "Species", "Lineage_long",
+ "Lineage_long_na", "Lineage_med",
+ "Lineage_short", "Lineage")
+domarch_lookup_colnames <- c("DB.ID", "ShortName", "Description", "ID")
+# !! SC and LS will fix other piecemeal files based on these
+
+######################
+## FINAL UPLOADED DATA
+######################
+## Combined data frame that is loaded on to the webapp
+combo_colnames <- c(
+ "Query", "UID", "AccNum", "Species", "TaxID", "Lineage",
+ "PcPositive", "ClusterID", "QueryName",
+ # "AssemblyID", "GeneName", "Description", # MISSING NOW!?!
+ "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D",
+ "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP",
+ "DomArch.SMART", "DomArch.TIGR"
+)
+
+
+################
+## read tsv colnames
+################
+lookup_table_cols <- cols(
+ DB.ID = col_character(),
+ ShortName = col_character(),
+ Description = col_character(),
+ ID = col_character()
+)
+
+iprscan_cols <- cols(
+ .default = col_character(),
+ TaxID = col_double(),
+ SLength = col_double(),
+ SignDesc = col_character(),
+ StartLoc = col_double(),
+ StopLoc = col_double(),
+ Score = col_double(),
+ Status = col_logical(),
+ IPRAcc = col_character(),
+ IPRDesc = col_character(),
+ Length = col_double(),
+ ShortName = col_character(),
+ LookupTblDesc = col_character(),
+ ID = col_character(),
+ Label = col_character()
+)
+
+ipr_cln_cols <- cols(
+ .default = col_character(),
+ TaxID = col_double(),
+ SLength = col_double(),
+ StartLoc = col_double(),
+ StopLoc = col_double(),
+ Score = col_double(),
+ Status = col_logical(),
+ IPRAcc = col_logical(),
+ IPRDesc = col_logical(),
+ Length = col_double(),
+ ID = col_logical()
+)
+
+lineage_map_cols <- c(
+ "double",
+ "character",
+ "character", "character", "character", "character", "character"
+)
diff --git a/R/summarize.R b/R/summarize.R
index 371473ce..b1fef89d 100644
--- a/R/summarize.R
+++ b/R/summarize.R
@@ -690,7 +690,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
abort("Error: 'digits' must be a non-negative integer.")
}
- column <- sym(column)
+ # column <- sym(column)
prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
@@ -698,10 +698,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col
prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
col_count <- prot %>%
- group_by({{ column }}) %>%
+ group_by(!!sym(column)) %>%
summarise(totalcount = sum(count))
- total <- left_join(prot, col_count, by = as_string(column))
+ total <- left_join(prot, col_count, by = column)
sum_count <- sum(total$count)
total <- total %>%
@@ -901,4 +901,4 @@ findParalogs <- function(prot) {
# cat("Word counts for broken up domains from DAs and DAs from GCs.
# \nFor e.g.:
# DA.doms.wc <- query.sub$DA.doms %>%
-# words2WordCounts()")
+# words2WordCounts()")
\ No newline at end of file
diff --git a/R/viz_utils.R b/R/viz_utils.R
new file mode 100644
index 00000000..4697204e
--- /dev/null
+++ b/R/viz_utils.R
@@ -0,0 +1,924 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# import libs
+#' @import dplyr stringr visNetwork DT plotly
+#'
+# Function to generate the InterProScan Visualization
+getIPRGenesVisualization <- function(data, app_data,
+ query_iprDatabases = c("PfamA", "Phobius", "TMHMM", "Gene3d"),
+ query_iprVisType = "Analysis") {
+
+ # Check if analysis is loaded
+ if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+ stop("Analysis data is not loaded properly or ipr_path is missing.")
+ }
+
+ # If there is no cln_path or PcPositive column is NULL
+ if (length(data@cln_path) == 0 || is.null(data@df$PcPositive)) {
+
+    # Column used to label sequences; fixed to "Name" (an earlier
+    # name/AccNum fallback was dead code, always overwritten by this)
+    n <- "Name"
+
+ # Call the `ipr2viz_web` function
+ ipr_plot <- plotIPR2VizWeb(
+ infile_ipr = data@ipr_path,
+ accessions = data@df$Name,
+ analysis = query_iprDatabases,
+ group_by = query_iprVisType,
+ name = n
+ )
+
+ } else {
+
+ # Call the `ipr2viz` function with additional arguments
+ ipr_plot <- plotIPR2Viz(
+ infile_ipr = data@ipr_path,
+ infile_full = data@df,
+ accessions = unique(data@df$Name),
+ analysis = query_iprDatabases,
+ group_by = query_iprVisType,
+ topn = 20, # This value is hardcoded in the original code
+ query = "All"
+ )
+ }
+
+ # Return the plot object for further use
+ return(ipr_plot)
+}
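+
+# Usage sketch (hedged): 'data' and 'app_data' are MolEvolData-style S4
+# objects whose @df, @ipr_path, and @cln_path slots were populated by the
+# pipeline.
+# p <- getIPRGenesVisualization(data, app_data,
+#                               query_iprDatabases = c("PfamA", "Phobius"),
+#                               query_iprVisType = "Analysis")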
+
+# Function to generate the domain network layout visualization
+getRSNetworkLayout <- function(data, app_data,
+ cutoff = 100,
+ layout = "nice") {
+
+ # Check if analysis is loaded and app data has a valid ipr_path
+ if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+ stop("Analysis not loaded or ipr_path missing.")
+ }
+
+ # Extract column names and find the first matching "DomArch" column
+ cols <- colnames(data@df)
+ if (is.null(cols)) {
+ stop("Dataframe columns are missing.")
+ }
+
+ col <- cols[grepl("DomArch.*", cols)][1]
+  # indexing an empty match yields NA, not NULL
+  if (is.na(col)) {
+ stop("No domain architecture column found.")
+ }
+
+ # Clean up domain architecture columns in the data
+ df_data <- data@df %>%
+ mutate(across(tidyselect::starts_with("DomArch"), cleanString))
+
+ # Generate the network using the domain_network function
+ res_network <- createDomainNetwork(
+ df_data,
+ column = col,
+ domains_of_interest = ".*",
+ cutoff = cutoff,
+ layout = layout
+ )
+
+ # Validate that the result is not an error
+ if (any(res_network == "error")) {
+ stop("Not enough nodes to construct a network.")
+ }
+
+ return(res_network)
+}
+
+# Function to generate the data table
+getDataTable <- function(data) {
+ if (nrow(data@df) == 0) {
+ stop("No data available.
+ Please ensure you have uploaded your data correctly.")
+ }
+
+ # Define the columns to be shown
+ viewing_cols <- c(
+ "AccNum", "QueryName", "Name", "Lineage", "Species",
+ "Length", "PcPositive", "DomArch.Pfam"
+ )
+
+ # Extract the data
+ d <- data@df
+
+ # Identify columns to hide (those not in viewing_cols)
+ hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+ # Add hyperlinks to the "AccNum" column
+ d$AccNum <- paste0("", d$AccNum, "")
+
+ # Create the DataTable
+ dt <- DT::datatable(d,
+ rownames = FALSE,
+ filter = "top", # Enable filtering
+ extensions = c("Buttons"), # Enable buttons (e.g., CSV export)
+ escape = FALSE, # Allow HTML rendering (for the hyperlinks)
+ options = list(
+ dom = "frlBtip", # Layout for the table (filters, buttons, etc.)
+ pageLength = 10, # Number of rows per page
+ paging = TRUE, # Enable pagination
+ # Enable regex search
+ search = list(caseInsensitive = TRUE, regex = TRUE),
+ language = list(
+ searchPlaceholder = "Regex or Text...",
+ filterPlaceholder = "Test"
+ ),
+ buttons = list(
+ list(
+ extend = "colvis",
+ text = "Add/remove column(s)"
+ ),
+ list(
+ extend = "csv",
+ text = "Download",
+ filename = "molevolvr-data",
+ exportOptions = list(
+ # Export all data, not just visible page
+ modifier = list(page = "all")
+ )
+ )
+ ),
+ scrollX = FALSE, # Disable horizontal scrolling
+ fixedHeader = FALSE, # Disable fixed header
+ columnDefs = list(
+ # Hide columns not in 'viewing_cols'
+ list(visible = FALSE, targets = hide_cols)
+ )
+ )
+ )
+
+ return(dt)
+}
+
+# Function to generate query data table
+getQueryDataTable <- function(query_data, query_select = NULL) {
+
+ # Check if analysis is loaded and data is available
+ if (nrow(query_data@df) == 0) {
+ stop("No data available. Please ensure you have uploaded your data
+ correctly.")
+ }
+
+ # Define the columns to be shown
+ viewing_cols <- c("QueryName", "Species", "Lineage", "DomArch.Pfam")
+
+ # If no specific queries are selected, display all data
+ if (is.null(query_select)) {
+ d <- query_data@df
+ } else {
+ # Filter the data for selected queries
+ d <- query_data@df %>% filter(QueryName %in% query_select)
+ d <- droplevels(d)
+ }
+
+ # Identify columns to hide (those not in viewing_cols)
+ hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+ # Add hyperlinks to the "Query" column (replace 'Query' with the actual
+ # column name if different)
+ d$Query <- paste0("", d$Query, "")
+
+ # Create and return the DataTable
+ dt <- DT::datatable(d,
+ rownames = FALSE,
+ filter = "top", # Enable filtering
+ extensions = c("Buttons"), # Enable buttons (e.g., CSV export)
+ callback = DT::JS(c(
+ '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+ )),
+ escape = FALSE, # Allow HTML rendering (for the hyperlinks)
+ options = list(
+ dom = "frlBtip", # Layout for the table (filters, buttons, etc.)
+ pageLength = 25, # Number of rows per page
+ paging = TRUE, # Enable pagination
+ # Enable regex search
+ search = list(caseInsensitive = TRUE, regex = TRUE),
+ language = list(
+ searchPlaceholder = "Regex or Text..."
+ ),
+ buttons = list(
+ list(
+ extend = "colvis",
+ text = "Add/remove column(s)"
+ ),
+ list(
+ extend = "csv",
+ text = "Download",
+ filename = "molevolvr-query",
+ exportOptions = list(
+ # Export all data, not just visible page
+ modifier = list(page = "all")
+ )
+ )
+ ),
+ scrollX = FALSE, # Disable horizontal scrolling
+ fixedHeader = FALSE, # Disable fixed header
+ columnDefs = list(
+ # Hide columns not in 'viewing_cols'
+ list(visible = FALSE, targets = hide_cols)
+ )
+ )
+ )
+
+ return(dt)
+}
+
+# Function to read and return the FASTA file contents
+readFastaData <- function(fasta_path) {
+
+ # Check if analysis is loaded and the file path is not empty
+ if (fasta_path == "" || !file.exists(fasta_path)) {
+ stop("FASTA file path is invalid or the file does not exist.")
+ }
+
+ # Read the content of the FASTA file
+ fasta_content <- read_file(fasta_path)
+
+ return(fasta_content)
+}
+
+getFastaData <- function(fasta_path) {
+ if (is.null(fasta_path) || fasta_path == "") {
+ stop("Error: FASTA path is not provided.")
+ }
+ return(read_file(fasta_path)) # Read and return the FASTA data
+}
+
+# Function to get domain sequences (assumes `data` is a predefined object)
+getDomData <- function(data) {
+ return(data@domainSeqs) # Return domain sequences
+}
+
+# Function to get MSA data from a given path
+getMSAData <- function(msa_path) {
+ if (is.null(msa_path) || msa_path == "") {
+ stop("Error: MSA path is not provided.")
+ }
+ # Attempt to read the file and handle potential errors
+ if (file.exists(msa_path)) {
+ return(read_file(msa_path))
+ } else {
+ warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path))
+ return(NULL) # Return NULL if the file cannot be read
+ }
+}
+
+# Function to generate a heatmap
+getQueryHeatmap <- function(query_data_df,
+ heatmap_select = "All",
+ heatmap_color = "blue") {
+
+ # Check if analysis is loaded and query data exists
+ if (nrow(query_data_df) == 0) {
+ stop("No query data available.")
+ }
+
+ # Filter queries based on user selection
+ if (heatmap_select == "All") {
+ queries <- unique(query_data_df$QueryName)
+ prot <- query_data_df %>%
+ filter(Lineage != "") %>%
+ tidyr::drop_na(Lineage)
+ } else {
+ queries <- heatmap_select
+ prot <- query_data_df %>%
+ filter(grepl(heatmap_select, QueryName, ignore.case = TRUE)) %>%
+ filter(Lineage != "") %>%
+ tidyr::drop_na(Lineage)
+ }
+
+ # Validate that the Lineage column has values
+ if (all(unique(prot$Lineage) == "")) {
+ stop("Lineage column not found for selected proteins.
+ See the FAQ for possible reasons/solutions.")
+ }
+
+ plotLineageQuery(prot, queries = queries, colname = "QueryName",
+ cutoff = 100, color = heatmap_color)
+}
+
+# Function to retrieve domain architecture columns
+getQueryDomArchCols <- function(query_data_df) {
+ # Check if query data exists
+ if (nrow(query_data_df) == 0) {
+ stop("No query data available.")
+ }
+
+ # Get column names
+ cols <- colnames(query_data_df)
+
+ # Filter domain architecture columns, excluding repeats
+ domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+ # Identify columns that are completely NA
+ na_cols <- names(query_data_df)[apply(query_data_df, 2,
+ function(x) all(is.na(x)))]
+
+ # Remove NA columns from domain architecture columns
+ domarch_cols <- setdiff(domarch_cols, na_cols)
+
+  # Move SignalP_GRAM_POSITIVE to the end if present (the "DomArch." prefix
+  # has not been trimmed yet at this point)
+  if ("DomArch.SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+    domarch_cols <- setdiff(domarch_cols, "DomArch.SignalP_GRAM_POSITIVE") # Remove first
+    domarch_cols <- append(domarch_cols, "DomArch.SignalP_GRAM_POSITIVE") # Append it last
+  }
+
+ # Remove prefix from column names
+ domarch_cols <- substring(domarch_cols, first = 9)
+
+ return(domarch_cols)
+}
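+
+# Example (hedged): for columns c("DomArch.Pfam", "DomArch.Gene3D",
+# "DomArch.Pfam.repeats"), this returns c("Pfam", "Gene3D"); repeat
+# columns are dropped and the "DomArch." prefix is trimmed.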
+
+# Function to generate main data table
+getMainTable <- function(data_df, main_select = NULL) {
+ # Validate input data
+ if (nrow(data_df) == 0) {
+ stop("No data available. Please ensure you have uploaded your data
+ correctly. See Help documentation or contact JRaviLab
+ (janani.ravi[AT]cuanschutz[DOT]edu).")
+ }
+
+ # Define columns to view
+ viewing_cols <- c("AccNum", "QueryName", "Name", "Lineage",
+ "Species", "Length", "PcPositive",
+ "DomArch.Pfam")
+
+ # Filter data based on selection
+ if (!is.null(main_select)) {
+ d <- data_df %>%
+ filter(QueryName %in% main_select) %>%
+ droplevels()
+ } else {
+ d <- data_df
+ }
+
+ # Identify columns to hide
+ hide_cols <- which(!colnames(d) %in% viewing_cols)
+ hide_cols <- hide_cols - 1
+
+  # Create hyperlinks for AccNum (anchor markup was lost in transit;
+  # NCBI protein pages are an assumption)
+  d$AccNum <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                     d$AccNum, "' target='_blank'>", d$AccNum, "</a>")
+
+ # Generate DataTable
+ datatable_output <- DT::datatable(d,
+ rownames = FALSE,
+ filter = "top",
+ callback = DT::JS(c(
+ '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+ )),
+ extensions = c("Buttons"),
+ escape = FALSE,
+ options = list(
+ dom = "frlBtip",
+ pageLength = 25,
+ paging = TRUE,
+ search = list(caseInsensitive = TRUE, regex = TRUE),
+      language = list(searchPlaceholder = "Regex or Text...",
+        filterPlaceholder = "Filter..."),
+ buttons = list(
+ list(extend = "colvis", text = "Add/remove column(s)"),
+ list(
+ extend = "csv",
+ text = "Download",
+ filename = "molevolvr-homolog",
+ exportOptions = list(modifier = list(page = "all"))
+ )
+ ),
+ scrollX = FALSE,
+ fixedHeader = FALSE,
+ columnDefs = list(list(visible = FALSE, targets = hide_cols))
+ )
+ )
+
+ return(datatable_output)
+}
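+
+# Usage sketch (hypothetical data frame of homologs; restrict the table to
+# two queries and render an interactive DT widget):
+#   getMainTable(homologs_df, main_select = c("QueryA", "QueryB"))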
+
+totalCounts <- function(prot, column = "DomArch", lineage_col = "Lineage",
+ cutoff = 90, RowsCutoff = FALSE, digits = 2
+ # type = "GC"
+) {
+  # `column` and `lineage_col` are column names given as strings
+  prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
+    filter(!is.na(.data[[column]]) & !is.na(.data[[lineage_col]])) %>%
+    filter(.data[[column]] != "")
+
+ prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
+ col_count <- prot %>%
+ group_by(!!sym(column)) %>%
+ summarise(totalcount = sum(count))
+
+ total <- left_join(prot, col_count, by = column)
+
+ sum_count <- sum(total$count)
+ total <- total %>%
+ mutate("IndividualCountPercent" = totalcount / sum_count * 100) %>%
+ arrange(-totalcount, -count)
+
+ cumm_percent <- total %>%
+ select({{ column }}, totalcount) %>%
+ distinct() %>%
+ mutate("CumulativePercent" = 0)
+ total_counter <- 0
+ for (x in length(cumm_percent$totalcount):1) {
+ total_counter <- total_counter + cumm_percent$totalcount[x]
+ cumm_percent$CumulativePercent[x] <- total_counter / sum_count * 100
+ }
+
+ cumm_percent <- cumm_percent %>% select(CumulativePercent, {{ column }})
+
+ total <- total %>% left_join(cumm_percent, by = as_string(column))
+
+ # Round the percentage columns
+ total$CumulativePercent <- total$CumulativePercent %>% round(digits = digits)
+ total$IndividualCountPercent <- total$IndividualCountPercent %>% round(digits = digits)
+
+  if (RowsCutoff) {
+    # If total counts is being used for plotting based on number of rows,
+    # don't include other observations that fall below the cumulative percent
+    # cutoff but that have the same 'totalcount' as the cutoff observation
+    total <- total %>% filter(CumulativePercent >= 100 - cutoff - .0001)
+    return(total)
+  }
+
+  # Include observations that fall below the cumulative percent cutoff,
+  # but that have the same 'totalcount' as the cutoff observation
+  t <- total %>% filter(CumulativePercent >= 100 - cutoff)
+  if (nrow(t) == 0) {
+    cutoff_count <- 0
+  } else {
+    cutoff_count <- t$totalcount[nrow(t)]
+  }
+
+ total <- total %>%
+ filter(totalcount >= cutoff_count) %>%
+ ungroup()
+
+ return(total)
+}
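+
+# Worked sketch (hypothetical two-column input; relies on
+# summarizeByLineage() from this package):
+#   prot <- data.frame(DomArch = c("A+B", "A+B", "C"),
+#                      Lineage = c("Bacteria", "Archaea", "Bacteria"))
+#   totalCounts(prot, column = "DomArch", cutoff = 90)
+#   # "A+B" appears twice (~66.7% of counts), "C" once (~33.3%); rows whose
+#   # CumulativePercent falls below 100 - cutoff are trimmed.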
+
+getDomArchTotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
+ # Check if ipr_path is not empty
+ if (app_data@ipr_path == "") {
+ stop("ipr_path is missing.")
+ }
+
+ # Calculate total counts with the specified cutoff and column
+ prot_tc <- totalCounts(DA_Prot, cutoff = DACutoff, column = DA_col)
+
+  # Replace all instances of ">" in the Lineage column with "_"
+  # (str_replace_all is vectorized, so no map()/unlist() is needed)
+  prot_tc$Lineage <- str_replace_all(prot_tc$Lineage, ">", "_")
+
+ # Return the processed data
+ prot_tc
+}
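+
+# Usage sketch (assumes `app_data` is a MolEvolData object with ipr_path set
+# and `homologs_df` is its results data frame):
+#   tc <- getDomArchTotalCounts(DA_Prot = homologs_df, DACutoff = 90,
+#                               DA_col = "DomArch.Pfam", app_data = app_data)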
+
+# Function to generate Domain Architecture Linear Table
+getDomArchLinearTable <- function(DA_col, ipr_path, DA_TotalCounts_value) {
+ # Check if ipr_path is valid
+ if (ipr_path == "") {
+ stop("InterPro path is empty.")
+ }
+
+ # Check if DA_col is provided
+ if (is.null(DA_col)) {
+ stop("DA_col input is required.")
+ }
+
+ da_col <- sym(DA_col)
+
+  # Summarize lineage counts per architecture and keep the result
+  DA_TotalCounts_value <- DA_TotalCounts_value %>%
+    group_by({{ da_col }}, totalcount, CumulativePercent) %>%
+    summarize(LineageCount = n()) %>%
+    select({{ da_col }}, LineageCount, totalcount, CumulativePercent) %>%
+    arrange(-totalcount)
+
+ # Generate the DAlin count table
+ DAlin_table <- DT::datatable(
+ DA_TotalCounts_value,
+ selection = "single",
+ extensions = c("Buttons"),
+ options = list(
+ pageLength = 25,
+ dom = "frlBtip",
+ buttons = list(
+ list(
+ extend = "csv",
+ text = "Download",
+ filename = "MolEvolvR_domarch",
+ exportOptions = list(
+ modifier = list(page = "all")
+ )
+ )
+ ),
+ scrollX = FALSE,
+ paging = TRUE,
+ fixedHeader = FALSE,
+ fixedColumns = list(leftColumns = 0, rightColumns = 0)
+ )
+ )
+
+ return(DAlin_table)
+}
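+
+# Usage sketch (builds the table from totalCounts() output; the path is
+# hypothetical):
+#   getDomArchLinearTable("DomArch.Pfam", ipr_path = "results/ipr.tsv",
+#                         DA_TotalCounts_value = tc)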
+
+# Function to generate the Domain Architecture lineage heatmap
+getDomArchHeatmapPlot <- function(DA_col, DACutoff,
+ DA_Prot, DA_lin_color,
+ ipr_path) {
+ # Check if ipr_path is valid
+ if (ipr_path == "") {
+ stop("InterPro path is empty.")
+ }
+
+ # Filter the protein data for plotting
+ prot <- DA_Prot %>%
+ filter(Lineage != "") %>%
+ drop_na(Lineage)
+
+ # Create the plot
+ plot <- plotLineageDA(
+ prot,
+ colname = DA_col,
+ cutoff = DACutoff,
+ RowsCutoff = FALSE,
+ color = DA_lin_color
+ )
+
+ # Convert ggplot to plotly object
+ plot <- plotly::ggplotly(plot)
+ plot <- plot %>% plotly::layout(xaxis = list(side = "top"))
+
+ return(plot)
+}
+
+# Function to generate the Domain Architecture Network
+getDomNetwork <- function(DA_col, DACutoff, DA_Prot,
+ networkLayout, ipr_path) {
+ # Check if ipr_path is valid
+ if (ipr_path == "") {
+ stop("InterPro path is empty.")
+ }
+
+  # Prepare the selected protein data
+  dn_data <- DA_Prot
+  col <- sym(DA_col) # Symbol form of DA_col for use with dplyr verbs below
+  dn_data[[DA_col]] <- str_replace_all(dn_data[[DA_col]], " ", "_")
+
+ # Filter data based on the specified column
+ dn_data <- dn_data %>%
+ drop_na(!!col) %>%
+ filter(!!col != "")
+
+ # Generate the domain network
+ res <- createDomainNetwork(
+ prot = dn_data,
+ column = DA_col,
+ domains_of_interest = ".*",
+ cutoff = DACutoff,
+ layout = networkLayout
+ )
+
+ # Validate the result
+ if (any(res == "error")) {
+ stop("Not enough nodes to construct a network.
+ Try increasing 'Total Count Cutoff'.")
+ }
+
+  # Add a "Download plot" export button to the network
+  res <- res %>% visExport(
+ type = "png",
+ name = "domain_architecture-network",
+ float = "left",
+ label = "Download plot",
+ style = "
+ background-color: #ffffff;
+ color: #333333;
+ border-color: #cccccc;
+ padding: 6px 12px;
+ font-size: 14px;
+ line-height: 1.42857143;
+ border-radius: 4px;
+ cursor: pointer;
+ display: inline-block;
+ margin-bottom: 0;
+ font-weight: normal;
+ text-align: center;
+ vertical-align: middle;
+ touch-action: manipulation;
+ user-select: none;
+ background-image: none;
+ text-decoration: none;
+ "
+ )
+
+ return(res)
+}
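+
+# Usage sketch (hypothetical inputs; the layout string is passed through to
+# createDomainNetwork()):
+#   getDomNetwork("DomArch.Pfam", DACutoff = 90, DA_Prot = homologs_df,
+#                 networkLayout = "layout_nicely", ipr_path = "results/ipr.tsv")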
+
+# Function to retrieve and clean Domain Architecture data
+getDomArchProt <- function(app_data, DASelect) {
+  # Check that the InterProScan results path is set
+  if (app_data@ipr_path == "") {
+    stop("InterPro path is empty.")
+  }
+
+ # Check if the data frame has rows
+ if (nrow(app_data@df) <= 0) {
+ stop("No data available. Please ensure you have uploaded your data correctly. See Help documentation or contact JRaviLab (janani.ravi[AT]cuanschutz[DOT]edu).")
+ }
+
+ # Check if there are domain architecture columns
+ # if (length(domarch_cols) == 0) {
+ # stop("Please ensure uploaded data has domain architecture columns.")
+ # }
+
+ # Retrieve app data and clean domain architecture columns
+ df_app_data <- app_data@df
+
+ # Domain architecture column cleanup
+ df_app_data <- df_app_data %>%
+ mutate(across(starts_with("DomArch"), cleanString))
+
+ # Filter based on user selection
+ if (DASelect == "All") {
+ return(df_app_data)
+ } else {
+ return(df_app_data %>% filter(grepl(DASelect, QueryName,
+ ignore.case = TRUE)))
+ }
+}
+
+# Function to retrieve domain architecture columns
+getDomArchCols <- function(app_data, DASelect) {
+ # Check if app data DataFrame is not empty
+ if (nrow(app_data@df) <= 0) {
+ stop("No data available in app data.")
+ }
+
+ # Check if ipr_path is valid
+ if (app_data@ipr_path == "") {
+ stop("InterPro path is empty.")
+ }
+
+ # Get column names
+ cols <- colnames(app_data@df)
+
+ # Filter domain architecture columns
+ domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+ # Retrieve the data frame
+ domarch_data <- app_data@df
+
+ # Filter data frame based on user selection
+ if (DASelect != "All") {
+ domarch_data <- domarch_data %>% filter(QueryName == DASelect)
+ }
+
+ # Identify columns that are completely NA or empty
+ na_cols <- apply(domarch_data, 2, function(x) {
+ all(is.na(x)) || all(x == "")
+ })
+ na_cols <- names(domarch_data)[na_cols]
+
+ # Remove NA columns from the domain architecture columns
+ domarch_cols <- setdiff(domarch_cols, na_cols)
+
+  # Move the SignalP_GRAM_POSITIVE column to the end if present
+  # (columns still carry the "DomArch." prefix here)
+  if ("DomArch.SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+    domarch_cols <- append(domarch_cols[!domarch_cols %in%
+      "DomArch.SignalP_GRAM_POSITIVE"],
+    "DomArch.SignalP_GRAM_POSITIVE")
+  }
+
+ return(domarch_cols)
+}
+
+# Function to generate the IPR genes plot
+getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases,
+ query_iprVisType, DASelect) {
+
+ if (app_data@ipr_path == "") {
+ stop("IPR path is not set.")
+ }
+
+ # Validate the data frame
+ df <- app_data@df
+ if (nrow(df) == 0) {
+ stop("No data available. Please ensure you have uploaded
+ your data correctly.")
+ }
+
+ if (is.null(query_iprDatabases) || length(query_iprVisType) == 0) {
+ stop("Please select an analysis.")
+ }
+
+  # Determine the name column to use: prefer "Name", fall back to "AccNum"
+  if (length(app_data@cln_path) == 0 || is.null(df$PcPositive)) {
+    name_column <- if ("Name" %in% colnames(df)) "Name" else "AccNum"
+
+ # Generate the plot using the web version
+ plot <- plotIPR2VizWeb(
+ infile_ipr = app_data@ipr_path,
+ accessions = df$Name,
+ analysis = query_iprDatabases,
+ group_by = query_iprVisType,
+ name = name_column
+ )
+ } else {
+ # Generate the plot using the local version
+ plot <- plotIPR2Viz(
+ infile_ipr = app_data@ipr_path,
+ infile_full = df,
+ accessions = unique(df$Name),
+ analysis = query_iprDatabases,
+ group_by = query_iprVisType,
+ topn = 20,
+ query = DASelect
+ )
+ }
+
+ return(plot)
+}
+
+# Function to filter proteins for phylogeny
+filterPhylogenyProteins <- function(app_data, phylo_select) {
+ # Get the data frame from app_data
+ df <- app_data@df
+
+ # Check if the data frame is empty
+ if (nrow(df) == 0) {
+ stop("No data available in app_data.
+ Please ensure it has been loaded correctly.")
+ }
+
+ # Filter the data based on user selection
+ if (phylo_select == "All") {
+ filtered_df <- df
+ } else {
+ filtered_df <- df %>%
+ filter(grepl(phylo_select, QueryName, ignore.case = TRUE)) %>%
+ filter(Lineage != "")
+ }
+
+ return(filtered_df)
+}
+
+# Function to retrieve representative accession numbers
+getRepAccNum <- function(app_data, phylo_select,
+ msa_reduce_by, msa_rep_num,
+ rval_phylo) {
+
+ if (rval_phylo()) {
+ return(app_data@df$AccNum)
+ } else {
+    switch(msa_reduce_by,
+      "Species" = createRepresentativeAccNum(app_data@df,
+        reduced = "Species", accnum_col = "AccNum"),
+      "Lineage" = createRepresentativeAccNum(app_data@df,
+        reduced = "Lineage", accnum_col = "AccNum"),
+      "DomArch" = {
+        if (app_data@ipr_path == "") {
+          stop("IPR path is empty. Please ensure it is set correctly.")
+        }
+        seqs <- getTopAccByLinDomArch(infile_full = app_data@df,
+          n = msa_rep_num,
+          query = phylo_select)
+ if (is.null(seqs)) {
+ stop("No sequences found.")
+ }
+ return(seqs)
+ },
+ "Full" = {
+ tmp <- app_data@df %>% filter(QueryName == phylo_select)
+ return(tmp$AccNum)
+ },
+ stop("Invalid selection for msa_reduce_by.")
+ )
+ }
+}
+
+getDomArchPlot <- function(ipr_path, query_names,
+                           analysis_type, group_by,
+                           domarch_cols = NULL) {
+  # Check if the input path is provided
+  if (is.null(ipr_path) || ipr_path == "") {
+    stop("Error: Input path for IPR data is not provided.")
+  }
+
+  # Validate that at least one domain column was found (when supplied)
+  if (!is.null(domarch_cols) && length(domarch_cols) < 1) {
+    stop("Error: No domains found in the input sequences.")
+  }
+
+ n <- "Name"
+ plot <- plotIPR2VizWeb(
+ infile_ipr = ipr_path,
+ accessions = query_names,
+ analysis = analysis_type,
+ group_by = group_by,
+ name = n
+ )
+
+ return(plot) # Return the generated plot
+}
+
+# Function to convert accessions to names
+acc2Name <- function(app_data) {
+ # Check if "AccNum" is a column in the data
+ if (!("AccNum" %in% colnames(app_data@df))) {
+ stop("Column 'AccNum' not found in data.")
+ }
+
+ # Check if "Name" column exists and create the output data frame
+ if ("Name" %in% colnames(app_data@df)) {
+ # Select "AccNum" and "Name"
+ df <- select(app_data@df, "AccNum", "Name")
+ } else {
+ # Create a data frame with "AccNum" and assign "AccNum" to "Name"
+ df <- app_data@df %>%
+ select("AccNum") %>%
+ mutate(Name = AccNum)
+ }
+
+ return(df)
+}
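+
+# Usage sketch: returns a two-column AccNum -> Name lookup; when no Name
+# column exists, Name simply mirrors AccNum.
+#   acc2name_map <- acc2Name(app_data)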
+
+repAccNums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) {
+ # If `phylo` is true, return all `AccNum` values from `app_data`
+ if (phylo) {
+ return(app_data@df$AccNum)
+ } else {
+    # Switch based on `msa_reduce_by`; compute the representative accession
+    # set only in the branch that needs it
+    tmp <- app_data@df
+    switch(msa_reduce_by,
+      "Species" = createRepresentativeAccNum(tmp, reduced = "Species", accnum_col = "AccNum"),
+      "Lineage" = createRepresentativeAccNum(tmp, reduced = "Lineage", accnum_col = "AccNum"),
+ "DomArch" = {
+ # Check if `ipr_path` is not empty
+ if (app_data@ipr_path == "") {
+ stop("ipr_path is missing.")
+ }
+ # Find top sequences by accession number based on criteria
+ seqs <- getTopAccByLinDomArch(infile_full = app_data@df, n = msa_rep_num, query = PhyloSelect)
+ if (is.null(seqs)) {
+ stop("No sequences found.")
+ }
+ return(seqs)
+ },
+ "Full" = {
+ # Filter data for matching `QueryName` and return `AccNum`
+ tmp <- app_data@df %>% filter(QueryName == PhyloSelect)
+ return(tmp$AccNum)
+      },
+      stop("Invalid selection for msa_reduce_by.")
+    )
+ }
+}
+
+# Build and plot a neighbor-joining tree from an aligned FASTA file
+# (requires the msa, seqinr, ape, and ggtree packages)
+seqTree <- function(fasta_filepath) {
+  my_seqs <- readAAStringSet(fasta_filepath)
+  my_seqs_msa <- msa(my_seqs)
+  my_seqs_msa_aln <- msaConvert(my_seqs_msa, type = "seqinr::alignment")
+
+  # Pairwise identity distances (seqinr::dist.alignment)
+  d <- dist.alignment(my_seqs_msa_aln, "identity")
+
+  # Neighbor-joining tree (ape::nj), with tips grouped by name prefix
+  seqTree <- nj(d)
+  groupInfo <- split(seqTree$tip.label,
+    gsub("_\\w+", "", seqTree$tip.label))
+  seqTree <- groupOTU(seqTree, groupInfo)
+
+  # Plot the tree alongside the alignment
+  # (see https://yulab-smu.top/treedata-book/chapter4.html)
+  tree <- ggtree(seqTree, branch.length = "dN_vs_dS") +
+    theme_tree2(axis.line.x = element_blank(),
+      axis.text.x = element_blank(),
+      axis.ticks.x = element_blank())
+  msaplot(tree, fasta = fasta_filepath, offset = 0.5, bg_line = TRUE) +
+    geom_tiplab(align = TRUE, linesize = 0.5, size = 3)
+}
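+
+# Usage sketch (hypothetical FASTA of representative sequences):
+#   seqTree("results/representatives.fa")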
diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd
new file mode 100644
index 00000000..ab02bdeb
--- /dev/null
+++ b/inst/report/report_template.Rmd
@@ -0,0 +1,497 @@
+---
+title: MolEvolvR
+output:
+ html_document:
+ self_contained: yes
+params:
+ rs_interproscan_visualization: default
+ proximity_network: default
+ sunburst: default
+ data: default
+ queryDataTable: default
+ fastaDataText: default
+ heatmap: default
+ query_data: default
+ query_domarch_cols: default
+ query_iprDatabases: default
+ query_iprVisType: default
+ mainTable: default
+ DALinTable: default
+ DALinPlot: default
+ DANetwork: default
+ DA_Prot: default
+ domarch_cols: default
+ DA_Col: default
+ DACutoff: default
+ da_interproscan_visualization: default
+ phylo_sunburst_levels: default
+ phylo_sunburst: default
+ lineage_table: default
+ ## Tree
+ tree_msa_tool: default ##input
+ ## MSA
+ repAccNums: default
+ msa_rep_num: default ##input
+ app_data: default
+ PhyloSelect: default ##input
+ acc2Name: default
+ rval_phylo: default
+ query_pin: default
+ msa_reduce_by: default
+---
+
+---
+subtitle: "`r glue::glue('Retrieval Code: {params$query_pin}')`"
+---
+
+
+
+
+
+```{r mine,results="asis",engine="js", echo=FALSE}
+// Wire the "View full results" buttons to activate the matching tab
+setTimeout(function(){
+  document.querySelector('a#FullResultButton1').addEventListener('click', doIt);
+  function doIt(){
+    document.querySelector('a[href*="#domain-architecture-2"]').click(); // second Domain Architecture section (headings repeat across levels)
+  }
+}, 100); // wait briefly for the tab widgets to render
+
+setTimeout(function(){
+ document.querySelector('a#FullResultButton2').addEventListener('click', doIt);
+ function doIt(){
+ document.querySelector('a[href*="#phylogeny-1"]').click(); //Second Phylogeny(-1) Section (reuse at different heading levels)
+ }
+}, 100);
+
+setTimeout(function(){
+ document.querySelector('a#FullResultButton3').addEventListener('click', doIt);
+ function doIt(){
+ document.querySelector('a[href*="#homolog-data"]').click(); //Homolog Data Section
+ }
+}, 100);
+```
+
+```{r setup, include = FALSE}
+## Globally Suppress warnings and messaging
+knitr::opts_chunk$set(warning = FALSE, message = FALSE)
+## Create Sunburst Report Function
+cpcols <- c(
+ "#AFEEEE", "#DDA0DD", "#EE2C2C", "#CDBE70", "#B0B099",
+ "#8B2323", "#EE7600", "#EEC900", "chartreuse3", "#0000FF",
+ "#FFD900", "#32CD32", "maroon4", "cornflowerblue", "darkslateblue",
+ "#AB82FF", "#CD6889", "#FFA07A", "#FFFF00", "#228B22",
+ "#FFFFE0", "#FFEC8B", "peru", "#668B8B", "honeydew",
+ "#A020F0", "grey", "#8B4513", "#191970", "#00FF7F",
+ "lemonchiffon", "#66CDAA", "#5F9EA0", "#A2CD5A", "#556B2F",
+ "#EEAEEE", "thistle4", "#473C8B", "#FFB6C1", "#8B1C62",
+ "#FFE4B5", "black", "#FF7F50", "#FFB90F", "#FF69B4", "#836FFF",
+ "#757575", "#CD3333", "#EE7600", "#CDAD00", "#556B2F", "#7AC5CD"
+ )
+lineage_sunburst_report <- function(prot, lineage_column = "Lineage",
+ type = "sunburst",
+ levels = 2, colors = NULL, legendOrder = NULL, showLegend = TRUE, maxLevels = 5) {
+ lin_col <- sym(lineage_column)
+
+  # Cap the requested depth so it never exceeds maxLevels
+  if (!is.null(maxLevels) && levels > maxLevels) {
+ levels <- maxLevels
+ }
+
+ levels_vec <- c()
+ for (i in 1:levels)
+ {
+ levels_vec <- append(levels_vec, paste0("level", i))
+ }
+
+  # Take the lineage column and split it into the first `levels` levels
+ prot <- prot %>%
+ select({{ lin_col }}) %>%
+ arrange(desc({{ lin_col }})) %>%
+ drop_na({{ lin_col }})
+ protLevels <- prot %>% separate({{ lin_col }}, into = levels_vec, sep = ">")
+  # Count the occurrence of each group of levels
+  protLevels <- protLevels %>%
+    group_by_at(levels_vec) %>%
+    summarise(size = n())
+ tree <- d3_nest(protLevels, value_cols = "size")
+
+ # Plot sunburst
+ if (type == "sunburst") {
+ result <- sunburst(tree, legend = list(w = 225, h = 15, r = 5, s = 5), colors = cpcols, legendOrder = legendOrder)
+ } else if (type == "sund2b") {
+ result <- sund2b(tree)
+ }
+
+ if (showLegend) {
+ return(
+ htmlwidgets::onRender(
+ result,
+ "function(el, x) {
+ jQuery('.sunburst-togglelegend', el)
+ .attr('checked', 'true')
+ .attr('data-html2canvas-ignore', 'true');
+ jQuery('.sunburst-legend', el).css('visibility', '');
+
+ // create a button to download the sunburst
+ // (relies on html2canvas being included in the page)
+
+ // FIXME: consider pulling this all out into a js library
+ // so that we can apply it to other components.
+
+          const downloadBtn = jQuery('<button>Download PNG</button>') // button markup assumed; any clickable element works
+            .css({'position': 'absolute', 'right': '5px', 'top': '5px'})
+            .attr('data-html2canvas-ignore', 'true')
+            .appendTo(el);
+
+          const saveAs = (blob, fileName) => {
+ const link = document.createElement('a');
+ link.download = fileName
+ link.href = URL.createObjectURL(blob);
+ link.click();
+ URL.revokeObjectURL(link.href);
+ }
+
+ downloadBtn.click(() => {
+ html2canvas(el, { scale: 4, logging: false }).then(canvas => {
+ canvas.toBlob(function(blob) {
+ saveAs(blob, 'sunburst.png');
+ });
+ });
+ });
+ }"
+ )
+ )
+ }
+
+ return(result)
+}
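+
+# Usage sketch (hypothetical lineages; splits "Lineage" on ">" and renders
+# the first two levels as a sunburst):
+#   df <- data.frame(Lineage = c("Bacteria>Proteobacteria",
+#                                "Bacteria>Firmicutes"))
+#   lineage_sunburst_report(df, "Lineage", levels = 2)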
+```
+
+# {.tabset}
+
+## Results Summary
+
+An overview of the protein analysis. To view the full results, explore the additional tabs.
+
+### Domain Architecture
+
+Visualizations and summaries for protein domains.
+
+<a id="FullResultButton1" href="#">View full results</a>
+
+
+#### InterProScan Visualization
+
+```{r, echo=FALSE}
+params$rs_interproscan_visualization
+```
+
+#### Proximity Network
+
+```{r, echo=FALSE}
+params$proximity_network
+```
+
+### Phylogeny
+
+Visualizations for protein evolution.
+
+<a id="FullResultButton2" href="#">View full results</a>
+
+#### Sunburst
+
+```{r, echo=FALSE}
+library(dplyr)
+library(tidyr)
+lineage_sunburst_report(params$sunburst, "Lineage", levels = 2)
+```
+
+### Data
+
+Summary table of proteins including domain architectures, phylogeny, and homologs, when applicable.
+
+<a id="FullResultButton3" href="#">View full results</a>
+
+```{r, echo=FALSE}
+params$data
+```
+
+
+## Query Data {.tabset}
+
+Input data, additional metadata, and preliminary analyses of query protein(s).
+
+### Data Table
+
+The data table provides a summary of the sequences submitted, or "queried", for analysis. The preview shown can be extended by using "Add/remove column(s)" to see information about other taxonomic classes, as well as domain architecture codes from databases other than the default (Pfam).
+
+```{r, echo=FALSE}
+params$queryDataTable
+```
+
+### FASTA
+
+Uploaded amino acid FASTA sequence(s).
+
+`r glue::glue(params$fastaDataText)`
+
+### Query Heatmap
+
+A heatmap of submitted sequences and their respective taxonomic lineages.
+
+```{r, echo=FALSE}
+params$heatmap
+```
+
+### Domain Architecture
+
+Visualizations and analyses of all query and homologous protein domains, structural or functional subunits, and their architectures.
+
+```{r, echo=FALSE}
+if(is.null(params$query_iprDatabases)){
+ choices <- params$query_domarch_cols
+ if ("PfamA" %in% choices & "Phobius" %in% choices) {
+ analysis_type <- c("PfamA", "Phobius")
+ } else {
+    analysis_type <- choices[1]
+ }
+ } else {
+ analysis_type <- params$query_iprDatabases
+ }
+
+if(is.null(params$query_iprVisType)){
+ analysis_group <- 'Analysis'
+ } else {
+ analysis_group <- params$query_iprVisType
+ }
+
+if(length(params$query_domarch_cols) >= 1 ) {
+ DomArchPlot <- plotIPR2VizWeb(
+ infile_ipr = params$query_data@ipr_path,
+ accessions = params$query_data@df$QueryName,
+ analysis = analysis_type, group_by = analysis_group, name = "Name"
+ )
+ DomArchPlot
+} else {
+ "No domains found in the input sequences."
+ }
+```
+
+## Homolog Data
+
+Full set of homologs of query sequences, including their lineage and domain architecture info.
+
+```{r, echo=FALSE}
+params$mainTable
+```
+
+## Domain Architecture {.tabset}
+
+Summary and visualizations of protein motifs/subunits (domains) and their configurations within the query protein(s) (domain architectures).
+
+### Table
+
+```{r, echo=FALSE}
+params$DALinTable
+```
+
+### Heatmap
+
+```{r, echo=FALSE}
+params$DALinPlot
+```
+
+### Network
+
+```{r, echo=FALSE}
+params$DANetwork
+```
+
+### Word Cloud
+
+```{r, echo=FALSE}
+if (length(params$domarch_cols) == 0) {
+ choices <- c("None")
+ vals <- "None"
+ selected <- "None"
+ } else {
+ vals <- params$domarch_cols
+ choices <- substring(vals, first = 9)
+ selected <- if (length(choices) >= 2) choices[1:2] else choices[1]
+ }
+
+if (is.null(params$DA_Col)) {
+ da_col <- vals[[1]]
+ } else {
+ da_col <- params$DA_Col
+ }
+
+plot_data <- params$DA_Prot
+plot_data[[da_col]] <- str_replace_all(plot_data[[da_col]], " ", "_")
+createWordCloudElement(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F)
+```
+
+### InterProScan Visualization
+
+```{r, echo=FALSE}
+params$da_interproscan_visualization
+```
+
+### UpSet Plot
+
+```{r, echo=FALSE}
+upset_plot_data <- params$DA_Prot
+upset_plot_data[[da_col]] <- str_replace_all(upset_plot_data[[da_col]], " ", "_")
+final_plot <- plotUpSet(upset_plot_data, colname = da_col, cutoff = params$DACutoff)
+domains <- upset_plot_data %>%
+ dplyr::pull(da_col)
+n_unique_domains <- domains %>%
+ unique() %>%
+ length()
+
+if (n_unique_domains > 1) {
+ final_plot
+ } else {
+ stringr::str_glue("UpSet plot requires more than 1 unique domain (only {n_unique_domains} present). Try selecting more proteins.")
+ }
+```
+
+## Phylogeny {.tabset}
+
+Visualizations of phyletic patterns, sequence similarity, and evolution of related proteins.
+
+### Sunburst
+
+```{r, echo=FALSE}
+lineage_sunburst_report(params$phylo_sunburst, lineage_column = "Lineage_long_na", type = "sunburst", levels = params$phylo_sunburst_levels)
+```
+
+### Tree
+
+```{r, include=FALSE}
+if (is.null(params$tree_msa_tool)) {
+ tree_msa_tool <- 'ClustalO' ## initialize with default value
+ } else {
+ tree_msa_tool <- params$tree_msa_tool
+ }
+
+if (is.null(params$msa_rep_num)) {
+ msa_rep_num <- 3 ## initialize with default value
+ } else {
+ msa_rep_num <- params$msa_rep_num
+ }
+```
+
+```{r, include=FALSE}
+if(length(params$repAccNums) >= 3 ) {
+ rep <- params$repAccNums[1:msa_rep_num]
+ if (!params$rval_phylo) {
+ seqs <- readAAStringSet(params$app_data@fasta_path)
+ names(seqs) <- sub(" .*", "", names(seqs))
+ query_accession <- params$app_data@df %>% filter(!duplicated(QueryName))
+ query_accession <- unique(query_accession$Query)
+ query <- seqs[query_accession]
+ names(query) <- unique(params$app_data@df$QueryName)
+ query <- AAStringSet(query)
+ }
+ # Generate Fasta File
+ rep_fasta_path <- tempfile()
+
+ acc2FA(rep, outpath = rep_fasta_path, "sequential")
+ rename_fasta(rep_fasta_path, rep_fasta_path,
+ replacement_function = mapAcc2Name,
+ acc2name = params$acc2Name
+ )
+ if (!params$rval_phylo) {
+ writeXStringSet(query, rep_fasta_path, append = TRUE)
+ }
+ rep_msa_path <- tempfile()
+ alignFasta(rep_fasta_path, tree_msa_tool, rep_msa_path)
+}
+```
+
+```{r, echo=FALSE}
+library(msa)
+library(ape)
+library(tidytree)
+library(ggtree)
+if(length(params$repAccNums) >= 3 ) {
+ seqTree(fasta_filepath = rep_msa_path)
+ } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
+```
+
+### MSA
+
+```{r, include=FALSE}
+if(length(params$repAccNums) >= 3 ) {
+ if (!params$rval_phylo) {
+ seqs <- readAAStringSet(params$app_data@fasta_path)
+ names(seqs) <- sub(" .*", "", names(seqs))
+ query_accession <- params$app_data@df %>% filter(!duplicated(QueryName))
+ query_accession <- unique(query_accession$Query)
+ query <- seqs[query_accession]
+ # names(query) <- params$PhyloSelect
+ query <- AAStringSet(query)
+ }
+
+ # Generate Fasta File
+ rep_fasta_path <- tempfile()
+ acc2FA(rep, outpath = rep_fasta_path, "sequential")
+ rename_fasta(rep_fasta_path, rep_fasta_path,
+ replacement_function = mapAcc2Name,
+ acc2name = params$acc2Name
+ )
+ if (!params$rval_phylo) {
+ writeXStringSet(query, rep_fasta_path, append = TRUE)
+ }
+
+  # Render the MSA to a PDF via createMSA_PDF()
+ msa_parent_dir <- dirname(params$app_data@fasta_path)
+ msa_subdir_path <- file.path(msa_parent_dir, "msa_figs")
+ if (!dir.exists(msa_subdir_path)) {
+ dir.create(msa_subdir_path, recursive = TRUE)
+ message("Subdirectory 'msa_figs' created at: ", msa_subdir_path)
+ } else {
+ message("Subdirectory 'msa_figs' already exists at: ", msa_subdir_path)
+ }
+ msa_prefix <- paste0(msa_subdir_path, "/")
+  post_fix <- paste0(paste("msa", params$query_pin, params$PhyloSelect,
+    params$msa_reduce_by, sep = "_"), ".pdf")
+
+  msa_pdf_path <- paste0(msa_prefix, post_fix)
+
+ createMSA_PDF(fasta_path = rep_fasta_path, msa_pdf_path)
+ }
+```
+
+```{r msa_pdf, echo = FALSE, out.width = "95%", out.height = "800px"}
+# Requires a LaTeX installation for PDF rendering (e.g., apt-get install texlive)
+if(length(params$repAccNums) >= 3 ) {
+ knitr::include_graphics(msa_pdf_path)
+ } else {"Not enough representative sequences: try changing the 'Reduce By' field."}
+```
diff --git a/man/combineFiles.Rd b/man/combineFiles.Rd
index 81464fa6..afbc6c5f 100644
--- a/man/combineFiles.Rd
+++ b/man/combineFiles.Rd
@@ -20,7 +20,7 @@ The search is recursive (i.e., it will look in subdirectories as well).}
Default is "*full_analysis.tsv".}
\item{delim}{Character. The delimiter used in the input files.
-Default is tab ("\t").}
+Default is tab \code{"\t"}.}
\item{skip}{Integer. The number of lines to skip at the beginning of each file.
Default is 0.}
diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd
index 0eeb0610..9e91d606 100644
--- a/man/getTopAccByLinDomArch.Rd
+++ b/man/getTopAccByLinDomArch.Rd
@@ -6,7 +6,7 @@
\usage{
getTopAccByLinDomArch(
infile_full,
- DA_col = "DomArch.Pfam",
+ DA_col = "DomArch.PfamA",
lin_col = "Lineage_short",
n = 20,
query