diff --git a/.gitignore b/.gitignore index ef11006e..74dbc757 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .Rproj.user docs +common_data .Rhistory -.DS_Store \ No newline at end of file +.DS_Store diff --git a/DESCRIPTION b/DESCRIPTION index 0fa9a949..a30025de 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,6 +40,7 @@ Imports: DBI, doParallel, dplyr, + DT, forcats, fs, furrr, @@ -51,6 +52,8 @@ Imports: here, htmltools, htmlwidgets, + httr, + httr2, igraph, latexpdf, magrittr, @@ -73,6 +76,7 @@ Imports: tibble, tidyr, tinytex, + tidyverse, tools, UpSetR, viridis, @@ -82,4 +86,5 @@ Imports: XVector, yaml Suggests: - knitr + knitr, + iprscanr diff --git a/NAMESPACE b/NAMESPACE index 6ae464f6..a5a44a79 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -49,6 +49,7 @@ export(findParalogs) export(formatJobArgumentsHTML) export(generateAllAlignments2FA) export(getAccNumFromFA) +export(getCaseStudyReport) export(getProcessRuntimeWeights) export(getTopAccByLinDomArch) export(mapAcc2Name) @@ -98,6 +99,18 @@ export(wordcloud3) export(writeMSA_AA2FA) export(writeProcessRuntime2TSV) export(writeProcessRuntime2YML) +import(Biostrings) +import(DT) +import(data.table) +import(dplyr) +import(httr) +import(iprscanr) +import(plotly) +import(readr) +import(rentrez) +import(stringr) +import(tidyverse) +import(visNetwork) importFrom(Biostrings,AAStringSet) importFrom(Biostrings,readAAStringSet) importFrom(Biostrings,toString) diff --git a/R/MolEvolData_class.R b/R/MolEvolData_class.R new file mode 100644 index 00000000..ab496048 --- /dev/null +++ b/R/MolEvolData_class.R @@ -0,0 +1,719 @@ +# Author(s): Samuel Chen +# Last modified: 2020 + +#' @import Biostrings tidyverse + +setClass("MolEvolData", + slots = list( + df = "data.frame", + fasta_path = "character", + msa_path = "character", + ipr_path = "character", + queries = "character", + cln_path = "character", + domainSeqs = "character", + domain_ipr_path = "character" + ) +) +setClass("queryData", + slots = list( + df = "data.frame", + fasta_path = "character", + msa_path = "character", + ipr_path = "character", + queries = "character", + cln_path = "character" + ) +) +setClass("blastUpload", + slots = list( + df = "character", + seqs = "character" + ) +) +setClass("iprUpload", + slots = list( + df = "character", + seqs = "character" + ) +) +setClass("seqUpload", + slots = list( + seqs = "character" + ) +) + +combineFilesNopmap <- function(inpath, pattern, outpath, + delim = "\t", skip = 0, + col_names) { + source_files <- dir(path = inpath, pattern = pattern, recursive = T) + + source_files_path <- paste0(inpath, source_files) + + dt_list <- map(source_files_path, function(x) { + if (!grepl("query", x)) { + fread(x, sep = delim, skip = skip, fill = T) # , col.names = col_names) + } + }) + + combined <- rbindlist(dt_list, fill = T) + + fwrite(combined, outpath, sep = "\t") + + return(combined) +} + + +full_analysis_colnames <- c("AccNum", "QueryName", "STitle", "Species.x", + "TaxID.x", "Lineage", "PcPositive", "PcIdentity", + "AlnLength", "SAccNum", "SAllSeqID", "Mismatch", + "GapOpen", "QStart", "QEnd", "QLength", "SStart", + "SEnd", "SLength", "EValue", + "BitScore", "PcPosOrig", + "Name", "ClusterID", "DomArch.Pfam", + "DomArch.SMART", + "DomArch.CDD", "DomArch.TIGRFAM", + "DomArch.Phobius", + "DomArch.Gene3D", "DomArch.TMHMM", + "DomArch.SignalP_EUK", + "DomArch.SignalP_GRAM_NEGATIVE", + "DomArch.SignalP_GRAM_POSITIVE", + "Description", "Length", "TaxID.y", + "Species.y", "SourceDB", "Completeness") + + +processWrapperDir <- function(path, pinName, 
type = "full") {
+    if (type == "full") {
+        if (file.exists(paste0(path, "/cln_combined.tsv")) && file.exists(paste0(path, "/ipr_combined.tsv"))) {
+            query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+            query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+            query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+            query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+            query_data$Query <- query_data$AccNum
+            queries <- unique(query_data$Query)
+            cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+            query_data <- query_data %>%
+                mutate(QueryName = Name) %>%
+                select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+                distinct(QueryName, .keep_all = TRUE)
+            query_wrapper_data <- new("queryData",
+                df = query_data, fasta_path = query_sequence_path,
+                msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+            )
+            com_blast_path <- paste0(path, "/blast_combined.tsv")
+            com_blast_data <- read_tsv(com_blast_path) %>% arrange(desc(PcPositive))
+            ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+            queries <- unique(com_blast_data$Query)
+            com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+            com_blast_data <- com_blast_data %>% select(QueryName, everything())
+            wrapper_data <- new("MolEvolData",
+                df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
+                fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+            )
+
+            return(list(wrapper_data, query_wrapper_data))
+        } else {
+            # Get the query data
+            query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+            query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+            query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+            query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+            query_data$Query <- query_data$AccNum
+            # alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+            queries <- unique(query_data$Query)
+            cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+            query_data <- query_data %>%
+                mutate(QueryName = Name) %>%
+                select(QueryName, Species, contains("Lineage"), contains("DomArch"), Query, Name) %>%
+                distinct(QueryName, .keep_all = TRUE)
+            query_wrapper_data <- new("queryData",
+                df = query_data, fasta_path = query_sequence_path,
+                msa_path = query_msa_path, ipr_path = query_ipr_path, queries = queries, cln_path = cln_path
+            )
+
+            query_data <- query_data %>% select(Query, QueryName)
+
+            com_blast_path <- paste0(path, "/blast_combined.tsv")
+            ipr_blast_path <- paste0(path, "/ipr_combined.tsv")
+            com_blast_data <- combineFilesNopmap(paste0(path, "/"),
+                pattern = "*.full_analysis.tsv", skip = 0,
+                col_names = c(), outpath = com_blast_path, delim = "\t"
+            )
+
+            ipr_blast_data <- combineFilesNopmap(paste0(path, "/"),
+                pattern = "*.iprscan_cln.tsv", skip = 0,
+                col_names = ipr_colnames, outpath = ipr_blast_path, delim = "\t"
+            )
+            com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+            com_blast_data <- com_blast_data %>% arrange(desc(PcPositive))
+            fwrite(com_blast_data, com_blast_path, sep = "\t")
+
+            queries <- unique(com_blast_data$Query)
+
+            com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+            com_blast_data <- com_blast_data %>% select(QueryName, everything())
+
+            wrapper_data <- new("MolEvolData",
+                df = com_blast_data, queries = queries, ipr_path = ipr_blast_path, cln_path = com_blast_path,
fasta_path = query_sequence_path, msa_path = query_msa_path, domainSeqs = "", domain_ipr_path = ""
+            )
+
+            return(list(wrapper_data, query_wrapper_data))
+        }
+    } else if (type == "dblast") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
+        query_data <- query_data %>%
+            mutate(QueryName = Name) %>%
+            select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+                Query, Name, AccNum) %>%
+            distinct(QueryName, .keep_all = TRUE)
+        queries <- unique(query_data$Query)
+        query_wrapper_data <- new("queryData",
+            df = query_data,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path,
+            ipr_path = query_ipr_path,
+            queries = queries, cln_path = cln_path
+        )
+        query_data <- query_data %>% select(Query, QueryName)
+        com_blast_path <- paste0(path, "/blast_combined.tsv")
+        com_blast_data <- combineFilesNopmap(paste0(path, "/"),
+            pattern = "*.blast.cln.tsv",
+            skip = 0,
+            col_names = c(),
+            outpath = com_blast_path,
+            delim = "\t"
+        )
+        com_blast_data <- merge(com_blast_data, query_data, by = "Query")
+        queries <- unique(com_blast_data$Query)
+        com_blast_data$Lineage <- as.factor(com_blast_data$Lineage)
+        com_blast_data <- com_blast_data %>%
+            select(QueryName, everything()) %>%
+            arrange(desc(PcPositive))
+        wrapper_data <- new("MolEvolData",
+            df = com_blast_data, queries = queries,
+            ipr_path = "", cln_path = com_blast_path,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path, domainSeqs = "",
+            domain_ipr_path = ""
+        )
+        return(list(wrapper_data, query_wrapper_data))
+    } else if (type == "phylo") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
+        query_data <- query_data %>%
+            mutate(QueryName = Name) %>%
+            select(QueryName, Species, contains("Lineage"), contains("DomArch"),
+                Query, Name, AccNum) %>%
+            distinct(QueryName, .keep_all = TRUE)
+        queries <- unique(query_data$Query)
+        query_wrapper_data <- new("queryData",
+            df = query_data,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path,
+            ipr_path = query_ipr_path,
+            queries = queries, cln_path = cln_path
+        )
+        wrapper_data <- new("MolEvolData",
+            df = query_data,
+            fasta_path = query_sequence_path,
+            msa_path = query_msa_path,
+            ipr_path = query_ipr_path,
+            queries = queries, cln_path = cln_path,
+            domainSeqs = "", domain_ipr_path = ""
+        )
+        return(list(wrapper_data, query_wrapper_data))
+    } else if (type == "da") {
+        query_data <- read_tsv(paste0(path, "/query_data/query_data.full_analysis.tsv"))
+        query_ipr_path <- paste0(path, "/query_data/query_data.iprscan_cln.tsv")
+        query_msa_path <- paste0(path, "/query_data/query_seqs.msa")
+        query_sequence_path <- paste0(path, "/query_data/query_data.all_accnums.fa")
+        alignFasta(query_sequence_path, tool = "ClustalO", outpath = query_msa_path)
+        cln_path <- paste0(path, "/query_data/query_data.full_analysis.tsv")
+        query_data <- query_data %>% mutate(Query = AccNum)
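+        # Collapse to one representative row per query name before the
+        # per-query table is merged back onto the combined BLAST hits below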
query_data <- query_data %>% + mutate(QueryName = Name) %>% + select(QueryName, Species, contains("Lineage"), contains("DomArch"), + Query, Name) %>% + distinct(QueryName, .keep_all = TRUE) + queries <- unique(query_data$Query) + query_wrapper_data <- new("queryData", + df = query_data, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + ipr_path = query_ipr_path, + queries = queries, cln_path = cln_path + ) + com_blast_path <- paste0(path, "/blast_combined.tsv") + com_blast_data <- combineFilesNopmap(paste0(path, "/"), + pattern = "*.full_analysis.tsv", + skip = 0, + col_names = c(), + outpath = com_blast_path, + delim = "\t" + ) + com_blast_data <- merge(com_blast_data, query_data, by = "Query") + com_blast_data <- com_blast_data %>% + select(QueryName, everything()) %>% + arrange(desc(PcPositive)) + wrapper_data <- new("MolEvolData", + df = com_blast_data, queries = queries, + ipr_path = "", cln_path = com_blast_path, + fasta_path = query_sequence_path, + msa_path = query_msa_path, + domainSeqs = "", domain_ipr_path = "" + ) + return(list(wrapper_data, query_wrapper_data)) + } else { + stop("Unrecognized type. Please use 'full', 'dblast', 'phylo', or 'da'.") + } +} + +drop_empty <- function(df) { + for (c in colnames(df)) { + if (all(is.na(df[, ..c])) || all(df[, ..c] == "")) { + df <- df %>% select(-contains(c)) + } + } + return(df) +} + + +clean_fetched <- function(df) { + df <- as.data.frame(df) + # df = df %>% cleanup_species() + + # Cleanup domarchs + cols <- colnames(df) + for (c in cols) { + if (grepl("^DomArch", c)) { + # Repeats + old <- c + new <- paste0(c, ".repeats") + df <- df %>% cleanup_domarch( + old = old, new = new, + domains_rename = NULL, # domains_rename, + domains_keep = NULL, # filter applied to only ClustName for now. + domains_ignore = NULL, # !! should check and remove soon! + repeat2s = FALSE, + remove_tails = F, # new! check below if it works! + remove_empty = F + ) + + new <- c + df <- df %>% cleanup_domarch( + old = old, new = new, + domains_rename = NULL, # domains_rename, + domains_keep = NULL, # filter applied to only ClustName for now. + domains_ignore = NULL, # !! should check and remove soon! + repeat2s = TRUE, + remove_tails = F, # new! check below if it works! + remove_empty = F + ) + } + } + + + return(df) +} + +# Description +# validation functions for accession numbers/headers for FASTA (validateAccNumFasta) +# & accession number input (validateAccNum) submission types. 
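+#
+# A minimal usage sketch (the inputs shown are hypothetical; both helpers are
+# defined in this file):
+#
+#   validateAccNumFasta(">sp|P12345|DEMO example\nMKVLAA")  # FALSE on duplicate headers
+#   validateAccNum("P12345, WP_000000000.1")  # FALSE on duplicates, otherwise a
+#                                             # list(validation_result, failed_accnums)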
+
+library(Biostrings)
+library(httr)
+library(httr2)
+library(rentrez)
+library(shiny)
+
+getAccNumFromFasta <- function(biostrings_aa_string_set, verbose = FALSE) {
+    # parsing/cleaning accession numbers using the same methods as
+    # `upstream_scripts/00_submit_full.R`'s `get_sequences()` function
+    accnums <- c()
+    for (header in names(biostrings_aa_string_set)) {
+        # case 1 UniProtKB formatted FASTA
+        # header: https://www.uniprot.org/help/fasta-headers
+        if (grepl("\\|", header)) {
+            accnum <- unlist(strsplit(header, "\\|"))[2]
+        # case 2 NCBI formatted FASTA
+        # header: https://www.ncbi.nlm.nih.gov/genbank/fastaformat
+        } else if (grepl(" ", header)) {
+            accnum <- unlist(strsplit(header, " "))[1]
+        # case 3 neither delimiter present; use the whole header
+        } else {
+            accnum <- header
+        }
+        # print debug info
+        if (verbose == TRUE) {
+            cat("header:", header, "\n")
+            cat("accnum_parsed:", accnum, "\n")
+        }
+        accnums <- append(accnums, accnum)
+    }
+    if (verbose) print(accnums)
+    return(accnums)
+}
+
+validateAccNumFasta <- function(text) {
+    # INPUT: text from input object that houses FASTA data
+    # validate the headers/accnums for FASTA submission
+    # Return:
+    #   T: when no duplicate accnums after parsing
+    #   F: when duplicate accnums are present after parsing
+
+    # TRY write tmp file for biostrings parsing
+    path <- tryCatch(
+        expr = {
+            path <- tempfile()
+            write(text, path)
+            path
+        },
+        error = function(e) {
+            NULL
+        }
+    )
+    if (is.null(path)) {
+        cat("failed to write temp fasta during accnum validation\n", file = stderr())
+        return(FALSE) # validation fail
+    }
+
+    # TRY read seq
+    fasta <- tryCatch(
+        expr = {
+            fasta <- Biostrings::readAAStringSet(path)
+            fasta
+        },
+        error = function(e) {
+            NULL
+        },
+        finally = {
+            unlink(path)
+        }
+    )
+    if (is.null(fasta)) {
+        cat("failed to read sequence during accnum validation\n", file = stderr())
+        return(FALSE) # validation fail
+    }
+
+    # parse accnums from fasta
+    accnums <- getAccNumFromFasta(fasta)
+    tb_accnum_counts <- tibble("frequencies" = table(accnums))
+    # check for duplicates
+    if (any(tb_accnum_counts$frequencies > 1)) {
+        cat("duplicate headers found during fasta validation\n", file = stderr())
+        return(FALSE) # validation fail
+    } else {
+        return(TRUE)
+    }
+}
+
+
+# Test whether a single accession returns a valid protein from Entrez
+isAccNumValidEntrez <- function(accnum, verbose = FALSE) {
+    # an empty accnum will not raise an error from efetch, so test for this first
+    if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+
+    # try performing a POST of the accnum, followed by a GET request for a
+    # protein sequence
+    # rentrez::entrez_fetch() returns an error when there's no protein,
+    # so we're using try statements to handle this and return FALSE upon error
+    result <- tryCatch(
+        expr = {
+            result <- rentrez::entrez_fetch(db = "protein", id = accnum, rettype = "fasta")
+            if (verbose) {print(result)}
+            TRUE
+        },
+        error = function(e) {if (verbose) {print(e)}; FALSE}
+    )
+    return(result)
+}
+
+# Test whether a single accession returns a valid protein from EBI
+isAccNumValidEbi <- function(accnum, verbose = FALSE) {
+    # validation: ensure there's some text to parse
+    if (nchar(accnum) <= 0) {if (verbose) {warning("empty accnum")}; return(FALSE)}
+    # construct a httr2 request to POST an accession number, then GET the fasta
+    url_base <- "https://www.ebi.ac.uk/proteins/api/proteins?accession="
+    url_protein <- paste0(url_base, accnum)
+    req <- httr2::request(url_protein) |>
+        httr2::req_headers("Accept" =
"text/x-fasta", "Content-type" = "application/x-www-form-urlencoded") + + # wrap in try since a failed HTTP request will raise an R error + try(httr2::req_perform(req)) + # get the HTTP response code + resp <- httr2::last_response() + # assign result based on HTTP response code ('200' is a success) + result <- ifelse(resp$status == 200, TRUE, FALSE) + if (verbose) { + msg <- stringr::str_glue( + "EBI protein query results:\n", + "\taccnum: {accnum}\t validation result: {result}\n" + ) |> print() + } + return(result) +} + +# Perform a series of API reqs using NCBI entrez to validate accession numbers +performEntrezReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { + # API guidelines docs + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + # entrez recommends no more than 3 POSTs per second + # simple method, sleep for 1 second after every 3rd POST + i <- 0 + results <- vapply( + X = accnums, + FUN = function(accnum) { + result <- isAccNumValidEntrez(accnum, verbose = TRUE) + i <<- i + 1L + if (i >= 3L) {Sys.sleep(1); print('sleeping for entrez API reqs'); i <<- 0L} + if (track_progress) {incProgress(1)} + result + }, + FUN.VALUE = logical(1) + ) + return(results) +} + +performEbiReqs <- function(accnums, verbose = FALSE, track_progress = FALSE) { + # API guidelines docs + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + # EBI allows 200 POSTs per second + # simple method, sleep for 1 second after every 200th POST + i <- 0 + results <- vapply( + X = accnums, + FUN = function(accnum) { + result <- isAccNumValidEbi(accnum, verbose = TRUE) + i <<- i + 1L + if (i >= 200L) {Sys.sleep(1); i <<- 0L} + if (track_progress) incProgress(1) + result + }, + FUN.VALUE = logical(1) + ) + return(results) +} + +# Validate accession numbers from MolEvolvR user input +validateAccNum <- function(text, verbose = FALSE, track_progress = FALSE, n_steps = integer()) { + # API guidelines docs + # entrez https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen + # ebi: https://www.ebi.ac.uk/proteins/api/doc/index.html + + # get accnums from three possible delimiters + # 1. comma separated values with any amount of space between them + # 2. one or more space delimiter + # 3. "\n" (newline) delimiter + accnums <- unlist(strsplit(text, "\\s*,\\s*|\\s+|\\n|,")) + + warning() + + if (track_progress) {incProgress(1 / n_steps, + message = "Testing for duplicate accession numbers . . .")} + # fail fast for duplicates + if (any(duplicated(accnums))) { + warning("duplicate accesion numbers found") + if (track_progress) {setProgress(n_steps, + message = "Duplicate accession numbers found")} + return(FALSE) + } + if (track_progress) {incProgress(1 / n_steps, + message = "Please wait (searching NCBI's protein database) . . .")} + # entrez API + entrez_results <- performEntrezReqs(accnums, verbose = verbose) + if (verbose) { + stringr::str_glue("rentrez + results:\n\t{paste0(entrez_results, collapse=",")}\n") |> + print() + } + + # EBI API + # if all of entrez resulted in success, then skip EBI. Else, try EBI + if (!all(entrez_results)) { + ebi_results <- performEbiReqs(accnums, verbose = verbose) + if (verbose) { + stringr::str_glue("ebi results:\n\t{paste0(ebi_results, + collapse=",")}\n") |> + print() + } + if (track_progress) {incProgress(1 / n_steps, message = "Please wait + (searching EBI's protein database) . . 
.")} + # OR test each accnum result across both dbs + final_results <- ebi_results | entrez_results + } else { + if (track_progress) {setProgress(n_steps, message = "All accession + numbers validated")} + final_results <- entrez_results + } + failed_accnums <- accnums[which(!final_results)] + if (length(failed_accnums) >= 1) { + msg <- stringr::str_glue( + "The following accnums failed validation:\n", + "\t{paste0(failed_accnums, collapse=",")}\n" + ) + warning(msg) + if (track_progress) {setProgress(n_steps, message = "Accession number + validation failed")} + } + setProgress(n_steps, message = "Accession number validation finished") + return( + list( + "validation_result" = all(final_results), + "failed_accnums" = failed_accnums + ) + ) +} + +#=============================================================================== +# Description +# validation function for evalue input +#=============================================================================== +validateEvalue <- function(input_value) { + is_valid_evalue <- is.numeric(input_value) && input_value != 0 + return(is_valid_evalue) +} + +#=============================================================================== +# Author(s): JK +# Last modified: 2023_06 +#=============================================================================== +#------------------------------------------------------------------------------- +guessSeqType <- function(single_fasta, dna_guess_cutoff = 0.9, other_guess_cutoff = 0.5) { + tb <- as_tibble(alphabetFrequency(single_fasta)) + n_other <- if ("other" %in% colnames(tb)) sum(unlist(tb["other"])) else 0 + + aa_cols <- setdiff(Biostrings::AA_ALPHABET, c("*", ".")) + tb_dna <- tb %>% select(all_of(Biostrings::DNA_ALPHABET)) + tb_aa <- tb %>% select(all_of(aa_cols)) + + total <- nchar(single_fasta) + est_dna_prop <- sum(unlist(tb_dna)) / total + est_aa_prop <- sum(unlist(tb_aa)) / total + other_prop <- n_other / total + cat( + names(single_fasta), "\n", + "estimated DNA alphabet proportion:", est_dna_prop, "\n", + "estimated AA alphabet proportion:", est_aa_prop, "\n", + "'other' alphabet proportion:", other_prop, "\n" + ) + + if (est_dna_prop >= dna_guess_cutoff) { + guess <- "DNA" + } else if (other_prop >= other_guess_cutoff) { + guess <- NA + } else { + guess <- "AA" + } + cat("Guess:", guess, "\n") + return(guess) +} + +validateSeqBody <- function(text) { + # convert string to single letter character vector + individual_chars <- unlist(strsplit(text, "")) + # return the characters from input that are NOT in the AA_ALPHABET + invalid_chars <- setdiff(individual_chars, Biostrings::AA_ALPHABET) + n_invalid_chars <- length(invalid_chars) + + # 0 length character vector means the sequence contains + # characters that are all present in the AA_ALPHABET + # >0 length character vector means there are some characters + # which are NOT found in the AA_ALPHABET + if (n_invalid_chars == 0) { + return(TRUE) + } else { + warning("A sequence contains characters not found in the AA_ALPHABET") + return(FALSE) + } +} + +#------------------------------------------------------------------------------- +validateFasta <- function(fasta_path, .type = "AA") { + # handle case of fasta being unreadable/unrecognized format + fasta <- tryCatch( + expr = Biostrings::readAAStringSet(fasta_path), + error = function(e) { + warning(paste0("error: failed to read sequence from: ", fasta_path)) + return(NULL) + } + ) + # failure to read yields quick return + if (is.null(fasta)) { + return(FALSE) # validation fail + } + + # require at 
least some characters for the sequence + if (!(sum(nchar(unlist(fasta))) >= 1)) { + warning(paste0("fasta does not have any sequence characters")) + return(FALSE) # validation fail + } + print(fasta) + + ### validate sequence BODY + # iteratively return boolean value for valid seq body (FALSE = invalid) + seq_body_validations <- c() + for (i in seq_along(1:length(fasta))) { + seq_body <- as.character(fasta[i]) + seq_body_validations <- c(validateSeqBody(seq_body), seq_body_validations) + } + # require all sequences to have chars only in AA_ALPHABET + if (!(all(seq_body_validations))) { + warning("At least one sequence has non-IUPAC AA characters") + return(FALSE) # validation fail + } + + ### validate sequence TYPE + # iteratively guess the type (DNA, AA, or other of each seq in the FASTA file) + seq_types <- c() + for (idx in seq_along(1:length(fasta))) { + seq_types <- c(guessSeqType(fasta[idx]), seq_types) + } + + switch(.type, + "DNA" = { + anti_type <- "AA" + }, + "AA" = { + anti_type <- "DNA" + } + ) + + # if all seqs are of desired .type, only then TRUE + if (sum(is.na(seq_types)) != 0) { + return(FALSE) # validation fail + } else if (anti_type %in% seq_types) { + return(FALSE) # validation fail + } else { + return(TRUE) # validation success + } +} + +# Author(s): Samuel Chen, Lo Sosinski, Joe Burke +# Last modified: 2021 + +alpha_numeric <- c(0:9, letters, LETTERS) + +rand_string <- function(length, characters = alpha_numeric, post_fix = "", ignorelist = c()) { + str <- NULL + + while (is.null(str) || (str %in% ignorelist)) { + chars <- sample(characters, size = length, replace = TRUE) + str <- paste0(paste(chars, collapse = ""), post_fix) + } + + return(str) +} +strsplit_vect <- function(strings, pattern = "", pos) { + split_strings <- map(strsplit(strings, "_"), function(x) x[1]) %>% unlist() + + return(split_strings) +} diff --git a/R/combine_files.R b/R/combine_files.R index 4f03b1d2..1c5ac6a2 100755 --- a/R/combine_files.R +++ b/R/combine_files.R @@ -24,22 +24,22 @@ #' #' @author Janani Ravi #' -#' @param inpath Character. The master directory path where the files reside. +#' @param inpath Character. The master directory path where the files reside. #' The search is recursive (i.e., it will look in subdirectories as well). -#' @param pattern Character. A search pattern to identify files to be combined. +#' @param pattern Character. A search pattern to identify files to be combined. #' Default is "*full_analysis.tsv". -#' @param delim Character. The delimiter used in the input files. -#' Default is tab ("\t"). -#' @param skip Integer. The number of lines to skip at the beginning of each file. +#' @param delim Character. The delimiter used in the input files. +#' Default is tab \code{"\t"}. +#' @param skip Integer. The number of lines to skip at the beginning of each file. #' Default is 0. -#' @param col_names Logical or character vector. If TRUE, the first row of each file -#' is treated as column names. Alternatively, a character vector can +#' @param col_names Logical or character vector. If TRUE, the first row of each file +#' is treated as column names. Alternatively, a character vector can #' be provided to specify custom column names. #' #' @importFrom purrr pmap_dfr #' @importFrom readr cols #' -#' @return A data frame containing the combined contents of all matched files. +#' @return A data frame containing the combined contents of all matched files. #' Each row will include a new column "ByFile" indicating the source file of the data. 
#' #' @export diff --git a/R/generate_report.R b/R/generate_report.R new file mode 100644 index 00000000..65b8364d --- /dev/null +++ b/R/generate_report.R @@ -0,0 +1,740 @@ +# Author(s): Awa Synthia +# Last modified: 2024 + +# import libs +#' @import readr stringr + +#' @export +# example usage: getCaseStudyReport("Acinetobacter baumannii", "Beta-lactams") +getCaseStudyReport <- function(pathogen = NULL, drug = NULL, ...) { + cat("\n") + cat("\033[1;36m") # Cyan for the "MolEvolvR" header + cat("=========================================\n") + cat(" M o l E v o l R\n") + cat("=========================================\n") + cat("\033[0m") + + cat("\033[1;32m") + cat("\n>>>>> Case Study Report Generation started... <<<<<\n") + cat("\033[0m") + + # Step 1: Fetch the FASTA sequences of the given pathogen and drug + getCardData(pathogen, drug) + + # Check if the FASTA file exists after fetching + fasta_file <- "filtered_proteins.fasta" + if (!file.exists(fasta_file)) { + stop("Failed to retrieve FASTA sequences. The file + 'filtered_proteins.fasta' does not exist.") + } + # Step 2: Once the FASTA file is ready, call runAnalysis to run pipeline + message("FASTA file downloaded, running analysis...") + + # Pass the file to runAnalysis function + runAnalysis(file_paths = list(fasta = fasta_file), ...) + + message("Case study report completed") +} + +# get fasta of pathogen and/or drug +getCardData <- function(pathogen = NULL, drug = NULL) { + destination_dir <- "CARD_data" + # Check if CARD data exists + if (!dir.exists(destination_dir)) { + dir.create(destination_dir) + } + if (!file.exists("CARD_data/aro_index.tsv")) { + # Step 1: Download CARD Data + download.file("https://card.mcmaster.ca/latest/data", "card_data.tar.bz2") + #unzip("card_data.tar.bz2", exdir = "CARD_data") + system(paste("tar -xjf card_data.tar.bz2 -C", destination_dir)) + } + + # Step 2: Open ARO_index.tsv + aro_data <- read.delim("CARD_data/aro_index.tsv", sep = "\t", header = TRUE) + + # Step 3: Map CARD Short Name + antibiotics <- read.delim("CARD_data/shortname_antibiotics.tsv", + sep = "\t", header = TRUE) + pathogens <- read.delim("CARD_data/shortname_pathogens.tsv", + sep = "\t", header = TRUE) + + aro_data <- aro_data %>% + mutate( + pathogen = str_extract(CARD.Short.Name, "^[^_]+"), + gene = str_extract(CARD.Short.Name, "(?<=_)[^_]+"), + drug = str_extract(CARD.Short.Name, "(?<=_)[^_]+$") + ) %>% + left_join(pathogens, by = c("pathogen" = "Abbreviation")) %>% + left_join(antibiotics, by = c("drug" = "AAC.Abbreviation")) + + # Sort names + aro_data <- aro_data %>% + arrange(Pathogen, Molecule) %>% + group_by(Pathogen, Molecule) + + # Filter data based on user input + filtered_data <- aro_data + + if (!is.null(pathogen)) { + filtered_data <- filtered_data %>% filter(Pathogen == !!pathogen) + } + + if (!is.null(drug)) { + filtered_data <- filtered_data %>% filter(Molecule == !!drug) + } + + # Check if filtered data is empty + if (nrow(filtered_data) == 0) { + stop("No data found for the specified pathogen and drug.") + } + + # Extract protein accessions + accessions <- filtered_data$Protein.Accession + + # Function to fetch FASTA sequence from NCBI + get_fasta <- function(accession) { + entrez_fetch(db = "protein", id = accession, rettype = "fasta") + } + + # Download sequences + fasta_sequences <- lapply(accessions, get_fasta) + + # Write sequences to a file + writeLines(unlist(fasta_sequences), "filtered_proteins.fasta") + + return("FASTA sequences downloaded to 'filtered_proteins.fasta'.") +} + +# Run analysis 
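+# A minimal calling sketch for the wrapper below ("proteins.fasta" is a
+# hypothetical local file; every other argument falls back to the defaults in
+# the signature):
+#   runAnalysis(file_paths = list(fasta = "proteins.fasta"))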
+runAnalysis <- function( + upload_type = "Fasta", + evalue = 0.00001, + accnum_fasta_input = "", + file_paths = list(accnum = NULL, fasta = tempfile(), msa = tempfile(), + blast = NULL, iprscan = NULL), + blast_db = "refseq_protein", + blast_hits = 100, + blast_eval = 0.0001, + acc_homology_analysis = TRUE, + acc_da_analysis = TRUE, + acc_phylogeny_analysis = FALSE, + report_template_path = system.file("report", "report_template.Rmd", package = "MolEvolvR", mustWork = TRUE), + output_file = file.path(getwd(), "report.html"), + DASelect = "All", + mainSelect = NULL, + PhyloSelect = "All", + q_heatmap_select = "All", + DACutoff = 95, + GCCutoff = 0.5, + query_select = NULL, + query_iprDatabases = c( + "PfamA", "SMART", "Phobius", + "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE", + "SuperFamily", "MobiDBLite", "Panther", "Coils" + ), + query_iprVisType = "Analysis", tree_msa_tool = "ClustalO", + levels = 2, + DA_Col = "DomArch.PfamA", + msa_rep_num = 10, + msa_reduce_by = "Species", + rval_phylo = FALSE, + DA_lin_color = c("default", "viridis", "inferno", "magma", "plasma", "cividis"), + networkLayout = c("nice", "grid", "circle", "random"), + phylo_select = "All", + ... + +) { + + ##### Initialize Variables and Classes ##### + blast_upload_data <- new("blastUpload", df = "", seqs = "") + ipr_upload_data <- new("iprUpload", df = "", seqs = "") + sequence_upload_data <- new("seqUpload", seqs = "") + data <- new("MolEvolData", msa_path = tempfile(), + fasta_path = file_paths$fasta) + app_data <- new("MolEvolData", msa_path = tempfile(), + fasta_path = file_paths$fasta) + query_data <- new("queryData") + + if (length(upload_type) != 1) { + stop("upload_type must be a single value.") + } + + # Reset default analysis function + resetSettings <- function() { + acc_homology_analysis <<- FALSE + acc_da_analysis <<- FALSE + acc_phylogeny_analysis <<- FALSE + domain_split <<- FALSE + } + + fasta_set <- c("Fasta", "AccNum", "MSA") + + # Update settings based on upload type + updateUploadType <- function(upload_type) { + switch(upload_type, + "Fasta" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "AccNum" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "MSA" = { + resetSettings() + acc_homology_analysis <<- TRUE + acc_da_analysis <<- TRUE + }, + "BLAST Output" = { + resetSettings() + acc_phylogeny_analysis <<- TRUE + }, + "InterProScan Output" = { + resetSettings() + acc_da_analysis <<- TRUE + } + ) + } + + updateUploadType(upload_type) + + ####### File Upload Functions ######## + + # Update function to return modified object + uploadAccNumFile <- function(file_path, seq_obj) { + accnum_data <- read_file(file_path) + seq_obj@seqs <- accnum_data + return(seq_obj) + } + + uploadFastaFile <- function(file_path, seq_obj) { + fasta_data <- read_file(file_path) + seq_obj@seqs <- fasta_data + return(seq_obj) + } + + uploadMSAFile <- function(file_path, seq_obj) { + msa_data <- read_file(file_path) + seq_obj@seqs <- msa_data + return(seq_obj) + } + + uploadBlastFile <- function(file_path, blast_obj) { + blast_obj@df <- file_path + return(blast_obj) + } + + uploadIPRScanFile <- function(file_path, ipr_obj) { + ipr_obj@df <- file_path + return(ipr_obj) + } + + # Now modify the calling section to reassign the modified objects + if (!is.null(file_paths$accnum)) { + sequence_upload_data <- uploadAccNumFile(file_paths$accnum, + sequence_upload_data) + } + if (!is.null(file_paths$fasta)) { + sequence_upload_data <- 
uploadFastaFile(file_paths$fasta,
+            sequence_upload_data)
+    }
+    if (!is.null(file_paths$msa)) {
+        sequence_upload_data <- uploadMSAFile(file_paths$msa,
+            sequence_upload_data)
+    }
+    if (!is.null(file_paths$blast)) {
+        blast_upload_data <- uploadBlastFile(file_paths$blast,
+            blast_upload_data)
+    }
+    if (!is.null(file_paths$iprscan)) {
+        ipr_upload_data <- uploadIPRScanFile(file_paths$iprscan,
+            ipr_upload_data)
+    }
+
+    # Validation of inputs
+    fasta_data <- read_file(file_paths$fasta)
+    validate_and_process_inputs <- function(evalue, fasta_data) {
+        if (!validateEvalue(evalue)) {
+            return("Error: A numeric E-value is required. Please set a valid value (e.g., 0.0001).")
+        }
+
+        if (!validateAccNumFasta(fasta_data)) {
+            return("Error: Input for AccNum/Fasta cannot be empty or invalid.")
+        }
+
+        sequence_vector <- unlist(strsplit(fasta_data, "\n"))
+        return(list(message = "Inputs are valid!", sequences = sequence_vector))
+    }
+
+    validation_result <- validate_and_process_inputs(evalue, fasta_data)
+    if (is.character(validation_result)) {
+        return(validation_result)
+    }
+
+    # Phylogenetic Analysis Validation
+    phylo <- acc_phylogeny_analysis
+    if (phylo) {
+        # Validate phylogenetic analysis based on upload type
+        is_valid_phylo <- switch(
+            upload_type,
+            "Fasta" = {
+                str_count(string = sequence_upload_data@seqs, ">") > 1
+            },
+            "AccNum" = {
+                accnums <- sequence_upload_data@seqs |>
+                    strsplit("\\s*,\\s*|\\s+|\\n|,") |>
+                    unlist()
+                length(accnums) > 1
+            },
+            "MSA" = {
+                str_count(string = sequence_upload_data@seqs, ">") > 1
+            },
+            FALSE # Default to FALSE if no valid type is found
+        )
+
+        if (!is_valid_phylo) {
+            return("Error: At least two sequences/identifiers are required for phylogenetic analysis.")
+        }
+    }
+    # Fasta-like submissions
+    if (upload_type %in% fasta_set) {
+        # Can have any combination of select options
+        type <- ""
+        script <- ""
+        postfix <- ""
+
+        if (acc_homology_analysis && !phylo && acc_da_analysis) {
+            type <- "full"
+            postfix <- "full"
+        }
+        # Phylogenetic analysis: do the full script but skip blast
+        else if (phylo && acc_da_analysis) {
+            type <- "phylo"
+            postfix <- "phylo"
+        }
+        else if (acc_da_analysis) {
+            type <- "da"
+            postfix <- "da"
+        }
+        else if (acc_homology_analysis) {
+            # Only run BLAST
+            type <- "dblast"
+            postfix <- "dblast"
+        }
+        else {
+            # Nothing was selected, throw an error
+            stop("Please select one of the above analyses (full, phylo, da, dblast) to run.")
+        }
+    }
+
+    # After uploading the sequence data, check the uploaded data
+    if (sequence_upload_data@seqs == "") {
+        stop("Error: Please upload a protein sequence.")
+    }
+    OUT_PATH <- getwd()
+    unavailable_pins <- list.files(OUT_PATH)
+    unavailable_pins <- strsplit_vect(unavailable_pins, pattern = "_")
+    pinName <- rand_string(length = 6, post_fix = "", ignorelist = unavailable_pins)
+    pin_id <- strtrim(pinName, 6)
+
+    dir <- paste0(OUT_PATH, "/", pin_id, "_", postfix)
+    path <- paste0(dir, "/", pin_id, ".fa")
+    # Adding validation logic based on upload type
+    system(paste0("mkdir ", dir), wait = TRUE)
+    switch(upload_type,
+        "Fasta" = {
+            # Validate sequence limit
+            if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+            }
+
+            # Validate accession numbers for FASTA submission
+            if (!(validateAccNumFasta(sequence_upload_data@seqs))) {
+                stop("Error: Please adjust the FASTA headers. Ensure a header line for each sequence, no duplicate header names, and no duplicate protein accession numbers.")
+            }
+
+            # TRY load fasta for fasta validation
+            is_valid_aa_fasta <- tryCatch(
+                expr = {
+                    tmp_file <- tempfile()
+                    writeLines(sequence_upload_data@seqs, tmp_file)
+                    validateFasta(tmp_file)
+                },
+                error = function(e) {
+                    warning("Error: Failed to run input FASTA verification.")
+                    return(FALSE) # Return FALSE if an error occurs
+                },
+                finally = {
+                    unlink(tmp_file)
+                }
+            )
+
+            # Validate AA fasta
+            if (!(is_valid_aa_fasta)) {
+                stop("Error: The FASTA input could not be recognized as valid Amino Acid (AA) sequence(s). MolEvolvR only accepts FASTA formatted AA/protein sequences as input (not DNA/RNA). For a short reference on FASTA format, review this https://blast.ncbi.nlm.nih.gov/doc/blast-topics/short NCBI guide.")
+            }
+
+            write(sequence_upload_data@seqs, path)
+        },
+        "AccNum" = {
+            # Convert Acc to FASTA
+            accnum_vect <- unlist(strsplit(sequence_upload_data@seqs, "\\s*,\\s*|\\s+|\\n|,"))
+            if (length(accnum_vect) > 200) {
+                stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+            }
+
+            # if an error is raised, validation fails
+            validation_results <- purrr::map_lgl(
+                accnum_vect,
+                function(accnum) {
+                    tryCatch(
+                        expr = {
+                            tmp <- tempfile(
+                                pattern = paste0("molevolvr_accnum_validation-", accnum, "-", "XXXXX"),
+                                fileext = ".fa"
+                            )
+                            acc2FA(accnum, tmp)
+                            readAAStringSet(tmp)
+                            TRUE
+                        },
+                        error = function(e) {
+                            FALSE
+                        },
+                        finally = {
+                            suppressWarnings(sink())
+                            unlink(tmp)
+                        }
+                    )
+                }
+            )
+
+            # If any accession numbers fail, halt submission
+            if (!all(validation_results)) {
+                stop(stringr::str_glue(
+                    "Error: MolEvolvR could not locate sequences for the following accession numbers: ",
+                    "[ {paste0(accnum_vect[which(!validation_results)], collapse = ', ')} ] ",
+                    "Please try submitting FASTA sequences instead."
+                ))
+            } else {
+                # Write a multifasta
+                acc2FA(accnum_vect, outpath = path)
+            }
+        },
+        "MSA" = {
+            # Validate sequence limit
+            if (str_count(sequence_upload_data@seqs, ">") > 200) {
+                stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+            }
+
+            msa_temp <- str_replace_all(sequence_upload_data@seqs, "-", "")
+            write(msa_temp, path)
+        }
+    )
+
+    phylo <- if_else(phylo, "TRUE", "FALSE")
+
+    # Clean FASTA headers
+    fasta <- Biostrings::readAAStringSet(filepath = path)
+    headers_original <- names(fasta)
+    headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x))
+    fasta <- cleanFAHeaders(fasta)
+    headers_cleaned <- names(fasta)
+
+    # Write a table to map original accnums to their cleaned version
+    readr::write_tsv(
+        x = tibble::tibble(
+            "header_original" = headers_original,
+            "header_accnum" = headers_accnum,
+            "header_clean" = headers_cleaned
+        ),
+        file = file.path(dir, "query-fasta_header-map.tsv")
+    )
+
+    # Remove '*' characters which are incompatible with interproscan
+    fasta <- gsub(pattern = "\\*", replacement = "X",
+        x = fasta) |> Biostrings::AAStringSet()
+    Biostrings::writeXStringSet(x = fasta, filepath = path)
+
+    if (domain_split) {
+        submit_split_by_domain(
+            dir = dir,
+            sequences = path,
+            DB = blast_db,
+            NHITS = blast_hits,
+            EVAL = blast_eval,
+            phylo = phylo,
+            type = type,
+            job_code = pin_id
+        )
+    } else {
+        runFull(
+            dir = dir,
+            sequences = path,
+            DB = blast_db,
+            NHITS = blast_hits,
+            EVAL = blast_eval,
+            phylo = phylo,
+            type = type,
+            job_code = pin_id,
+            APPLICATION = query_iprDatabases
+        )
+    }
+
+    # Additional handling for different upload types
+    if (upload_type == "BLAST Output") {
+        dir <- paste0(OUT_PATH, "/", pin_id, "_blast")
+        if (blast_upload_data@df == "") {
+            stop("Error: Please upload a BLAST file.")
+        }
+        if (blast_upload_data@seqs == "" && !blast_ncbi_check) {
+            stop("Error: Please provide a file containing sequences or check the box to fetch sequences for NCBI accession numbers.")
+        }
+
+        if (tools::file_ext(blast_upload_data@df) == "tsv" || blast_upload_data@df == EX_BLASTOUTPUT) {
+            data <- read_tsv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+        } else {
+            data <- read_csv(blast_upload_data@df, col_names = web_blastp_hit_colnames)
+        }
+
+        if (nrow(data) > 5005) {
+            stop("Error: BLAST output submissions are limited to 5000 proteins at the moment. For analyses with more than 5000 proteins, please contact janani.ravi@cuanschutz.edu.")
+        }
+
+        system(paste0("mkdir ", dir), wait = TRUE)
+        blast_path <- paste0(dir, "/", pin_id, ".wblast.tsv")
+        write_tsv(data, blast_path, col_names = FALSE)
+
+        queries <- data$Query %>% unique()
+        writeLines(queries, paste0(dir, "/", "accs.txt"))
+
+        if (blast_ncbi_check) {
+            submit_blast(
+                dir = dir,
+                blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = TRUE,
+                job_code = pin_id,
+                submitter_email = notify_email,
+                advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist())
+            )
+        } else {
+            seqs <- read_file(blast_upload_data@seqs)
+            writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+            submit_blast(
+                dir = dir,
+                blast = paste0(dir, "/", pin_id, ".wblast.tsv"),
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = FALSE,
+                job_code = pin_id,
+                submitter_email = notify_email,
+                advanced_options = isolate(rvals_advanced_options |> reactiveValuesToList() |> unlist())
+            )
+        }
+
+    } else if (upload_type == "InterProScan Output") {
+        dir <- paste0(OUT_PATH, "/", pin_id, "_ipr")
+        path <- paste0(dir, "/", pin_id, "_ipr.tsv")
+        if (ipr_upload_data@df == "") {
+            stop("Error: Please upload an InterProScan file.")
+        }
+        if (ipr_upload_data@seqs == "" && !ipr_ncbi_check) {
+            stop("Error: Please provide a file containing sequences or check the box to fetch sequences using NCBI accession numbers.")
+        }
+
+        ipr <- read_tsv(ipr_upload_data@df, col_names = FALSE)
+        if (nrow(ipr) > 200) {
+            stop("Error: Only submissions with up to 200 proteins are accepted at this time. For analyses with more than 200 proteins, please contact janani.ravi@cuanschutz.edu.")
+        }
+
+        system(paste0("mkdir ", dir), wait = TRUE)
+        writeLines("AccNum\tSeqMD5Digest\tSLength\tAnalysis\tDB.ID\tSignDesc\tStartLoc\tStopLoc\tScore\tStatus\tRunDate\tIPRAcc\tIPRDesc\tGOTerms\tExtra", path)
+        write_tsv(ipr, path, col_names = FALSE, append = TRUE)
+
+        ncbi <- if_else(ipr_ncbi_check, TRUE, FALSE)
+        blast <- if_else(acc_homology_analysis, TRUE, FALSE)
+
+        if (ncbi) {
+            submit_ipr(
+                dir = dir,
+                ipr = path,
+                blast = acc_homology_analysis,
+                seqs = paste0(dir, "/", "seqs.fa"),
+                ncbi = TRUE,
+                DB = blast_db,
+                NHITS = blast_hits,
+                EVAL = blast_eval
+            )
+        } else {
+            seqs <- read_file(ipr_upload_data@seqs)
+            writeLines(seqs, paste0(dir, "/", "seqs.fa"))
+            submit_ipr(
+                dir = dir,
+                ncbi = FALSE,
+                ipr = path,
+                blast = acc_homology_analysis,
+                seqs = paste0(dir, "/", "seqs.fa"),
+                DB = blast_db,
+                NHITS = blast_hits,
+                EVAL = blast_eval
+            )
+        }
+
+    }
+
+    # Process results for the report
+    fetched <- processWrapperDir(dir, pinName = pinName, type = type)
+
+    # Assign fetched data to objects if the fetched list has the expected length
+    if (length(fetched) == 2) {
+        data <- fetched[[1]] # Assign the first element to 'data'
+        app_data <- fetched[[1]] # Assign the same to 'app_data'
+        query_data <- fetched[[2]] # Assign the second element to 'query_data'
+
+        r_nrow_initial <- nrow(fetched[[1]]@df) # Initialize row count
+    }
+
+    domarch_cols_value <- getDomArchCols(app_data, DASelect)
+
+    query_domarch_cols_value <- getQueryDomArchCols(query_data@df)
+
+    mainTable_value <- getDataTable(data)
+
+    queryDataTable_value <- getQueryDataTable(query_data, query_select)
+
+    fastaDataText_value <- getFastaData(query_data@fasta_path)
+
+    domainDataText_value <- getDomData(data)
+
+    msaDataText_value <- getMSAData(query_data@msa_path)
+
+    rs_IprGenes_value <- getIPRGenesVisualization(data,
app_data, + query_iprDatabases, + query_iprVisType) + + rs_network_layout_value <- getRSNetworkLayout(data, + app_data, + cutoff = 100, + layout = "nice") + + rs_data_table_value <- getDataTable(data) + + da_IprGenes_value <- getDomArchIPRGenesPlot(app_data, + query_iprDatabases, + query_iprVisType, + DASelect) + + query_heatmap_value <- getQueryHeatmap(query_data@df, + heatmap_select = "All", + heatmap_color = "blue") + + DA_Prot_value <- getDomArchProt(app_data, DASelect) + + DALinPlot_value <- getDomArchHeatmapPlot(DA_col = "DomArch.PfamA", + DACutoff, + DA_Prot_value, + DA_lin_color = "viridis", + app_data@ipr_path) + + DALin_TotalCounts_value <- getDomArchTotalCounts(DA_Prot_value, + DACutoff, + DA_col = "DomArch.PfamA", + app_data) + + DALinTable_value <- getDomArchLinearTable(DA_col = "DomArch.PfamA", + app_data@ipr_path, + DALin_TotalCounts_value) + + DANetwork_value <- getDomNetwork(DA_col = "DomArch.PfamA", + DACutoff, + DA_Prot_value, + networkLayout = "nice", + app_data@ipr_path) + + rep_accnums_value <- repAccNums(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) + + phylogeny_prot_value <- filterPhylogenyProteins(app_data, phylo_select) + + acc_to_name_value <- acc2Name(app_data) + + ####### Report Generation ######## + + tryCatch({ + tempReport <- file.path(dir, "report.Rmd") + file.copy(report_template_path, tempReport, overwrite = TRUE) + + # List of graphics to include in report + params <- list( + rs_interproscan_visualization = rs_IprGenes_value, + proximity_network = rs_network_layout_value, + sunburst = data@df, + data = rs_data_table_value, + queryDataTable = queryDataTable_value, + fastaDataText = fastaDataText_value, + heatmap = query_heatmap_value, + query_data = query_data, + query_domarch_cols = query_domarch_cols_value, + query_iprDatabases = query_iprDatabases, + query_iprVisType = query_iprVisType, + mainTable = mainTable_value, + DALinTable = DALinTable_value, + DALinPlot = DALinPlot_value, + DANetwork = DANetwork_value, + DA_Prot = DA_Prot_value, + domarch_cols = domarch_cols_value, + DA_Col = DA_Col, + DACutoff = DACutoff, + da_interproscan_visualization = da_IprGenes_value, + phylo_sunburst_levels = levels, + phylo_sunburst = phylogeny_prot_value, + tree_msa_tool = tree_msa_tool, + repAccNums = rep_accnums_value, + msa_rep_num = 3, + app_data = app_data, + PhyloSelect = PhyloSelect, + acc2Name = acc_to_name_value, + rval_phylo = rval_phylo, + msa_reduce_by = msa_reduce_by + ) + + # Render RMarkdown report + rmarkdown::render(tempReport, + output_file = output_file, + params = params, + envir = new.env(parent = globalenv())) + browseURL(output_file) + }, error = function(e) { + return(paste("Error in report generation:", e$message)) + }) + + return("Initialization, upload, and + report generation completed successfully.") +} + diff --git a/R/ipr2viz.R b/R/ipr2viz.R index e582ab09..51650761 100644 --- a/R/ipr2viz.R +++ b/R/ipr2viz.R @@ -79,7 +79,7 @@ themeGenes2 <- function() { #' n = 20, query = "specific_query_name") #' } getTopAccByLinDomArch <- function(infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", lin_col = "Lineage_short", n = 20, query) { @@ -94,14 +94,14 @@ getTopAccByLinDomArch <- function(infile_full, cln_domarch <- cln %>% select(domarch_cols) col_counts <- colSums(is.na(cln_domarch)) DA_sym <- sym(names(which.min(col_counts))) - showNotification(paste0("Selecting representatives by unique ", DA_sym, " and lineage combinations")) + # showNotification(paste0("Selecting representatives by unique ", DA_sym, " and 
lineage combinations")) ## Group by Lineage, DomArch and reverse sort by group counts grouped <- cln %>% group_by({{ DA_sym }}, {{ lin_sym }}) %>% arrange(desc(PcPositive)) %>% summarise(count = n(), AccNum = dplyr::first(AccNum)) %>% arrange(-count) %>% - filter({{ lin_sym }} != "" && {{ DA_sym }} != "") + filter({{ lin_sym }} != "" & {{ DA_sym }} != "") top_acc <- grouped$AccNum[1:n] top_acc <- na.omit(top_acc) return(top_acc) @@ -180,14 +180,14 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = MolEvolvR::iprscan_cols) ipr_out <- ipr_out %>% filter(.data$Name %in% accessions) analysis_cols <- paste0("DomArch.", analysis) - infile_full <- infile_full %>% select(.data$analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum) + infile_full <- infile_full %>% select(analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum) ## To filter by Analysis analysis <- paste(analysis, collapse = "|") ## @SAM: This can't be set in stone since the analysis may change! ## Getting top n accession numbers using getTopAccByLinDomArch() top_acc <- getTopAccByLinDomArch( infile_full = infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", ## @SAM, you could pick by the Analysis w/ max rows! lin_col = "Lineage_short", n = topn, query = query @@ -212,7 +212,7 @@ plotIPR2Viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(), analysis_labeler <- analyses %>% pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis) - lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv" + lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc @@ -360,7 +360,7 @@ plotIPR2VizWeb <- function(infile_ipr, ## @SAM, colnames, merges, everything neeeds to be done now based on the ## combined lookup table from "common_data" - lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv" + lookup_tbl_path <- system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE) lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = MolEvolvR::lookup_table_cols) ## Read IPR file and subset by Accessions diff --git a/R/networks_domarch.R b/R/networks_domarch.R index ae6fe8be..b171b3a8 100755 --- a/R/networks_domarch.R +++ b/R/networks_domarch.R @@ -24,20 +24,20 @@ #' A network of domains is returned based on shared domain architectures. #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. 
Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Character. Color to represent the queried domain in the +#' @param query_color Character. Color to represent the queried domain in the #' network. #' #' @importFrom dplyr across add_row all_of distinct filter mutate pull select -#' @importFrom igraph delete_vertices graph_from_edgelist vertex +#' @importFrom igraph delete_vertices graph_from_edgelist vertex V #' @importFrom purrr map #' @importFrom rlang sym #' @importFrom shiny showNotification @@ -174,21 +174,21 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c if ("X" %in% V(g)$name) { g <- delete_vertices(g, "X") } - V(g)$size <- as.numeric(wc[V(g)$name]) + igraph::V(g)$size <- as.numeric(wc[igraph::V(g)$name]) - V(g)$size <- (V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 20 + 10 # scaled by degree + igraph::V(g)$size <- (igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 20 + 10 # scaled by degree # setting vertex color by size - V(g)$color <- rainbow(5, alpha = .5)[round((V(g)$size - min(V(g)$size)) / (max(V(g)$size) - min(V(g)$size)) * 4 + 1)] - V(g)$frame.color <- V(g)$color + igraph::V(g)$color <- rainbow(5, alpha = .5)[round((igraph::V(g)$size - min(igraph::V(g)$size)) / (max(igraph::V(g)$size) - min(igraph::V(g)$size)) * 4 + 1)] + igraph::V(g)$frame.color <- igraph::V(g)$color # scaling edge width - E(g)$width <- e.sz - E(g)$width <- ifelse(log(E(g)$width) == 0, .3, log(E(g)$width)) + igraph::E(g)$width <- e.sz + igraph::E(g)$width <- ifelse(log(igraph::E(g)$width) == 0, .3, log(igraph::E(g)$width)) # coloring edges by width ew <- c(2.7, 4.5) - E(g)$color <- sapply( - E(g)$width, - function(x) if (x >= ew[1] && x <= ew[2]) E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) E(g)$color <- adjustcolor("maroon", alpha.f = .5) else E(g)$color <- "gray55" + igraph::E(g)$color <- sapply( + igraph::E(g)$width, + function(x) if (x >= ew[1] && x <= ew[2]) igraph::E(g)$color <- adjustcolor("cadetblue", alpha.f = .7) else if (x > ew[2]) igraph::E(g)$color <- adjustcolor("maroon", alpha.f = .5) else igraph::E(g)$color <- "gray55" ) vis_g <- visIgraph(g, type = "full") } @@ -211,7 +211,7 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c visOptions(highlightNearest = TRUE) }, error = function(e) { - showNotification(toString(e)) + # showNotification(toString(e)) vis_g <- "error" }, finally = { @@ -231,18 +231,18 @@ createDomainNetwork <- function(prot, column = "DomArch", domains_of_interest, c #' #' #' @param prot A data frame that contains the column 'DomArch'. -#' @param column Name of column containing Domain architecture from which nodes +#' @param column Name of column containing Domain architecture from which nodes #' and edges are generated. #' @param domains_of_interest Character vector specifying the domains of interest. -#' @param cutoff Integer. Only use domains that occur at or above the cutoff for +#' @param cutoff Integer. Only use domains that occur at or above the cutoff for #' total counts if cutoff_type is "Total Count". -#' Only use domains that appear in cutoff or greater lineages if cutoff_type is +#' Only use domains that appear in cutoff or greater lineages if cutoff_type is #' Lineage. #' @param layout Character. Layout type to be used for the network. 
Options are: #' \itemize{\item "grid" \item "circle" \item "random" \item "auto"} -#' @param query_color Color that the nodes of the domains in the +#' @param query_color Color that the nodes of the domains in the #' domains_of_interest vector are colored -#' @param partner_color Color that the nodes that are not part of the +#' @param partner_color Color that the nodes that are not part of the #' domains_of_interest vector are colored #' @param border_color Color for the borders of the nodes. #' @param IsDirected Is the network directed? Set to false to eliminate arrows diff --git a/R/pre-msa-tree.R b/R/pre-msa-tree.R index 75cc375d..6e2e8fd7 100644 --- a/R/pre-msa-tree.R +++ b/R/pre-msa-tree.R @@ -676,7 +676,7 @@ acc2FA <- function(accessions, outpath, plan = "sequential") { id = accessions_partitioned[[x]], db = "protein", rettype = "fasta", - api_key = Sys.getenv("ENTREZ_API_KEY") + # api_key = Sys.getenv("ENTREZ_API_KEY") ) ) }) @@ -809,9 +809,9 @@ alignFasta <- function(fasta_file, tool = "Muscle", outpath = NULL) { abort("Error: The FASTA file does not exist: ", fasta_file) } - if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { - abort("Error: The specified file is not a valid FASTA file: ", fasta_file) - } + # if (file_ext(fasta_file) != "fasta" && file_ext(fasta_file) != "fa") { + # abort("Error: The specified file is not a valid FASTA file: ", fasta_file) + # } fasta <- readAAStringSet(fasta_file) aligned <- switch(tool, @@ -869,11 +869,11 @@ writeMSA_AA2FA <- function(alignment, outpath) { abort("Error: The output directory does not exist: ", outdir) } - l <- length(rownames(alignment)) + l <- length(names(unmasked((alignment)))) fasta <- "" for (i in 1:l) { - fasta <- paste0(fasta, paste(">", rownames(alignment)[i]), "\n") + fasta <- paste0(fasta, paste(">", names(unmasked((alignment)))[i]), "\n") seq <- toString(unmasked(alignment)[[i]]) fasta <- paste0(fasta, seq, "\n") } @@ -955,4 +955,4 @@ getAccNumFromFA <- function(fasta_file) { # cfile <- read_delim("data/alignments/pspc.gismo.aln", delim=" ") # cfile <- as.data.frame(map(cfile,function(x) gsub("\\s+", "",x))) # colnames(cfile) <- c("AccNum", "Alignment") -# } +# } \ No newline at end of file diff --git a/R/run_pipeline.R b/R/run_pipeline.R new file mode 100644 index 00000000..fe1fa4c9 --- /dev/null +++ b/R/run_pipeline.R @@ -0,0 +1,1343 @@ +# Author(s): Awa Synthia +# Last modified: 2024 + +# import libs +#' @import readr data.table httr rentrez iprscanr + +getSeqs <- function(sequences, + acc_file_path = "accs.txt", + dir_path = "~", + separate = TRUE) { + seqs <- readAAStringSet(sequences) + cln_names <- c() + for (accnum in names(seqs)) { + if (grepl("\\|", accnum)) { + accnum_cln <- strsplit(accnum, "\\|")[[1]][2] + accnum_cln <- strsplit(accnum_cln, " ")[[1]] + } else { + accnum_cln <- strsplit(accnum, " ")[[1]][1] + } + cln_names <- append(cln_names, accnum_cln) + write(accnum_cln, file = acc_file_path, append = TRUE) + if (separate) { + write(paste0(dir_path, "/", accnum_cln, ".faa"), + file = "input.txt", append = TRUE) + write(paste0(">", accnum_cln), + file = paste0(accnum_cln, ".faa"), append = TRUE) + write(toString(seqs[accnum]), + file = paste0(accnum_cln, ".faa"), append = TRUE) + } + } + names(seqs) <- cln_names + writeXStringSet(seqs, sequences, format = "fasta") + return(length(seqs)) +} +runFull <- function( + dir = "/data/scratch", + DB = Sys.getenv("BLAST_DB", unset = "refseq"), + NHITS = Sys.getenv("BLAST_HITS", unset = 100), + EVAL = Sys.getenv("BLAST_EVALUE", unset = 0.00001), 
+ sequences = "~/test.fa", + phylo = "FALSE", + by_domain = "FALSE", + domain_starting = "~/domain_seqs.fa", + type = "full", + job_code=NULL, + submitter_email=NULL, + advanced_options=NULL, + get_slurm_mails=FALSE, + APPLICATION +) { + # Set working directory + setwd(dir) + + advanced_options_names <- names(advanced_options[advanced_options == TRUE]) + + # Write job submission params to file + job_args <- list( + submission_type = type, + database = ifelse(phylo == FALSE, DB, NA), + nhits = ifelse(phylo == FALSE, NHITS, NA), + evalue = ifelse(phylo == FALSE, EVAL, NA), + submitter_email = submitter_email, + advanced_options = advanced_options_names, + job_code = job_code + ) + yml <- yaml::as.yaml(job_args) + write(yml, "job_args.yml") + + # Create a log file + write("START_DT\tSTOP_DT\tquery\tdblast\tacc2info\tdblast_cleanup\tacc2fa + \tblast_clust\tclust2table\tiprscan\tipr2lineage\tipr2DomArch\tduration", + "logfile.tsv") + + # Process sequences (local handling) + if (phylo == "FALSE") { + # Split the sequences if needed, store them locally + num_seqs <- getSeqs(sequences, dir_path = dir, separate = TRUE) + + fasta <- Biostrings::readAAStringSet(sequences) + headers_original <- names(fasta) + headers_accnum <- names(fasta) |> purrr::map_chr(function(x) extractAccNum(x)) + + # Execute BLAST locally (instead of submitting jobs to the cluster) + for (i in 1:num_seqs) { + # Assume each sequence is saved separately + input_file <- paste0(dir, "/", headers_accnum[i], ".faa") + # output_file <- paste0(dir, "/blast_output_", i, ".txt") + + # Construct the local BLAST command (make sure 'blastn' is available locally) + runMolevolvrPipeline(input_file, DB, NHITS, EVAL, is_query = F, type, i, APPLICATION = APPLICATION) + + #cmd <- sprintf( + # "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %f -remote", + # input_file, DB, output_file, NHITS, EVAL + #) + + # Execute BLAST locally + #system(cmd) + + cat(sprintf("BLAST for sequence %d completed.\n", i), file=stderr()) + } + } else { + # Handle phylogenetic analysis if needed + cat("Phylogenetic analysis is not supported in the local version yet.\n") + } + + # Simulate query run locally + runMolevolvrPipeline(sequences, DB, NHITS, EVAL, is_query = TRUE, type, APPLICATION = APPLICATION) + # cmd_query <- sprintf( + # "deltablast -query %s -db %s -out %s_query.txt -num_alignments %d -evalue %f -remote", + # sequences, DB, paste0(dir, "/query_output"), NHITS, EVAL + # ) + # system(cmd_query) + + cat("Query analysis completed.\n") + + # Status update + num_runs <- num_seqs + 1 + write(paste0("0/", num_runs, " analyses completed"), "status.txt") + + cat("All analyses completed locally.\n") +} + +# Define the main pipeline function +runMolevolvrPipeline <- function(input_paths, db, nhits, eval, + is_query, type, i, APPLICATION) { + + # Start time + start <- Sys.time() + OUTPATH <- getwd() # Set output directory to the current working directory + + # If IS_QUERY is True, handle query data + if (is_query == TRUE) { + + FILE <- input_paths + PREFIX <- "query_data" + OUTDIR <- file.path(OUTPATH, PREFIX) + dir.create(OUTDIR, showWarnings = FALSE) # Create output directory + + all_accnums_file <- file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")) + file.copy(FILE, all_accnums_file) + + # Parse accession numbers + accnums_file <- file.path(OUTPATH, "query-fasta_header-map.tsv") + parsed_accnums_file <- file.path(OUTDIR, "parsed_accnums.txt") + + if (file.exists(accnums_file)) { + parsed_accnums <- read.table(accnums_file, + header = FALSE, sep 
= "\t", + stringsAsFactors = FALSE) + writeLines(parsed_accnums$V2[-1], parsed_accnums_file) # Skip header + } + + # Copy starting_accs.txt if exists, or fallback to accs.txt + if (file.exists("starting_accs.txt")) { + file.copy("starting_accs.txt", + file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt"))) + } else { + file.copy("accs.txt", + file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt"))) + file.copy(parsed_accnums_file, + file.path(OUTDIR, paste0(PREFIX, ".parsed_accnums.txt"))) + } + + # setwd(OUTDIR) + + # Run acc2info + runAcc2Info(parsed_accnums_file, PREFIX, OUTDIR) + + replaceAccNums(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + file.path(OUTPATH, "query-fasta_header-map.tsv"), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv"))) + + file.copy(file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) + + } else { + # Handle homolog data + FILE <- readLines(input_paths)[1] # Assume single file for local usage + F_value <- basename(FILE) + PREFIX <- sub("\\.faa$", "", basename(FILE)) + PREFIX <- gsub(">", "", PREFIX) # Extract prefix from file name + OUTDIR <- file.path(OUTPATH, paste0(PREFIX)) + dir.create(OUTDIR, showWarnings = FALSE) # Create output directory + # setwd(OUTDIR) + + # Run DELTABLAST + runDELTABLAST(input_paths, PREFIX, OUTDIR, db, nhits, eval) + + # Run ACC2FA + convertAccNum2Fasta(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + PREFIX, OUTDIR) + + # Run ACC2INFO + runAcc2Info(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.txt")), + PREFIX, OUTDIR) + + # Clean up BLAST results + cleanupBlast(file.path(OUTDIR, paste0(PREFIX, ".dblast.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), + PREFIX, F) + + } + + # Sys.sleep(30) + + # Run BLASTCLUST + runCDHIT(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + PREFIX, OUTDIR ) + + # Convert clusters to table + clust2Table(file.path(OUTDIR, paste0(PREFIX, ".bclust.L60S80.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".blast.cln.tsv"))) + + # Run INTERPROSCAN + runIPRScan2(file.path(OUTDIR, paste0(PREFIX, ".all_accnums.fa")), + prefix = PREFIX, outdir = OUTDIR, appl = APPLICATION) + + # Run IPR2LIN + ipr2Linear(file.path(OUTDIR, paste0(PREFIX, ".iprscan.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".acc2info.tsv")), PREFIX) + + + # Optionally run ipr2DomArch if IS_QUERY is true + if (is_query == TRUE) { + ## perform ipr2DomArch on iprscan results + da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + PREFIX, "NA") + + ## if blast results are provided, call appendIPR + if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | + is.na(PREFIX)) { + print("No blast results provided, moving on.") + } else { + file.copy(file.path(OUTDIR, paste0(PREFIX, ".ipr_domarch.tsv")), + file.path(OUTDIR, paste0(PREFIX, ".full_analysis.tsv"))) + } + } else { + # perform ipr2DomArch on iprscan results + da <- ipr2DomArch(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv")), + PREFIX) + + ## if blast results are provided, call appendIPR + if (is.null(file.path(OUTDIR, paste0(PREFIX, ".iprscan_cln.tsv"))) | + is.na(PREFIX)) { + print("No blast results provided, moving on.") + } else { + appendIPR(ipr_da = da, + blast = file.path(OUTDIR, paste0(PREFIX, ".cln.clust.tsv")), + prefix = PREFIX) + } + } + + # Copy the input fasta file to the output directory + # file.copy(input_paths, OUTDIR) + + # Total run time + dur <- difftime(Sys.time(), start, units = "secs") + cat("\nTotal run time:", dur, "seconds\n") + + # Log the run times + 
logfile_path <- file.path(OUTPATH, "logfile.tsv") + log_data <- data.frame( + START_DT = format(start, "%d/%m/%Y-%H:%M:%S"), + STOP_DT = format(Sys.time(), "%d/%m/%Y-%H:%M:%S"), + PREFIX = PREFIX, + dur = dur + ) + + write.table(log_data, file = logfile_path, sep = "\t", + row.names = FALSE, col.names = FALSE, + append = TRUE, quote = FALSE) +} + +# Define the acc2info function +acc2info <- function(infile, prefix, outdir) { + # Ensure output directory exists + if (!dir.exists(outdir)) { + dir.create(outdir, recursive = TRUE) + } + + outfile <- file.path(outdir, paste0(prefix, ".acc2info.tsv")) + + # Print column names to the output file + # Print column names to outfile + + # Read the input file of accession numbers + acc_nums <- readLines(infile) + + # Random sleep to avoid overloading the server + # Sys.sleep(sample(1:10, 1)) + + # EPOST to get WebEnv and QueryKey + epost_response <- entrez_post(db = "protein", id = acc_nums) + webenv <- epost_response$WebEnv + query_key <- epost_response$QueryKey + # EFETCH to get the document summaries + docsums <- entrez_summary(db = "protein", web_history = epost_response) + + # Check if any atomic values exist in docsums + any_atomic <- FALSE + # Check if any value in the current docsum is atomic + for (docsum in docsums) { + # Check if any value in the current docsum is atomic + if (is.atomic(docsum)) { + any_atomic <- TRUE + break # Exit loop early if an atomic value is found + } + } + + if (any_atomic) { + parsed_data <- data.frame( + AccNum = docsums$oslt$value, + AccNum.noV = docsums$caption, + FullAccNum = docsums$extra, + Description = docsums$title, + Length = docsums$slen, + TaxID = docsums$taxid, + Species = docsums$organism, + SourceDB = ifelse(is.null(docsums$sourcedb), NA, + docsums$sourcedb), + Completeness = ifelse(is.null(docsums$completeness), NA, + docsums$completeness), + stringsAsFactors = FALSE # Avoid factors in data frame + ) + } else { + # If no atomic values exist, use lapply to parse each docsum + parsed_data <- do.call(rbind, lapply(docsums, function(docsum) { + tryCatch({ + data.frame( + AccNum = docsum$oslt$value, + AccNum.noV = docsum$caption, + FullAccNum = docsum$extra, + Description = docsum$title, + Length = docsum$slen, + TaxID = docsum$taxid, + Species = docsum$organism, + SourceDB = ifelse(is.null(docsum$sourcedb), NA, + docsum$sourcedb), + Completeness = ifelse(is.null(docsum$completeness), NA, + docsum$completeness), + stringsAsFactors = FALSE # Avoid factors in data frame + ) + }, error = function(e) { + # Return a data frame filled with NA values in case of an error + return(data.frame( + AccNum = NA, + AccNum.noV = NA, + FullAccNum = NA, + Description = NA, + Length = NA, + TaxID = NA, + Species = NA, + SourceDB = NA, + Completeness = NA, + stringsAsFactors = FALSE + )) + }) + })) + } + + # Parse the fetched data + # Check if we got any data + if (nrow(parsed_data) == 0) { + cat("No data found for accession numbers in NCBI. 
Trying UniProt...\n")
+
+        # Split the accession list into chunks of 100 for the EBI API
+        split_files <- split(acc_nums, ceiling(seq_along(acc_nums) / 100))
+
+        for (i in seq_along(split_files)) {
+            accnum <- paste(split_files[[i]], collapse = ",") # Join with comma
+            url <- paste0("https://www.ebi.ac.uk/proteins/api/proteins?accession=",
+                          accnum)
+
+            response <- GET(url, accept("application/xml"))
+
+            if (status_code(response) == 200) {
+                xml_content <- content(response, as = "text")
+
+                # Parse the XML response
+                parsed_uniprot <- xml2::read_xml(xml_content)
+
+                # Extract required elements (customize based on XML structure)
+                entries <- xml2::xml_find_all(parsed_uniprot, "//entry")
+
+                for (entry in entries) {
+                    accession <- xml2::xml_text(xml2::xml_find_first(entry,
+                                                                     ".//accession"))
+                    full_name <- xml2::xml_text(xml2::xml_find_first(entry,
+                                                                     ".//fullName"))
+                    length_seq <- xml2::xml_attr(xml2::xml_find_first(entry,
+                                                                      ".//sequence"),
+                                                 "length")
+                    db_reference <- xml2::xml_attr(xml2::xml_find_first(entry,
+                                                                        ".//dbReference"),
+                                                   "id")
+                    dataset <- xml2::xml_attr(entry, "dataset")
+                    name <- xml2::xml_text(xml2::xml_find_first(entry, ".//name"))
+
+                    # Create a row of data
+                    new_row <- data.frame(AccNum = accession,
+                                          AccNum.noV = gsub("\\|", "", accession),
+                                          FullAccNum = accession,
+                                          Description = full_name,
+                                          Length = as.integer(length_seq),
+                                          TaxID = db_reference,
+                                          Species = name,
+                                          SourceDB = dataset,
+                                          Completeness = "NA", # Adjust as needed
+                                          stringsAsFactors = FALSE)
+
+                    # Append to the outfile; write the header only once
+                    # (col.names = TRUE on every append would repeat it per row)
+                    write.table(new_row, file = outfile, sep = "\t",
+                                col.names = !file.exists(outfile),
+                                row.names = FALSE, quote = FALSE, append = TRUE)
+                }
+            }
+        }
+    } else {
+        # Append NCBI data to outfile
+        write.table(parsed_data, file = outfile, sep = "\t", col.names = TRUE,
+                    row.names = FALSE, quote = FALSE, append = TRUE)
+    }
+
+    cat("Data saved to:", outfile, "\n")
+}
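+
+# Example (not run): a minimal sketch of looking up metadata for a couple of
+# accessions; the accession values and file/dir names here are illustrative.
+# writeLines(c("WP_000000001.1", "WP_000000002.1"), "accs.txt")
+# acc2info("accs.txt", prefix = "query_data", outdir = "out")
+# out/query_data.acc2info.tsv then holds one row per accession with AccNum,
+# Description, Length, TaxID, Species, SourceDB and Completeness columns.
+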
+acc2InfoPhylo <- function(infile, outdir) {
+    # Ensure output directory exists
+    if (!dir.exists(outdir)) {
+        dir.create(outdir, recursive = TRUE)
+    }
+
+    outfile <- file.path(outdir, "acc2info.tsv")
+
+    # Read the input file of accession numbers (this read was missing,
+    # leaving `acc_nums` undefined in the loop below)
+    acc_nums <- readLines(infile)
+
+    # Create a data frame to store results
+    results <- data.frame(
+        Caption = character(),
+        Extra = character(),
+        Title = character(),
+        Length = character(),
+        TaxID = character(),
+        Organism = character(),
+        SourceDB = character(),
+        Completeness = character(),
+        stringsAsFactors = FALSE
+    )
+
+    # Process accession numbers one at a time to avoid overloading the server
+    for (acc in acc_nums) {
+        # Fetch the version 2.0 document summary for this accession
+        # (esummary, not efetch, returns the <DocumentSummary> nodes parsed below)
+        response <- GET(sprintf("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=protein&version=2.0&id=%s",
+                                acc))
+
+        # Check if the request was successful
+        if (response$status_code == 200) {
+            # Parse the XML response
+            parsed_xml <- read_xml(content(response, "text"))
+
+            # Extract relevant information
+            doc_summary <- xml_find_all(parsed_xml, ".//DocumentSummary")
+
+            for (summary in doc_summary) {
+                results <- rbind(results, data.frame(
+                    Caption = xml_text(xml_find_first(summary, ".//Caption")),
+                    Extra = xml_text(xml_find_first(summary, ".//Extra")),
+                    Title = xml_text(xml_find_first(summary, ".//Title")),
+                    Length = xml_text(xml_find_first(summary, ".//Slen")),
+                    TaxID = xml_text(xml_find_first(summary, ".//TaxId")),
+                    Organism = xml_text(xml_find_first(summary, ".//Organism")),
+                    SourceDB = xml_text(xml_find_first(summary, ".//SourceDb")),
+                    Completeness = xml_text(xml_find_first(summary, ".//Completeness")),
+                    stringsAsFactors = FALSE
+                ))
+            }
+        } else {
+            warning(sprintf("Failed to fetch data for accession %s: %s",
+                            acc, response$status_code))
+        }
+    }
+
+    # Write results to the output file
+    write.table(results, file = outfile, sep = "\t", row.names = FALSE,
+                quote = FALSE)
+}
+
+# Main function to run based on the prefix
+runAcc2Info <- function(infile, prefix, outdir) {
+    if (prefix == "NA") {
+        acc2InfoPhylo(infile, outdir)
+    } else {
+        acc2info(infile, prefix, outdir)
+    }
+}
+
+subsAccnum4cc2Info <- function(df_acc2info, df_header_map) {
+    df_result <- df_header_map |>
+        # set column name in header map to match accnum col in acc2info
+        dplyr::rename(AccNum = header_accnum) |>
+        # join onto header map
+        dplyr::left_join(df_acc2info, by = "AccNum") |>
+        # deselect accnum
+        dplyr::select(-AccNum) |>
+        # set the accnum col to the cleaned form
+        dplyr::rename(AccNum = header_clean) |>
+        # rm excess columns from header map file
+        dplyr::select(-header_original)
+    return(df_result)
+}
+
+replaceAccNums <- function(path_acc2info,
+                           path_query_header_map, path_out) {
+
+    # Read the input files
+    df_acc2info <- read_tsv(path_acc2info)
+    df_query_header_map <- read_tsv(path_query_header_map)
+
+    # Substitute accession numbers
+    df_acc2info_substituted <- subsAccnum4cc2Info(df_acc2info,
+                                                  df_query_header_map)
+
+    # Print the substituted dataframe
+    print("### df_acc2info_substituted")
+    print(df_acc2info_substituted)
+
+    # Write the substituted dataframe to the output file
+    write_tsv(df_acc2info_substituted, file = path_out, col_names = TRUE)
+}
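+
+# Example (not run): a minimal DELTABLAST invocation via the wrapper below.
+# Paths are illustrative; the BLAST+ 'deltablast' binary must be on the PATH,
+# and '-remote' runs the search at NCBI, so no local database is needed.
+# runDELTABLAST("out/WP_000000001.faa", prefix = "WP_000000001",
+#               outdir = "out", db = "refseq_protein",
+#               nhits = 100, evalue = 1e-5)
+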
+runDELTABLAST <- function(infile, prefix, outdir,
+                          db = "refseq_protein",
+                          nhits = 5000, evalue = 1e-5,
+                          threads = 10) {
+
+    # Prepare output file path
+    # outfile <- file.path(outdir, paste0(prefix, ".dblast.tsv"))
+
+    outfile <- paste0(outdir, "/", prefix, ".dblast.tsv")
+
+    # Print I/O messages
+    cat("\nNow processing:", infile, "\n")
+    cat("Running against:", db, "\n")
+    cat("E-value: ≤", evalue, "\n")
+    cat("Top", nhits, "hits/alignments\n")
+    cat("Output filepath:", outfile, "\n")
+
+    # Designating database based on user input; 'refseq' is accepted as an
+    # alias so the runFull() default ("refseq") maps to refseq_protein
+    dblast_db <- switch(db,
+                        nr = "nr",
+                        refseq = "refseq_protein",
+                        refseq_protein = "refseq_protein",
+                        stop("Invalid database specified.
+                             Choose either 'nr' or 'refseq'."))
+
+    # Core command for running DELTABLAST
+    command <- sprintf(
+        "deltablast -query %s -db %s -out %s -num_alignments %d -evalue %s -outfmt '6 qacc sacc sseqid sallseqid stitle sscinames staxids pident length mismatch gapopen qstart qend qlen sstart send slen evalue bitscore ppos' -remote",
+        infile, dblast_db, outfile, nhits, evalue
+    )
+    # Run the DELTABLAST command
+    system(command, intern = TRUE)
+
+    cat("DELTABLAST completed.\n")
+}
+
+# This script converts AccNum to Fasta using NCBI's EDirect or EBI's API
+convertAccNum2Fasta <- function(infile, prefix, outdir) {
+
+    # Create the output file path
+    outfile <- file.path(outdir, paste0(prefix, ".all_accnums.fa"))
+
+    # Print status
+    cat("\n####################\n")
+    cat("BEGIN EDIRECT SEARCH\n")
+    cat("####################\n")
+    cat("Processing input file:", infile, "\n")
+
+    # Create temporary files based on the number of columns in input file
+    temp_acc_file <- file.path(outdir, paste0(prefix, ".all_accnums.txt"))
+    cols <- length(strsplit(readLines(infile, n = 1), "\t")[[1]])
+
+    if (cols > 1) {
+        # Extract unique homolog accession numbers from the 2nd column
+        cat("Creating temp file with 2nd column unique accessions...\n")
+        system(sprintf("awk -F '\\t' '{ print $2 }' %s | sort -u > %s",
+                       infile, temp_acc_file))
+    } else {
+        # Extract unique homolog accession numbers from the 1st column
+        cat("Creating temp file with 1st column unique accessions...\n")
+        system(sprintf("awk -F '\\t' '{ print $1 }' %s | sort -u > %s",
+                       infile, temp_acc_file))
+    }
+
+    # Split accessions into chunks of 1000
+    system(sprintf("split -l 1000 -e %s %s/acc", temp_acc_file, outdir))
+
+    # Fetch FASTA sequences
+    cat("\nObtaining FASTA files\n")
+    acc_files <- list.files(path = outdir, pattern = "^acc", full.names = TRUE)
+
+    for (x in acc_files) {
+        # Accession numbers in a comma-separated format
+        accnum <- paste(readLines(x), collapse = ",")
+        fasta_sequence <- entrez_fetch(db = "protein", id = accnum,
+                                       rettype = "fasta")
+        # Append each batch; writeLines() overwrote the file on every pass
+        cat(fasta_sequence, file = outfile, append = TRUE)
+    }
+
+    # Check if any sequences were retrieved
+    num_seqs <- as.numeric(system(sprintf("grep '>' %s | wc -l", outfile),
+                                  intern = TRUE))
+
+    # If no sequences found, try fetching from EBI API
+    if (num_seqs < 1) {
+        cat("\nNo sequences retrieved. 
Trying EBI API...\n") + for (x in acc_files) { + accnum <- paste(readLines(x), collapse = ",") + system(sprintf( + "curl -X GET --header 'Accept:text/x-fasta' 'https://www.ebi.ac.uk/proteins/api/proteins?accession=%s' >> %s", + accnum, outfile + )) + + } + } + + # Remove temporary files + cat("\nRemoving temporary files\n") + system(sprintf("rm %s/acc*", outdir)) + + # Print end message + cat("#####################\n") + cat("END OF EDIRECT SEARCH\n") + cat("#####################\n") +} + +cleanupBlast <- function(infile_blast, acc2info, prefix, wblast = F) { + + outdir <- dirname(infile_blast) + # Load and clean acc2info file + acc2info_out <- fread(input = acc2info, sep = "\t", + header = TRUE, fill = TRUE) %>% + mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>% + mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum)) + + # If using web-blast output + if (wblast == TRUE) { + blast_out <- fread(input = infile_blast, sep = "\t", header = FALSE, + col.names = web_blastp_hit_colnames, fill = TRUE) + cleanedup_blast <- blast_out %>% + mutate(AccNum = gsub("\\|", "", AccNum)) %>% + mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>% + mutate(PcIdentity = round(as.double(PcIdentity), 2)) + + # Merge cleaned blast output with acc2info + cleanedup_blast <- merge(cleanedup_blast, acc2info_out, + by.x = "AccNum", + by.y = "FullAccNum", + all.x = TRUE) + names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species" + + # Additional calculations for PcPositive + cleanedup_blast <- cleanedup_blast %>% + mutate(PcPosOrig = as.numeric(PcPosOrig)) %>% + mutate(AlnLength = as.numeric(AlnLength)) %>% + mutate(PcPositive = PcPosOrig) + + } else if (wblast == FALSE) { + # If using classic-blast output + blast_out <- read_tsv(file = infile_blast, col_names = cl_blast_colnames) + cleanedup_blast <- blast_out %>% + mutate(AccNum = gsub("\\|", "", AccNum)) %>% + mutate(AccNum = gsub(".*[a-z]", "", AccNum)) %>% + mutate(Species = gsub(";.*$", "", Species)) %>% + mutate(PcIdentity = round(PcIdentity, 2)) %>% + mutate(PcPositive = round((PcPosOrig * AlnLength / QLength), digits = 2)) + + # Merge cleaned blast output with acc2info + cleanedup_blast <- merge(cleanedup_blast, acc2info_out, + by.x = "AccNum", + by.y = "FullAccNum", + all.x = TRUE) %>% + select(-Species.x, -TaxID.x) + names(cleanedup_blast)[names(cleanedup_blast) == "Species.y"] <- "Species" + } + + # TaxID to lineage mapping + cleanedup_blast$TaxID <- as.integer(cleanedup_blast$TaxID) + lineage_map <- fread( + system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE), + header = TRUE, fill = TRUE, colClasses = lineage_map_cols) + + # Merge with lineage map and clean up columns + mergedLins <- merge(cleanedup_blast, lineage_map, by = "TaxID", + all.x = TRUE) %>% + mutate(Species = Species.y, Spp.blast = Species.x) %>% + select(any_of(cl_blast_postcln_cols)) + + # Add names and prepare the output + blast_names <- addName(mergedLins) + + # Create output file name + file_name <- file.path(outdir, paste0(prefix, ".blast.cln.tsv")) + + # Write cleaned data to file + write_tsv(blast_names, file_name, col_names = TRUE) +} + +# Function to run BLASTCLUST on given input +runCDHIT <- function(infile, suffix, outdir) { + + # Prepare output file path + outfile <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv")) + input_file <- file.path(outdir, paste0(suffix, ".bclust.L60S80.tsv.clstr")) + + + # Print process messages + cat("\n#####################################\n") + cat("## Now running BLASTCLUST on file(s):", 
infile, "\n") + cat("#####################################\n") + + # Cluster sequences w/ BLASTCLUST/CD-HIT + # blastclust_cmd <- paste("blastclust -i", infile, "-o", outfile, "-p T -L .6 -b T -S 80 -a 8") + cdhit_command <- sprintf( + "cd-hit -i %s -o %s -c 0.8 -aS 0.6 -T 8", + infile, outfile + ) + # clean_cdhit_format <- sprintf( + # "awk '/^>Cluster/ {if(NR>1)printf \"\\n\"; next} /WP_/ {start=index($0, \"WP_\"); if(start) {end=index(substr($0, start), \"...\"); if (end == 0) end=length($0); printf \"%%s \", substr($0, start, end-1)}}' %s > %s", + # input_file, + # outfile + # ) + + cat("\nPerforming BLASTCLUST analysis on", infile, "\n") + # system(blastclust_cmd) + + system(cdhit_command) + # system(clean_cdhit_format) + cleanCDHIT(input_file, outfile) + cat("Cleaning CDHIT results completed") +} + +# extract_sequences("input_file.tsv.clstr", "output_file.tsv") +cleanCDHIT <- function(input_file, output_file) { + # Read the input file line by line + lines <- readLines(input_file) + + # Initialize an empty vector to store extracted identifiers + extracted <- c() + + # Loop through lines and extract content between ">" and "..." + for (line in lines) { + # Only process lines containing valid sequences (skip cluster headers) + if (grepl(">", line) && grepl("\\.\\.\\.", line)) { + # Extract the sequence identifier between ">" and "..." + match <- regmatches(line, regexpr(">(.*?)\\.\\.\\.", line)) + if (length(match) > 0) { + sequence <- gsub("^>", "", match) # Remove the leading ">" + sequence <- gsub("\\.\\.\\.$", "", sequence) # Remove trailing dots + extracted <- c(extracted, sequence) + } + } + } + + # Write the extracted sequences to the output file + writeLines(extracted, output_file) + + cat("Sequences extracted and written to:", output_file, "\n") +} + +# Function to format blastclust output +clust2Table <- function(clust, blast) { + clust_out <- read_tsv(file = clust, col_names = F) + blast_out <- read_tsv(file = blast, col_names = T) + ## Count the number of accession numbers in a cluster + # Counting number of spaces between acc. no. 
+1
+    clust_out$NumAccs <- map_dbl(.x = clust_out$X1, function(x) {
+        (str_count(string = x, pattern = " ") + 1)
+    })
+    ## Create empty vectors to store information
+    empty_vec <- c("ClusterID")
+    empty_vec2 <- c("RowNum")
+    ## Adding empty vectors to dataframe
+    clust_out[, empty_vec] <- NA
+    clust_out[, empty_vec2] <- NA
+    ## Counting number of rows to add to the RowNum column -- used for creating cluster name
+    rows <- as.numeric(rownames(clust_out)) %>%
+        str_pad(width = 4, pad = 0)
+    ## Add row number info to dataframe
+    clust_out[, "RowNum"] <- rows
+    # Name columns
+    colnam <- list("AccNum", "NoAccs", "ClusterID", "NumRows")
+    ## Create cluster name from row number (num of clusters) and num of accessions
+    clust_out <- clust_out %>%
+        `colnames<-`(colnam) %>%
+        mutate(ClusterID = paste0(NumRows, ".", NoAccs))
+
+    # Store data frame column as vector
+    myvar <- c("AccNum", "ClusterID")
+    ## Add only the 2 columns wanted to a new variable
+    clusters <- clust_out[myvar]
+    # Initialize empty data frame
+    new_clust <- data.frame(ClusterID = character(0),
+                            AccNum = character(0),
+                            stringsAsFactors = F)
+
+    ## Assigning each sseqid to a ClusterID
+    # Iterate over dataframe
+    for (i in 1:nrow(clusters)) {
+        # Extract the cluster name for this row
+        cname <- clusters$ClusterID[i]
+        # Split accession numbers by a space
+        # vals is a vector of all accession numbers in a row
+        vals <- clusters$AccNum[i] %>%
+            strsplit(split = " ") %>%
+            unlist()
+
+        # Iterate over each element in vals
+        for (v in vals) {
+            # add each accession num w/ corresponding cluster ID to new df
+            new_clust[nrow(new_clust) + 1, ] <- c(cname, v)
+        }
+    }
+    # blast_out$AccNum <- gsub("^>", "", blast_out$AccNum)
+    # blast_out$AccNum <- paste0(">", blast_out$AccNum)
+    blast_clustnames <- merge(blast_out, new_clust, by = "AccNum")
+
+    for (i in 1:nrow(blast_clustnames)) {
+        blast_clustnames$ClusterID[i] <- str_c(blast_clustnames$ClusterID[i])
+    }
+
+    # One row per cluster: representative (first) protein plus its ClusterID
+    # (the second column was previously passed to as.data.frame() as row.names)
+    first_prot <- data.frame(AccNum = word(clust_out$AccNum),
+                             ClusterID = clust_out$ClusterID,
+                             stringsAsFactors = FALSE)
+
+    ## write the new file as a TSV file
+    newarg <- gsub(".bclust.L[0-9][0-9]S[0-9][0-9].tsv", "", clust)
+    # accnum + clusterID
+    write_tsv(new_clust, file = paste0(newarg, ".clustIDs"), append = F)
+    # first protein from every cluster
+    write_tsv(first_prot, file = paste0(newarg, ".clust_reps"),
+              col_names = F, append = F)
+    # cleaned up blast file + clusterIDs
+    write_tsv(blast_clustnames, file = paste0(newarg, ".cln.clust.tsv"),
+              col_names = T, append = F)
+}
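+
+# Example (not run): clustering the homolog FASTA at >=80% identity and
+# >=60% alignment coverage of the shorter sequence, then mapping each
+# accession to its ClusterID. File names follow the <prefix>.bclust.L60S80.tsv
+# convention used above and are illustrative.
+# runCDHIT("out/WP_000000001.all_accnums.fa", suffix = "WP_000000001",
+#          outdir = "out")
+# clust2Table("out/WP_000000001.bclust.L60S80.tsv",
+#             "out/WP_000000001.blast.cln.tsv")
+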
+# Function to run InterProScan
+runIPRScan2 <- function(query_file, prefix, outdir, appl) {
+
+    # Start InterProScan run
+    cat("\n######################\n")
+    cat("BEGIN INTERPROSCAN RUN\n")
+    cat("Input file:", query_file, "\n")
+    cat("######################\n")
+
+    # Output file path
+    outfile <- file.path(outdir, paste0(prefix, ".iprscan"))
+
+    # Process the input query file with InterProScan
+    cat("Now processing", query_file, "\n")
+
+    # Run InterProScan command
+    # Construct the command
+
+    # get the path to the interproscan.sh script from the environment
+    # variable INTERPROSCAN_CMD, or assume it's on the path if unspecified
+    # iprscan_cmd <- Sys.getenv("INTERPROSCAN_CMD", unset="interproscan.sh")
+    #
+    # command <- paste(
+    #     iprscan_cmd, "-i",
+    #     shQuote(query_file),
+    #     "-b", shQuote(outfile),
+    #     "-f TSV --cpu", Sys.getenv("INTERPROSCAN_CPUS", "4"),
+    #     "--appl Pfam,MobiDBlite,Phobius,Coils,SignalP_GRAM_POSITIVE,",
+    #     "SignalP_GRAM_NEGATIVE,Hamap,Gene3D,SignalP_EUK"
+    # )
+
+    # Run the command
+    # system(command)
+
+    # Submit to the InterProScan web service via {iprscanr} instead of
+    # running a local interproscan.sh
+    submit_ipr(path2seq = query_file,
+               outfolder = outdir,
+               appl = appl,
+               email = "jravilab.msu@gmail.com")
+
+    new_header <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", "DB.ID",
+                    "SignDesc", "StartLoc", "StopLoc", "Score",
+                    "Status", "RunDate", "IPRAcc", "IPRDesc")
+
+    temp_data <- read_tsv(file.path(outdir, "ipr_joined.tsv"))
+
+    # Drop the last two columns
+    temp_data <- temp_data[, -((ncol(temp_data) - 1):ncol(temp_data))]
+
+    # Strip the trailing 12 characters (the per-sequence file-name suffix)
+    # from file_name to recover the accession number
+    temp_data$file_name <- substr(temp_data$file_name, 1,
+                                  nchar(temp_data$file_name) - 12)
+
+    colnames(temp_data) <- new_header
+
+    write.table(temp_data, file.path(outdir, paste0(prefix, ".iprscan.tsv")),
+                sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
+
+    cat("##################\n")
+    cat("END OF IPRSCAN RUN\n")
+    cat("##################\n")
+}
+
+ipr2Linear <- function(ipr, acc2info, prefix) {
+    # read in iprscan results
+    # duplicate rows in iprscan file
+    ipr_in <- read_tsv(ipr, col_names = TRUE) %>%
+        mutate(DB.ID = gsub("G3DSA:", "", DB.ID))
+
+    if (prefix == "query_data") {
+        ipr_in$AccNum <- sub("(.*)([0-9])([0-9])$", "\\1.\\2_\\3", ipr_in$AccNum)
+    }
+
+    acc2info_out <- fread(input = acc2info, sep = "\t", header = T, fill = T) %>%
+        mutate(FullAccNum = gsub("\\|", "", FullAccNum)) %>%
+        mutate(FullAccNum = gsub(".*[a-z]", "", FullAccNum))
+
+    # merge ipr file with acc2info file
+    ipr_in <- ipr_in %>%
+        # remove version number and any other suffixes
+        mutate(AccNum.noV = gsub("\\.[0-9].*", "", AccNum))
+
+    ipr_tax <- left_join(ipr_in, acc2info_out, by = "AccNum")
+
+    # read in lineage map
+    lineage_map <- fread(
+        system.file("common_data", "lineage_lookup.txt", package = "MolEvolvR", mustWork = TRUE),
+        header = TRUE, fill = TRUE)
+
+    # merge ipr+info w/ lineage
+    # both tables have a species column, but only
+    # the lineage_map (y) species column is kept
+    ipr_tax <- ipr_tax %>%
+        mutate(TaxID = as.numeric(TaxID))
+
+    ipr_lin <- left_join(ipr_tax, lineage_map, by = "TaxID") |>
+        mutate(Species = Species.y) %>%
+        select(-Species.x, -Species.y)
+
+    # add lookup table to iprscan file
+    lookup_tbl <- fread(
+        system.file("common_data", "cln_lookup_tbl.tsv", package = "MolEvolvR", mustWork = TRUE),
+        sep = "\t", header = TRUE, fill = TRUE) %>%
+        distinct()
+    if ("AccNum.x" %in% names(ipr_lin)) {
+        ipr_lin <- ipr_lin %>%
+            # deselect the AccNum.y from the lineage table (y) and set
+            # the AccNum.x (x) from the ipr/acc2info tables to simply 'AccNum'
+            mutate(AccNum = AccNum.x) %>%
+            select(-AccNum.x, -AccNum.y)
+    }
+    # run add_name f(x) on ipr+lineage dataframe
+    ipr_lin <- ipr_lin %>%
+        addName() %>%
+        mutate(Name = gsub("^_", "", Name))
+
+    # add domarch info to iprscan + lineage df, only keep what's in x
+    ipr_cln <- left_join(ipr_lin, lookup_tbl, by = "DB.ID")
+
+    # populate empty description/short name columns
+    for (i in 1:nrow(ipr_cln)) {
+        if ((is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") &&
+            (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-")) {
+
+            ipr_cln$SignDesc[i] <- ipr_cln$IPRDesc[i]
+            # the lookup-table description is still called 'Description.y'
+            # at this point; it is only renamed to 'LookupTblDesc' below
+            if (!is.na(ipr_cln$Description.y[i])) {
+                ipr_cln$ShortName[i] <- ipr_cln$Description.y[i]
+            }
+        }
+        if (is.na(ipr_cln$ShortName[i]) || ipr_cln$ShortName[i] == "") {
+            ipr_cln$ShortName[i] <- ipr_cln$SignDesc[i]
+        }
+        if (is.na(ipr_cln$SignDesc[i]) || ipr_cln$SignDesc[i] == "-") {
+            ipr_cln$SignDesc[i] <- ipr_cln$ShortName[i]
+        }
+    }
+    # rename unclear/duplicated columns
+    names(ipr_cln)[names(ipr_cln) == "Description.x"] <- "ProteinName"
+    
names(ipr_cln)[names(ipr_cln) == "Description.y"] <- "LookupTblDesc" + # deselect the AccNum.noV from the lineage table (y) and set + # the AccNum.noV.x (x) from the ipr/acc2info tables to simply 'AccNum.noV' + ipr_cln <- ipr_cln |> dplyr::select(-AccNum.noV.y) |> + dplyr::mutate(AccNum.noV = AccNum.noV.x) |> + dplyr::select(-AccNum.noV.x) + # create label column to use in ipr2viz + ipr_cln <- ipr_cln %>% + mutate(Label = strtrim(ShortName, 30)) %>% + mutate(Label = gsub(", .*", "", Label)) %>% + mutate(Label = gsub("C-terminal region of a signal", + "C-term signal peptide", Label)) %>% + mutate(Label = gsub("N-terminal region of a signal", + "N-term signal peptide", Label)) %>% + mutate(Label = gsub("Twin arginine translocation \\(T", + "Tat signal profile", Label)) %>% + mutate(Label = gsub("GLUTATHIONE HYDROLASE PROENZYM", + "GLUTATHIONE HYDROLASE PROENZYME", Label)) %>% + mutate(Label = gsub("N-terminal nucleophile aminohy", + "Ntn hydrolases", Label)) %>% + mutate(Label = gsub("Region of a membrane-bound pro", + "cytoplasmic reg of mem-bound prot", + Label)) %>% + mutate(ShortName = gsub("Region of a membrane-bound protein predicted to be + outside the membrane, in the cytoplasm.", + "cytoplasmic reg of mem-bound prot", ShortName)) + outdir <- dirname(ipr) + # write results to file + write_tsv(ipr_cln, file.path(paste0(outdir, "/",paste0(prefix,".iprscan_cln.tsv")))) +} + +ipr2DomArch <- function(infile_ipr, prefix, + analysis = c( + "PfamA", "SMART", "Phobius", + "Gene3d", "TMHMM", "SignalP_GRAM_POSITIVE", + "SuperFamily", "MobiDBLite", "Panther", "Coils" + )) { + # read in cleaned up iprscan results + ipr_in <- read_tsv(infile_ipr, col_names = T, col_types = ipr_cln_cols) + + # split dataframe into unique proteins + x <- split(x = ipr_in, f = ipr_in$AccNum) + + # plan(strategy = "multicore", .skip = T) + + # within each data.table + domarch <- map(x, function(y) { + # domarch <- future_map(x, function(y) { + acc_row <- data.frame(AccNum = y$AccNum[1], stringsAsFactors = F) + DAs <- data.frame(matrix(nrow = 1, ncol = length(analysis))) + DA <- y %>% + group_by(Analysis) %>% + arrange(StartLoc) + i <- 1 + analysis_var <- c( + "Pfam", "SMART", "Phobius", + "Gene3D", "TMHMM", "SignalP_GRAM_POSITIVE", + "SUPERFAMILY", "MobiDBLite", "PANTHER", "Coils" + ) + for (a in analysis_var) { + a_da <- DA %>% filter(Analysis == a) + if (a == "SignalP_EUK" || a == "SignalP_GRAM_NEGATIVE" || + a == "SignalP_GRAM_POSITIVE") { + var_shortname <- "DB.ID" + } else { + var_shortname <- "ShortName" + } + var_shortname_sym <- sym(var_shortname) + a_da <- a_da %>% + ungroup() %>% + select({{ var_shortname_sym }}) %>% + filter(!is.na({{ var_shortname_sym }})) %>% + filter(!is.null({{ var_shortname_sym }})) %>% + pull(var_shortname) %>% + paste(collapse = "+") + DAs[1, i] <- a_da + i <- (i + 1) + } + + colnames(DAs) <- paste("DomArch", analysis, sep = ".") + return(cbind(acc_row, DAs)) + }) + + # select relevant rows from ipr input to add to domarch + ipr_select <- ipr_in %>% + select(Name, AccNum, Species, TaxID, Lineage, Lineage_long_na, + Lineage_long, Lineage_med, Lineage_short, ProteinName, + SourceDB, Completeness, AccNum.noV) %>% + distinct() + + # combine domarchs to one data frame, merge w/ acc2info + domarch2 <- do.call(rbind.data.frame, domarch) + + domarch_lins <- domarch2 %>% + merge(ipr_select, by = "AccNum", all.x = T) + + # save domarch_lins file + write_tsv(domarch_lins, + file = paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), + append = F, na = "NA" + ) + + # return domarch2 dataframe to 
append to blast results if given + return(domarch2) +} + +## function to add results from ipr2DomArch to blast results +appendIPR <- function(ipr_da, blast, prefix) { + # ! an 'AccNum' or 'AccNum.noV' column is required in blast table for joining ! + blast_out <- read_tsv(blast, col_names = T) + if ("AccNum.noV" %in% colnames(blast_out)) { + ipr_da <- read_tsv(paste0(prefix, "/", prefix, ".ipr_domarch.tsv"), col_names = T) + blast_ipr <- merge(blast_out, ipr_da, by = "AccNum.noV", all.x = T) + } else { + blast_ipr <- merge(blast_out, ipr_da, by = "AccNum", all.x = T) + } + + write_tsv(blast_ipr, file = paste0(prefix, "/", prefix, ".full_analysis.tsv"), na = "NA") +} + +# Web BLAST output +web_blast_colnames <- c("Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig", + "QSFrames") # specific to "blastx" + +# BLAST Command line +cl_blast_colnames <- c("Query", "SAccNum", "AccNum", + "SAllSeqID", "STitle", "Species", "TaxID", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig", + "PcPositive", "ClusterID") # post-cleanup + +# IPRSCAN (web+command-line) +ipr_colnames <- c("AccNum", "SeqMD5Digest", "SLength", "Analysis", + "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc") + +# RPSBLAST +rps_colnames <- c("AccNum", "DBID", "DBSeqID", + "PcIdentity", "PcPosOrig", # Ppos missing + "AlnLength", "Mismatch", + # Q here is Subject; S here is the matching domain; rename! + "SStart", "SEnd", "DStart", "DEnd", + "EValue", "BitScore", "TaxID") # TaxID missing (NA); remove? + +# IPG +ipg_colnames <- c("IPG.ID", "Source", "NucAccNum", + "NucStart", "NucStop", "Strand", + "AccNum", "ProtDesc", + "Species", "SppStrain", "AssemblyID") +# Final ColNames +combo_colnames <- c("Query", "AccNum", "Species", "TaxID", "Lineage", + "PcPositive", "ClusterID", + # "Leaf", # MISSING (useful for all dataviz) + # "AssemblyID", "GeneName", "ProtDesc", # MISSING NOW!?! 
+ "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D", + "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP") + + +## ############ ## +## COLUMN names ## +## ############ ## +## BLAST +############ +## Web-BLAST +############ +## Downloaded as HIT-TABLE csv +# BLASTP and related protein BLASTs +web_blastp_hit_colnames <- c( + "Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig" +) +# BLASTX +web_blastx_colnames <- c( + "Query", "AccNum", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "SStart", "SEnd", + "EValue", "BitScore", "PcPosOrig", + "QSFrames" +) # specific to "blastx" + +## Downloaded as Descriptions csv +# BLASTP and related protein BLASTs +web_blastp_desc_colnames <- c( + "Description", "Species", "CommonName", "TaxID", + "BitScore", "TotalScore", + "PcQCover", "EValue", "PcIdentity", + "SLen", "AccNum" +) + +##################### +## Command line BLAST +##################### + +# pre-cleanup +cl_blast_colnames <- c( + "Query", "SAccNum", "AccNum", + "SAllSeqID", "STitle", "Species", "TaxID", + "PcIdentity", "AlnLength", "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig" +) + +# post-cleanup +cl_blast_postcln_cols <- c( + "Query", "AccNum", + "STitle", "Species", "TaxID", "Lineage", "Lineage_long", + "Lineage_long_na", "Lineage_med", "Lineage_short", + "PcPositive", "PcIdentity", "AlnLength", + "SAccNum", "SAllSeqID", + "Mismatch", "GapOpen", + "QStart", "QEnd", "QLength", + "SStart", "SEnd", "SLength", + "EValue", "BitScore", "PcPosOrig", "QueryName" +) + +########## +## IPRSCAN +########## +# ipr_colnames_orig <- c("AccNum", "Seq_MD5_digest", "SeqLen", "Analysis", +# "DB_ID", "SignDesc", "StartLoc", "StopLoc", "Score", +# "Status", "RunDate", "IPRAcc", "IPRDesc") + +ipr_colnames <- c( + "AccNum", "SeqMD5Digest", "SLength", "Analysis", + "DB.ID", "SignDesc", "StartLoc", "StopLoc", "Score", + "Status", "RunDate", "IPRAcc", "IPRDesc" +) + +# post cleanup +########################## +## NEED TO BE REORDERED ## +########################## +ipr_cln_colnames <- c( + "DB.ID", "TaxID", "AccNum.noV", "AccNum", + "SeqMD5Digest", "SLength", "Analysis", "SignDesc", + "StartLoc", "StopLoc", "Score", "Status", "RunDate", + "IPRAcc", "IPRDesc", "FullAccNum", "ProteinName", + "Length", "SourceDB", "Completeness", "Lineage", + "Species", "Name", "ShortName", "LookupTblDesc", + "ID", "Label" +) + +########### +## RPSBLAST +########### +rps_colnames <- c( + "AccNum", "DB.ID", "DBSeqID", + "PcIdentity.Dom", "PcPosOrig.Dom", # "PcPos.Dom", # Ppos missing + "AlnLength", "Mismatch", + "SStart", "SEnd", "DStart", "DEnd", + "EValue", "BitScore", "TaxID" +) # TaxID missing (NA); remove? 
+ +####################### +## IPG and Lineage maps +####################### +ipg_colnames <- c( + "IPG.ID", "Source", "NucAccNum", + "NucStart", "NucStop", "Strand", + "AccNum", "Description", + "Species", "Spp.Strain", "AssemblyID" +) + +################## +## Assembly files +## Genbank, Refseq +################## +assembly_colnames <- c( + "AssemblyID", + "bioproject", "biosample", "wgs_master", # not used + "RefseqCategory", "TaxID", "Spp.TaxID", + "Species", "Spp.Strain", + "isolate", "version_status", # not used + "assembly_level", "release_type", # not used + "GenomeStatus", + "seq_rel_date", "asm_name", "submitter", # not used + "AssemblyID.GBRS", + "paired_asm_comp", "ftp_path", # not used + "excluded_from_refseq", "relation_to_type_material" +) # not used +assembly_sub_colnames <- c( + "TaxID", "Spp.TaxID", "Species", "Spp.Strain", + "RefseqCategory", "GenomeStatus", + "AssemblyID", "AssemblyID.GBRS" +) + +################# +## Lookup tables +## in common_data +################# +lineage_lookup_colnames <- c("TaxID", "Species", "Lineage_long", + "Lineage_long_na", "Lineage_med", + "Lineage_short", "Lineage") +domarch_lookup_colnames <- c("DB.ID", "ShortName", "Description", "ID") +# !! SC and LS will fix other piecemeal files based on these + +###################### +## FINAL UPLOADED DATA +###################### +## Combined data frame that is loaded on to the webapp +combo_colnames <- c( + "Query", "UID", "AccNum", "Species", "TaxID", "Lineage", + "PcPositive", "ClusterID", "QueryName", + # "AssemblyID", "GeneName", "Description", # MISSING NOW!?! + "DomArch.Pfam", "DomArch.COG", "DomArch.Gene3D", + "DomArch.TMHMM", "DomArch.Phobius", "DomArch.SignalP", + "DomArch.SMART", "DomArch.TIGR" +) + + +################ +## read tsv colnames +################ +lookup_table_cols <- cols( + DB.ID = col_character(), + ShortName = col_character(), + Description = col_character(), + ID = col_character() +) + +iprscan_cols <- cols( + .default = col_character(), + TaxID = col_double(), + SLength = col_double(), + SignDesc = col_character(), + StartLoc = col_double(), + StopLoc = col_double(), + Score = col_double(), + Status = col_logical(), + IPRAcc = col_character(), + IPRDesc = col_character(), + Length = col_double(), + ShortName = col_character(), + LookupTblDesc = col_character(), + ID = col_character(), + Label = col_character() +) + +ipr_cln_cols <- cols( + .default = col_character(), + TaxID = col_double(), + SLength = col_double(), + StartLoc = col_double(), + StopLoc = col_double(), + Score = col_double(), + Status = col_logical(), + IPRAcc = col_logical(), + IPRDesc = col_logical(), + Length = col_double(), + ID = col_logical() +) + +lineage_map_cols <- c( + "double", + "character", + "character", "character", "character", "character", "character" +) diff --git a/R/summarize.R b/R/summarize.R index 371473ce..b1fef89d 100644 --- a/R/summarize.R +++ b/R/summarize.R @@ -690,7 +690,7 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col abort("Error: 'digits' must be a non-negative integer.") } - column <- sym(column) + # column <- sym(column) prot <- select(prot, {{ column }}, {{ lineage_col }}) %>% filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>% @@ -698,10 +698,10 @@ totalGenContextOrDomArchCounts <- function(prot, column = "DomArch", lineage_col prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all") col_count <- prot %>% - group_by({{ column }}) %>% + group_by(!!sym(column)) %>% summarise(totalcount = sum(count)) - total 
<- left_join(prot, col_count, by = as_string(column))
+    total <- left_join(prot, col_count, by = column)
 
     sum_count <- sum(total$count)
     total <- total %>%
@@ -901,4 +901,4 @@ findParalogs <- function(prot) {
 # cat("Word counts for broken up domains from DAs and DAs from GCs.
 # \nFor e.g.:
 # DA.doms.wc <- query.sub$DA.doms %>%
-#   words2WordCounts()")
+#   words2WordCounts()")
\ No newline at end of file
diff --git a/R/viz_utils.R b/R/viz_utils.R
new file mode 100644
index 00000000..4697204e
--- /dev/null
+++ b/R/viz_utils.R
@@ -0,0 +1,924 @@
+# Author(s): Awa Synthia
+# Last modified: 2024
+
+# import libs
+#' @import dplyr stringr visNetwork DT plotly
+#'
+# Function to generate the InterProScan Visualization
+getIPRGenesVisualization <- function(data, app_data,
+                                     query_iprDatabases = c("PfamA", "Phobius", "TMHMM", "Gene3d"),
+                                     query_iprVisType = "Analysis") {
+
+    # Check if analysis is loaded
+    if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+        stop("Analysis data is not loaded properly or ipr_path is missing.")
+    }
+
+    # If there is no cln_path or PcPositive column is NULL
+    if (length(data@cln_path) == 0 || is.null(data@df$PcPositive)) {
+
+        # Set column name for accessing based on the data
+        n <- if ("name" %in% colnames(data@df)) {
+            "name"
+        } else {
+            "AccNum"
+        }
+        n <- "Name" # Hardcoded to "Name" based on original code
+
+        # Call the `ipr2viz_web` function
+        ipr_plot <- plotIPR2VizWeb(
+            infile_ipr = data@ipr_path,
+            accessions = data@df$Name,
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
+            name = n
+        )
+
+    } else {
+
+        # Call the `ipr2viz` function with additional arguments
+        ipr_plot <- plotIPR2Viz(
+            infile_ipr = data@ipr_path,
+            infile_full = data@df,
+            accessions = unique(data@df$Name),
+            analysis = query_iprDatabases,
+            group_by = query_iprVisType,
+            topn = 20, # This value is hardcoded in the original code
+            query = "All"
+        )
+    }
+
+    # Return the plot object for further use
+    return(ipr_plot)
+}
+
+# Function to generate the domain network layout visualization
+getRSNetworkLayout <- function(data, app_data,
+                               cutoff = 100,
+                               layout = "nice") {
+
+    # Check if analysis is loaded and app data has a valid ipr_path
+    if (nrow(data@df) == 0 || app_data@ipr_path == "") {
+        stop("Analysis not loaded or ipr_path missing.")
+    }
+
+    # Extract column names and find the first matching "DomArch" column
+    cols <- colnames(data@df)
+    if (is.null(cols)) {
+        stop("Dataframe columns are missing.")
+    }
+
+    # indexing an empty match with [1] yields NA, not NULL
+    col <- cols[grepl("DomArch.*", cols)][1]
+    if (is.na(col)) {
+        stop("No domain architecture column found.")
+    }
+
+    # Clean up domain architecture columns in the data
+    df_data <- data@df %>%
+        mutate(across(tidyselect::starts_with("DomArch"), cleanString))
+
+    # Generate the network using the domain_network function
+    res_network <- createDomainNetwork(
+        df_data,
+        column = col,
+        domains_of_interest = ".*",
+        cutoff = cutoff,
+        layout = layout
+    )
+
+    # Validate that the result is not an error
+    if (any(res_network == "error")) {
+        stop("Not enough nodes to construct a network.")
+    }
+
+    return(res_network)
+}
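+
+# Example (not run): building the domain co-occurrence network for a loaded
+# analysis; `data` and `app_data` are illustrative MolEvolData-style objects.
+# net <- getRSNetworkLayout(data, app_data, cutoff = 100, layout = "nice")
+# htmlwidgets::saveWidget(net, "domain-network.html")
+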
+# Function to generate the data table
+getDataTable <- function(data) {
+    if (nrow(data@df) == 0) {
+        stop("No data available.
+             Please ensure you have uploaded your data correctly.")
+    }
+
+    # Define the columns to be shown
+    viewing_cols <- c(
+        "AccNum", "QueryName", "Name", "Lineage", "Species",
+        "Length", "PcPositive", "DomArch.Pfam"
+    )
+
+    # Extract the data
+    d <- data@df
+
+    # Identify columns to hide (those not in viewing_cols)
+    hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+    # Add hyperlinks to the "AccNum" column; the NCBI protein link target is
+    # an assumption -- the original anchor markup was lost from this file
+    d$AccNum <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                       d$AccNum, "' target='_blank'>", d$AccNum, "</a>")
+
+    # Create the DataTable
+    dt <- DT::datatable(d,
+        rownames = FALSE,
+        filter = "top", # Enable filtering
+        extensions = c("Buttons"), # Enable buttons (e.g., CSV export)
+        escape = FALSE, # Allow HTML rendering (for the hyperlinks)
+        options = list(
+            dom = "frlBtip", # Layout for the table (filters, buttons, etc.)
+            pageLength = 10, # Number of rows per page
+            paging = TRUE, # Enable pagination
+            # Enable regex search
+            search = list(caseInsensitive = TRUE, regex = TRUE),
+            language = list(
+                searchPlaceholder = "Regex or Text...",
+                filterPlaceholder = "Test"
+            ),
+            buttons = list(
+                list(
+                    extend = "colvis",
+                    text = "Add/remove column(s)"
+                ),
+                list(
+                    extend = "csv",
+                    text = "Download",
+                    filename = "molevolvr-data",
+                    exportOptions = list(
+                        # Export all data, not just visible page
+                        modifier = list(page = "all")
+                    )
+                )
+            ),
+            scrollX = FALSE, # Disable horizontal scrolling
+            fixedHeader = FALSE, # Disable fixed header
+            columnDefs = list(
+                # Hide columns not in 'viewing_cols'
+                list(visible = FALSE, targets = hide_cols)
+            )
+        )
+    )
+
+    return(dt)
+}
+
+# Function to generate query data table
+getQueryDataTable <- function(query_data, query_select = NULL) {
+
+    # Check if analysis is loaded and data is available
+    if (nrow(query_data@df) == 0) {
+        stop("No data available. Please ensure you have uploaded your data
+             correctly.")
+    }
+
+    # Define the columns to be shown
+    viewing_cols <- c("QueryName", "Species", "Lineage", "DomArch.Pfam")
+
+    # If no specific queries are selected, display all data
+    if (is.null(query_select)) {
+        d <- query_data@df
+    } else {
+        # Filter the data for selected queries
+        d <- query_data@df %>% filter(QueryName %in% query_select)
+        d <- droplevels(d)
+    }
+
+    # Identify columns to hide (those not in viewing_cols)
+    hide_cols <- which(!colnames(d) %in% viewing_cols) - 1
+
+    # Add hyperlinks to the "Query" column (replace 'Query' with the actual
+    # column name if different); as above, the NCBI link target is assumed
+    d$Query <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                      d$Query, "' target='_blank'>", d$Query, "</a>")
+
+    # Create and return the DataTable
+    dt <- DT::datatable(d,
+        rownames = FALSE,
+        filter = "top", # Enable filtering
+        extensions = c("Buttons"), # Enable buttons (e.g., CSV export)
+        callback = DT::JS(c(
+            '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+        )),
+        escape = FALSE, # Allow HTML rendering (for the hyperlinks)
+        options = list(
+            dom = "frlBtip", # Layout for the table (filters, buttons, etc.)
+            pageLength = 25, # Number of rows per page
+            paging = TRUE, # Enable pagination
+            # Enable regex search
+            search = list(caseInsensitive = TRUE, regex = TRUE),
+            language = list(
+                searchPlaceholder = "Regex or Text..."
+ ), + buttons = list( + list( + extend = "colvis", + text = "Add/remove column(s)" + ), + list( + extend = "csv", + text = "Download", + filename = "molevolvr-query", + exportOptions = list( + # Export all data, not just visible page + modifier = list(page = "all") + ) + ) + ), + scrollX = FALSE, # Disable horizontal scrolling + fixedHeader = FALSE, # Disable fixed header + columnDefs = list( + # Hide columns not in 'viewing_cols' + list(visible = FALSE, targets = hide_cols) + ) + ) + ) + + return(dt) +} + +# Function to read and return the FASTA file contents +readFastaData <- function(fasta_path) { + + # Check if analysis is loaded and the file path is not empty + if (fasta_path == "" || !file.exists(fasta_path)) { + stop("FASTA file path is invalid or the file does not exist.") + } + + # Read the content of the FASTA file + fasta_content <- read_file(fasta_path) + + return(fasta_content) +} + +getFastaData <- function(fasta_path) { + if (is.null(fasta_path) || fasta_path == "") { + stop("Error: FASTA path is not provided.") + } + return(read_file(fasta_path)) # Read and return the FASTA data +} + +# Function to get domain sequences (assumes `data` is a predefined object) +getDomData <- function(data) { + return(data@domainSeqs) # Return domain sequences +} + +# Function to get MSA data from a given path +getMSAData <- function(msa_path) { + if (is.null(msa_path) || msa_path == "") { + stop("Error: MSA path is not provided.") + } + # Attempt to read the file and handle potential errors + if (file.exists(msa_path)) { + return(read_file(msa_path)) + } else { + warning(sprintf("Warning: Unable to read the file at path '%s'. Ignoring...", msa_path)) + return(NULL) # Return NULL if the file cannot be read + } +} + +# Function to generate a heatmap +getQueryHeatmap <- function(query_data_df, + heatmap_select = "All", + heatmap_color = "blue") { + + # Check if analysis is loaded and query data exists + if (nrow(query_data_df) == 0) { + stop("No query data available.") + } + + # Filter queries based on user selection + if (heatmap_select == "All") { + queries <- unique(query_data_df$QueryName) + prot <- query_data_df %>% + filter(Lineage != "") %>% + tidyr::drop_na(Lineage) + } else { + queries <- heatmap_select + prot <- query_data_df %>% + filter(grepl(heatmap_select, QueryName, ignore.case = TRUE)) %>% + filter(Lineage != "") %>% + tidyr::drop_na(Lineage) + } + + # Validate that the Lineage column has values + if (all(unique(prot$Lineage) == "")) { + stop("Lineage column not found for selected proteins. 
+             See the FAQ for possible reasons/solutions.")
+    }
+
+    plotLineageQuery(prot, queries = queries, colname = "QueryName",
+                     cutoff = 100, color = heatmap_color)
+}
+
+# Function to retrieve domain architecture columns
+getQueryDomArchCols <- function(query_data_df) {
+    # Check if query data exists
+    if (nrow(query_data_df) == 0) {
+        stop("No query data available.")
+    }
+
+    # Get column names
+    cols <- colnames(query_data_df)
+
+    # Filter domain architecture columns, excluding repeats
+    domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+    # Identify columns that are completely NA
+    na_cols <- names(query_data_df)[apply(query_data_df, 2,
+                                          function(x) all(is.na(x)))]
+
+    # Remove NA columns from domain architecture columns
+    domarch_cols <- setdiff(domarch_cols, na_cols)
+
+    # Move SignalP_GRAM_POSITIVE to the end if present
+    if ("SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+        domarch_cols <- setdiff(domarch_cols, "SignalP_GRAM_POSITIVE") # Remove first
+        domarch_cols <- append(domarch_cols, "SignalP_GRAM_POSITIVE") # Append it last
+    }
+
+    # Strip the "DomArch." prefix from column names
+    domarch_cols <- substring(domarch_cols, first = 9)
+
+    return(domarch_cols)
+}
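+
+# Example (not run): listing the populated domain-architecture columns for a
+# query table and drawing the query-by-lineage heatmap (inputs illustrative).
+# getQueryDomArchCols(query_data@df)
+# getQueryHeatmap(query_data@df, heatmap_select = "All",
+#                 heatmap_color = "blue")
+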
+# Function to generate main data table
+getMainTable <- function(data_df, main_select = NULL) {
+    # Validate input data
+    if (nrow(data_df) == 0) {
+        stop("No data available. Please ensure you have uploaded your data
+             correctly. See Help documentation or contact JRaviLab
+             (janani.ravi[AT]cuanschutz[DOT]edu).")
+    }
+
+    # Define columns to view
+    viewing_cols <- c("AccNum", "QueryName", "Name", "Lineage",
+                      "Species", "Length", "PcPositive",
+                      "DomArch.Pfam")
+
+    # Filter data based on selection
+    if (!is.null(main_select)) {
+        d <- data_df %>%
+            filter(QueryName %in% main_select) %>%
+            droplevels()
+    } else {
+        d <- data_df
+    }
+
+    # Identify columns to hide
+    hide_cols <- which(!colnames(d) %in% viewing_cols)
+    hide_cols <- hide_cols - 1
+
+    # Create hyperlinks for AccNum; the NCBI protein link target is an
+    # assumption -- the original anchor markup was lost from this file
+    d$AccNum <- paste0("<a href='https://www.ncbi.nlm.nih.gov/protein/",
+                       d$AccNum, "' target='_blank'>", d$AccNum, "</a>")
+
+    # Generate DataTable
+    datatable_output <- DT::datatable(d,
+        rownames = FALSE,
+        filter = "top",
+        callback = DT::JS(c(
+            '$(\'div.has-feedback input[type="search"]\').attr( "placeholder", "Search..." );'
+        )),
+        extensions = c("Buttons"),
+        escape = FALSE,
+        options = list(
+            dom = "frlBtip",
+            pageLength = 25,
+            paging = TRUE,
+            search = list(caseInsensitive = TRUE, regex = TRUE),
+            language = list(searchPlaceholder = "Regex or Text...",
+                            filterPlaceholder = "Test"),
+            buttons = list(
+                list(extend = "colvis", text = "Add/remove column(s)"),
+                list(
+                    extend = "csv",
+                    text = "Download",
+                    filename = "molevolvr-homolog",
+                    exportOptions = list(modifier = list(page = "all"))
+                )
+            ),
+            scrollX = FALSE,
+            fixedHeader = FALSE,
+            columnDefs = list(list(visible = FALSE, targets = hide_cols))
+        )
+    )
+
+    return(datatable_output)
+}
+
+totalCounts <- function(prot, column = "DomArch", lineage_col = "Lineage",
+                        cutoff = 90, RowsCutoff = FALSE, digits = 2
+                        # type = "GC"
+) {
+    # column <- sym(column)
+
+    prot <- select(prot, {{ column }}, {{ lineage_col }}) %>%
+        filter(!is.na({{ column }}) & !is.na({{ lineage_col }})) %>%
+        filter({{ column }} != "")
+
+    prot <- summarizeByLineage(prot, column, by = lineage_col, query = "all")
+    col_count <- prot %>%
+        group_by(!!sym(column)) %>%
+        summarise(totalcount = sum(count))
+
+    total <- left_join(prot, col_count, by = column)
+
+    sum_count <- sum(total$count)
+    total <- total %>%
+        mutate("IndividualCountPercent" = totalcount / sum_count * 100) %>%
+        arrange(-totalcount, -count)
+
+    cumm_percent <- total %>%
+        select({{ column }}, totalcount) %>%
+        distinct() %>%
+        mutate("CumulativePercent" = 0)
+    total_counter <- 0
+    for (x in length(cumm_percent$totalcount):1) {
+        total_counter <- total_counter + cumm_percent$totalcount[x]
+        cumm_percent$CumulativePercent[x] <- total_counter / sum_count * 100
+    }
+
+    cumm_percent <- cumm_percent %>% select(CumulativePercent, {{ column }})
+
+    total <- total %>% left_join(cumm_percent, by = as_string(column))
+
+    # Round the percentage columns
+    total$CumulativePercent <- total$CumulativePercent %>% round(digits = digits)
+    total$IndividualCountPercent <- total$IndividualCountPercent %>% round(digits = digits)
+
+    if (RowsCutoff) {
+        # If total counts is being used for plotting based on number of rows,
+        # don't include other observations that fall below the cumulative
+        # percent cutoff, but that have the same 'totalcount' number as the
+        # cutoff observation
+        total <- total %>% filter(CumulativePercent >= 100 - cutoff - .0001)
+        return(total)
+    }
+
+    # Include observations that fall below the cumulative percent cutoff,
+    # but that have the same 'totalcount' as the cutoff observation
+    t <- total %>% filter(CumulativePercent >= 100 - cutoff)
+    # nrow(), not length(): length() of a data frame is its column count
+    if (nrow(t) == 0) {
+        cutoff_count <- 0
+    } else {
+        cutoff_count <- t$totalcount[nrow(t)]
+    }
+
+    total <- total %>%
+        filter(totalcount >= cutoff_count) %>%
+        ungroup()
+
+    return(total)
+}
+
+getDomArchTotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
+    # Check if ipr_path is not empty
+    if (app_data@ipr_path == "") {
+        stop("ipr_path is missing.")
+    }
+
+    # Calculate total counts with the specified cutoff and column
+    prot_tc <- totalCounts(DA_Prot, cutoff = DACutoff, column = DA_col)
+
+    # Replace all instances of ">" in the Lineage column with "_"
+    prot_tc$Lineage <- map(prot_tc$Lineage, ~ str_replace_all(.x, ">", "_")) %>%
+        unlist()
+
+    # Return the processed data
+    prot_tc
+}
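+
+# Example (not run): a sketch of summarising domain-architecture counts per
+# lineage, keeping rows that cover the top 90% of observations; `prot` is an
+# illustrative homolog table with DomArch.Pfam and Lineage columns.
+# counts <- totalCounts(prot, column = "DomArch.Pfam",
+#                       lineage_col = "Lineage", cutoff = 90)
+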
+ getDomArchTotalCounts <- function(DA_Prot, DACutoff, DA_col, app_data) {
+     # Check that ipr_path is not empty
+     if (app_data@ipr_path == "") {
+         stop("ipr_path is missing.")
+     }
+
+     # Calculate total counts with the specified cutoff and column
+     prot_tc <- totalCounts(DA_Prot, cutoff = DACutoff, column = DA_col)
+
+     # Replace all instances of ">" in the Lineage column with "_"
+     prot_tc$Lineage <- str_replace_all(prot_tc$Lineage, ">", "_")
+
+     # Return the processed data
+     prot_tc
+ }
+
+ # Function to generate the Domain Architecture Linear Table
+ getDomArchLinearTable <- function(DA_col, ipr_path, DA_TotalCounts_value) {
+     # Check that ipr_path is valid
+     if (ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Check that DA_col is provided
+     if (is.null(DA_col)) {
+         stop("DA_col input is required.")
+     }
+
+     da_col <- sym(DA_col)
+
+     # Summarize lineage counts per domain architecture; the result must be
+     # assigned so the table below renders the summary
+     DA_counts <- DA_TotalCounts_value %>%
+         group_by({{ da_col }}, totalcount, CumulativePercent) %>%
+         summarize(LineageCount = n()) %>%
+         select({{ da_col }}, LineageCount, totalcount, CumulativePercent) %>%
+         arrange(-totalcount)
+
+     # Generate the DA-lineage count table
+     DAlin_table <- DT::datatable(
+         DA_counts,
+         selection = "single",
+         extensions = c("Buttons"),
+         options = list(
+             pageLength = 25,
+             dom = "frlBtip",
+             buttons = list(
+                 list(
+                     extend = "csv",
+                     text = "Download",
+                     filename = "MolEvolvR_domarch",
+                     exportOptions = list(
+                         modifier = list(page = "all")
+                     )
+                 )
+             ),
+             scrollX = FALSE,
+             paging = TRUE,
+             fixedHeader = FALSE,
+             fixedColumns = list(leftColumns = 0, rightColumns = 0)
+         )
+     )
+
+     return(DAlin_table)
+ }
+
+ # Function to generate the Domain Architecture Lineage Plot
+ getDomArchHeatmapPlot <- function(DA_col, DACutoff,
+                                   DA_Prot, DA_lin_color,
+                                   ipr_path) {
+     # Check that ipr_path is valid
+     if (ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Filter the protein data for plotting
+     prot <- DA_Prot %>%
+         filter(Lineage != "") %>%
+         drop_na(Lineage)
+
+     # Create the plot
+     plot <- plotLineageDA(
+         prot,
+         colname = DA_col,
+         cutoff = DACutoff,
+         RowsCutoff = FALSE,
+         color = DA_lin_color
+     )
+
+     # Convert the ggplot to an interactive plotly object
+     plot <- plotly::ggplotly(plot)
+     plot <- plot %>% plotly::layout(xaxis = list(side = "top"))
+
+     return(plot)
+ }
+
+ # Function to generate the Domain Architecture Network
+ getDomNetwork <- function(DA_col, DACutoff, DA_Prot,
+                           networkLayout, ipr_path) {
+     # Check that ipr_path is valid
+     if (ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Prepare the selected protein data; `[[` takes the column name as a
+     # string, while the dplyr verbs below take it as a symbol
+     dn_data <- DA_Prot
+     col <- sym(DA_col)
+     dn_data[[DA_col]] <- str_replace_all(dn_data[[DA_col]], " ", "_")
+
+     # Filter data based on the specified column
+     dn_data <- dn_data %>%
+         drop_na(!!col) %>%
+         filter(!!col != "")
+
+     # Generate the domain network
+     res <- createDomainNetwork(
+         prot = dn_data,
+         column = DA_col,
+         domains_of_interest = ".*",
+         cutoff = DACutoff,
+         layout = networkLayout
+     )
+
+     # Validate the result
+     if (any(res == "error")) {
+         stop("Not enough nodes to construct a network.
+             Try increasing 'Total Count Cutoff'.")
+     }
+
+     # Add a PNG export button to the network widget
+     res <- res %>% visExport(
+         type = "png",
+         name = "domain_architecture-network",
+         float = "left",
+         label = "Download plot",
+         style = "
+             background-color: #ffffff;
+             color: #333333;
+             border-color: #cccccc;
+             padding: 6px 12px;
+             font-size: 14px;
+             line-height: 1.42857143;
+             border-radius: 4px;
+             cursor: pointer;
+             display: inline-block;
+             margin-bottom: 0;
+             font-weight: normal;
+             text-align: center;
+             vertical-align: middle;
+             touch-action: manipulation;
+             user-select: none;
+             background-image: none;
+             text-decoration: none;
+         "
+     )
+
+     return(res)
+ }
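+
+ # Illustrative usage of getDomNetwork (a sketch; `networkLayout` is passed
+ # straight through to createDomainNetwork(), and `prot`/`app_data` stand in
+ # for the app's objects):
+ #   net <- getDomNetwork(DA_col = "DomArch.Pfam", DACutoff = 90,
+ #                        DA_Prot = prot, networkLayout = "nice",
+ #                        ipr_path = app_data@ipr_path)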
+ # Function to retrieve and clean Domain Architecture data
+ getDomArchProt <- function(app_data, DASelect) {
+     # Check that the InterProScan results path is set
+     if (app_data@ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Check that the data frame has rows
+     if (nrow(app_data@df) <= 0) {
+         stop("No data available. Please ensure you have uploaded your data correctly.
+ See Help documentation or contact JRaviLab (janani.ravi[AT]cuanschutz[DOT]edu).")
+     }
+
+     # Retrieve app data and clean domain architecture columns
+     df_app_data <- app_data@df
+
+     # Domain architecture column cleanup
+     df_app_data <- df_app_data %>%
+         mutate(across(starts_with("DomArch"), cleanString))
+
+     # Filter based on user selection
+     if (DASelect == "All") {
+         return(df_app_data)
+     } else {
+         return(df_app_data %>% filter(grepl(DASelect, QueryName,
+                                             ignore.case = TRUE)))
+     }
+ }
+
+ # Function to retrieve domain architecture columns
+ getDomArchCols <- function(app_data, DASelect) {
+     # Check that the app data frame is not empty
+     if (nrow(app_data@df) <= 0) {
+         stop("No data available in app data.")
+     }
+
+     # Check that ipr_path is valid
+     if (app_data@ipr_path == "") {
+         stop("InterPro path is empty.")
+     }
+
+     # Get column names
+     cols <- colnames(app_data@df)
+
+     # Filter domain architecture columns
+     domarch_cols <- cols[grepl("^DomArch", cols) & !grepl("repeats$", cols)]
+
+     # Retrieve the data frame
+     domarch_data <- app_data@df
+
+     # Filter the data frame based on user selection
+     if (DASelect != "All") {
+         domarch_data <- domarch_data %>% filter(QueryName == DASelect)
+     }
+
+     # Identify columns that are completely NA or empty
+     na_cols <- apply(domarch_data, 2, function(x) {
+         all(is.na(x)) || all(x == "")
+     })
+     na_cols <- names(domarch_data)[na_cols]
+
+     # Remove all-NA/empty columns from the domain architecture columns
+     domarch_cols <- setdiff(domarch_cols, na_cols)
+
+     # Ensure SignalP_GRAM_POSITIVE is ordered last if it exists; the columns
+     # still carry the "DomArch." prefix here
+     if ("DomArch.SignalP_GRAM_POSITIVE" %in% domarch_cols) {
+         domarch_cols <- append(domarch_cols[!domarch_cols %in%
+                                             "DomArch.SignalP_GRAM_POSITIVE"],
+                                "DomArch.SignalP_GRAM_POSITIVE")
+     }
+
+     return(domarch_cols)
+ }
+
+ # Function to generate the IPR genes plot
+ getDomArchIPRGenesPlot <- function(app_data, query_iprDatabases,
+                                    query_iprVisType, DASelect) {
+     if (app_data@ipr_path == "") {
+         stop("IPR path is not set.")
+     }
+
+     # Validate the data frame
+     df <- app_data@df
+     if (nrow(df) == 0) {
+         stop("No data available. Please ensure you have uploaded
+             your data correctly.")
+     }
+
+     if (is.null(query_iprDatabases) || length(query_iprVisType) == 0) {
+         stop("Please select an analysis.")
+     }
+
+     if (length(app_data@cln_path) == 0 || is.null(df$PcPositive)) {
+         # Label sequences by the "Name" column
+         name_column <- "Name"
+
+         # Generate the plot using the web version
+         plot <- plotIPR2VizWeb(
+             infile_ipr = app_data@ipr_path,
+             accessions = df$Name,
+             analysis = query_iprDatabases,
+             group_by = query_iprVisType,
+             name = name_column
+         )
+     } else {
+         # Generate the plot using the local version
+         plot <- plotIPR2Viz(
+             infile_ipr = app_data@ipr_path,
+             infile_full = df,
+             accessions = unique(df$Name),
+             analysis = query_iprDatabases,
+             group_by = query_iprVisType,
+             topn = 20,
+             query = DASelect
+         )
+     }
+
+     return(plot)
+ }
+
+ # Function to filter proteins for phylogeny
+ filterPhylogenyProteins <- function(app_data, phylo_select) {
+     # Get the data frame from app_data
+     df <- app_data@df
+
+     # Check if the data frame is empty
+     if (nrow(df) == 0) {
+         stop("No data available in app_data.
+ Please ensure it has been loaded correctly.") + } + + # Filter the data based on user selection + if (phylo_select == "All") { + filtered_df <- df + } else { + filtered_df <- df %>% + filter(grepl(phylo_select, QueryName, ignore.case = TRUE)) %>% + filter(Lineage != "") + } + + return(filtered_df) +} + +# Function to retrieve representative accession numbers +getRepAccNum <- function(app_data, phylo_select, + msa_reduce_by, msa_rep_num, + rval_phylo) { + + if (rval_phylo()) { + return(app_data@df$AccNum) + } else { + switch(msa_reduce_by, + "Species" = rep_acc_species(), + "Lineage" = rep_acc_lineage(), + "DomArch" = { + if (app_data@ipr_path == "") { + stop("IPR path is empty. Please ensure it is set correctly.") + } + seqs <- find_top_acc(infile_full = app_data@df, + n = msa_rep_num, + query = phylo_select) + if (is.null(seqs)) { + stop("No sequences found.") + } + return(seqs) + }, + "Full" = { + tmp <- app_data@df %>% filter(QueryName == phylo_select) + return(tmp$AccNum) + }, + stop("Invalid selection for msa_reduce_by.") + ) + } +} + +getDomArchPlot <- function(ipr_path, query_names, + analysis_type, group_by) { + # Check if the input path is provided + if (is.null(ipr_path) || ipr_path == "") { + stop("Error: Input path for IPR data is not provided.") + } + + # Validate that at least one domain is found + if (length(query_domarch_cols()) < 1) { + stop("Error: No domains found in the input sequences.") + } + + n <- "Name" + plot <- plotIPR2VizWeb( + infile_ipr = ipr_path, + accessions = query_names, + analysis = analysis_type, + group_by = group_by, + name = n + ) + + return(plot) # Return the generated plot +} + +# Function to convert accessions to names +acc2Name <- function(app_data) { + # Check if "AccNum" is a column in the data + if (!("AccNum" %in% colnames(app_data@df))) { + stop("Column 'AccNum' not found in data.") + } + + # Check if "Name" column exists and create the output data frame + if ("Name" %in% colnames(app_data@df)) { + # Select "AccNum" and "Name" + df <- select(app_data@df, "AccNum", "Name") + } else { + # Create a data frame with "AccNum" and assign "AccNum" to "Name" + df <- app_data@df %>% + select("AccNum") %>% + mutate(Name = AccNum) + } + + return(df) +} + +repAccNums <- function(phylo, msa_reduce_by, msa_rep_num, PhyloSelect, app_data) { + # If `phylo` is true, return all `AccNum` values from `app_data` + if (phylo) { + return(app_data@df$AccNum) + } else { + # Switch based on `msa_reduce_by` value + tmp <- filter(app_data@df) + rep_acc_species <- createRepresentativeAccNum(tmp, reduced = "Species", accnum_col = "AccNum") + + # Get representative accession numbers by "Lineage" + rep_acc_lineage <- createRepresentativeAccNum(tmp, reduced = "Lineage", accnum_col = "AccNum") + switch(msa_reduce_by, + "Species" = rep_acc_species, + "Lineage" = rep_acc_lineage, + "DomArch" = { + # Check if `ipr_path` is not empty + if (app_data@ipr_path == "") { + stop("ipr_path is missing.") + } + # Find top sequences by accession number based on criteria + seqs <- getTopAccByLinDomArch(infile_full = app_data@df, n = msa_rep_num, query = PhyloSelect) + if (is.null(seqs)) { + stop("No sequences found.") + } + return(seqs) + }, + "Full" = { + # Filter data for matching `QueryName` and return `AccNum` + tmp <- app_data@df %>% filter(QueryName == PhyloSelect) + return(tmp$AccNum) + } + ) + } +} + +seqTree <- function(fasta_filepath){ + my_seqs <- readAAStringSet(fasta_filepath) #, format="fasta", seek.first.rec=T) + my_seqs_msa <- msa(my_seqs) + my_seqs_msa_aln <- 
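+     # Convert the msa alignment into seqinr's "alignment" class so that
+     # seqinr::dist.alignment() below can compute pairwise identity distances;
+     # ape::nj() then builds a neighbor-joining tree from that matrix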
msaConvert(my_seqs_msa, type="seqinr::alignment") + + #below was commented out, does it need to change as one of the parameters? the bottom keeps + d <- dist.alignment(my_seqs_msa_aln, "identity") + #as.matrix(d)[2:5, "HBA1_Homo_sapiens", drop=FALSE] + + ## Phylogenetic tree + ## using package ape + ## build neighbor-joining tree + seqTree <- nj(d) + #plot(seqTree, main="Phylogenetic Tree of MSA") + groupInfo <- split(seqTree$tip.label, + gsub("_\\w+", "", seqTree$tip.label)) + seqTree <- groupOTU(seqTree, groupInfo) + # ggtree(seqTree, aes(color=group), + # layout='circular') + + # geom_tiplab(size=1, aes(angle=angle)) + #https://yulab-smu.top/treedata-book/chapter4.html + #offs <- 0 + tree <- ggtree(seqTree, branch.length = "dN_vs_dS") + theme_tree2(axis.line.x = element_blank(), axis.text.x = element_blank(), axis.ticks.x = element_blank()) + msaplot(tree, fasta=fasta_filepath, offset=0.5, bg_line = TRUE) + geom_tiplab(align=TRUE, linesize=0.5, size=3) + +} diff --git a/inst/report/report_template.Rmd b/inst/report/report_template.Rmd new file mode 100644 index 00000000..ab02bdeb --- /dev/null +++ b/inst/report/report_template.Rmd @@ -0,0 +1,497 @@ +--- +title: MolEvolvR +output: + html_document: + self_contained: yes +params: + rs_interproscan_visualization: default + proximity_network: default + sunburst: default + data: default + queryDataTable: default + fastaDataText: default + heatmap: default + query_data: default + query_domarch_cols: default + query_iprDatabases: default + query_iprVisType: default + mainTable: default + DALinTable: default + DALinPlot: default + DANetwork: default + DA_Prot: default + domarch_cols: default + DA_Col: default + DACutoff: default + da_interproscan_visualization: default + phylo_sunburst_levels: default + phylo_sunburst: default + lineage_table: default + ## Tree + tree_msa_tool: default ##input + ## MSA + repAccNums: default + msa_rep_num: default ##input + app_data: default + PhyloSelect: default ##input + acc2Name: default + rval_phylo: default + query_pin: default + msa_reduce_by: default +--- + +--- +subtitle: "`r glue::glue('Retrieval Code: {params$query_pin}')`" +--- + + + + + +```{r mine,results="asis",engine="js", echo=FALSE} +// stumble through the tabset(s) +setTimeout(function(){ + document.querySelector('a#FullResultButton1').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#domain-architecture-2"]').click(); //Second(-2) Domain Architecture Section (reuse at different heading levels) + } +}, 100); // slow your roll + +setTimeout(function(){ + document.querySelector('a#FullResultButton2').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#phylogeny-1"]').click(); //Second Phylogeny(-1) Section (reuse at different heading levels) + } +}, 100); + +setTimeout(function(){ + document.querySelector('a#FullResultButton3').addEventListener('click', doIt); + function doIt(){ + document.querySelector('a[href*="#homolog-data"]').click(); //Homolog Data Section + } +}, 100); +``` + +```{r setup, include = FALSE} +## Globally Suppress warnings and messaging +knitr::opts_chunk$set(warning = FALSE, message = FALSE) +## Create Sunburst Report Function +cpcols <- c( + "#AFEEEE", "#DDA0DD", "#EE2C2C", "#CDBE70", "#B0B099", + "#8B2323", "#EE7600", "#EEC900", "chartreuse3", "#0000FF", + "#FFD900", "#32CD32", "maroon4", "cornflowerblue", "darkslateblue", + "#AB82FF", "#CD6889", "#FFA07A", "#FFFF00", "#228B22", + "#FFFFE0", "#FFEC8B", "peru", "#668B8B", "honeydew", + "#A020F0", 
"grey", "#8B4513", "#191970", "#00FF7F", + "lemonchiffon", "#66CDAA", "#5F9EA0", "#A2CD5A", "#556B2F", + "#EEAEEE", "thistle4", "#473C8B", "#FFB6C1", "#8B1C62", + "#FFE4B5", "black", "#FF7F50", "#FFB90F", "#FF69B4", "#836FFF", + "#757575", "#CD3333", "#EE7600", "#CDAD00", "#556B2F", "#7AC5CD" + ) +lineage_sunburst_report <- function(prot, lineage_column = "Lineage", + type = "sunburst", + levels = 2, colors = NULL, legendOrder = NULL, showLegend = TRUE, maxLevels = 5) { + lin_col <- sym(lineage_column) + + # ensure they don't exceed maxLevels, although this should + if (!is.null(maxLevels) && levels > maxLevels) { + levels <- maxLevels + } + + levels_vec <- c() + for (i in 1:levels) + { + levels_vec <- append(levels_vec, paste0("level", i)) + } + + # Take lineage column and break into the first to levels + prot <- prot %>% + select({{ lin_col }}) %>% + arrange(desc({{ lin_col }})) %>% + drop_na({{ lin_col }}) + protLevels <- prot %>% separate({{ lin_col }}, into = levels_vec, sep = ">") + # Count the occurrance of each group of levels + protLevels <- protLevels %>% + group_by_at(levels_vec) %>% + summarise(size = n()) + protLevels <- protLevels %>% arrange() + tree <- d3_nest(protLevels, value_cols = "size") + + # Plot sunburst + if (type == "sunburst") { + result <- sunburst(tree, legend = list(w = 225, h = 15, r = 5, s = 5), colors = cpcols, legendOrder = legendOrder) + } else if (type == "sund2b") { + result <- sund2b(tree) + } + + if (showLegend) { + return( + htmlwidgets::onRender( + result, + "function(el, x) { + jQuery('.sunburst-togglelegend', el) + .attr('checked', 'true') + .attr('data-html2canvas-ignore', 'true'); + jQuery('.sunburst-legend', el).css('visibility', ''); + + // create a button to download the sunburst + // (relies on html2canvas being included in the page) + + // FIXME: consider pulling this all out into a js library + // so that we can apply it to other components. + + const downloadBtn = jQuery('') + .css({'position': 'absolute', 'right': '5px', 'top': '5px'}) + .attr('data-html2canvas-ignore', 'true') + .appendTo(el); + + saveAs = (blob, fileName) => { + const link = document.createElement('a'); + link.download = fileName + link.href = URL.createObjectURL(blob); + link.click(); + URL.revokeObjectURL(link.href); + } + + downloadBtn.click(() => { + html2canvas(el, { scale: 4, logging: false }).then(canvas => { + canvas.toBlob(function(blob) { + saveAs(blob, 'sunburst.png'); + }); + }); + }); + }" + ) + ) + } + + return(result) +} +``` + +# {.tabset} + +## Results Summary + +An overview of the protein analysis. To view the full results, explore the additional tabs. + +### Domain Architecture + +Visualizations and summaries for protein domains. + +View full results + + +#### Interproscan Visualization + +```{r, echo=FALSE} +params$rs_interproscan_visualization +``` + +#### Proximity Network + +```{r, echo=FALSE} +params$proximity_network +``` + +### Phylogeny + +Visualizations for protein evolution. + +View full results + +#### Sunburst + +```{r, echo=FALSE} +library(dplyr) +library(tidyr) +lineage_sunburst_report(params$sunburst, "Lineage", levels = 2) +``` + +### Data + +Summary table of proteins including domain architectures, phylogeny, and homologs, when applicable. + +View full results + +```{r, echo=FALSE} +params$data +``` + + +## Query Data {.tabset} + +Input data, additional metadata, and preliminary analyses of query protein(s). + +### Data Table + +The data table provides a summary of the sequences submitted, or \"queried\", for analysis. 
+ ```
+
+ # {.tabset}
+
+ ## Results Summary
+
+ An overview of the protein analysis. To view the full results, explore the additional tabs.
+
+ ### Domain Architecture
+
+ Visualizations and summaries for protein domains.
+
+ <a href="#" id="FullResultButton1">View full results</a>
+
+ #### Interproscan Visualization
+
+ ```{r, echo=FALSE}
+ params$rs_interproscan_visualization
+ ```
+
+ #### Proximity Network
+
+ ```{r, echo=FALSE}
+ params$proximity_network
+ ```
+
+ ### Phylogeny
+
+ Visualizations for protein evolution.
+
+ <a href="#" id="FullResultButton2">View full results</a>
+
+ #### Sunburst
+
+ ```{r, echo=FALSE}
+ library(dplyr)
+ library(tidyr)
+ lineage_sunburst_report(params$sunburst, "Lineage", levels = 2)
+ ```
+
+ ### Data
+
+ Summary table of proteins including domain architectures, phylogeny, and homologs, when applicable.
+
+ <a href="#" id="FullResultButton3">View full results</a>
+
+ ```{r, echo=FALSE}
+ params$data
+ ```
+
+
+ ## Query Data {.tabset}
+
+ Input data, additional metadata, and preliminary analyses of query protein(s).
+
+ ### Data Table
+
+ The data table provides a summary of the sequences submitted, or \"queried\", for analysis. The preview shown can be extended by using \"Add/remove column(s)\" to see info about other taxonomic classes as well as domain architecture codes from databases other than the default (Pfam).
+
+ ```{r, echo=FALSE}
+ params$queryDataTable
+ ```
+
+ ### FASTA
+
+ Uploaded amino acid FASTA sequence(s).
+
+ `r glue::glue(params$fastaDataText)`
+
+ ### Query Heatmap
+
+ A heatmap of submitted sequences and their respective taxonomic lineages.
+
+ ```{r, echo=FALSE}
+ params$heatmap
+ ```
+
+ ### Domain Architecture
+
+ Visualizations and analyses of all query and homologous protein domains, structural or functional subunits, and their architectures.
+
+ ```{r, echo=FALSE}
+ if (is.null(params$query_iprDatabases)) {
+     choices <- params$query_domarch_cols
+     if ("PfamA" %in% choices && "Phobius" %in% choices) {
+         analysis_type <- c("PfamA", "Phobius")
+     } else {
+         analysis_type <- choices[1]
+     }
+ } else {
+     analysis_type <- params$query_iprDatabases
+ }
+
+ if (is.null(params$query_iprVisType)) {
+     analysis_group <- 'Analysis'
+ } else {
+     analysis_group <- params$query_iprVisType
+ }
+
+ if (length(params$query_domarch_cols) >= 1) {
+     DomArchPlot <- plotIPR2VizWeb(
+         infile_ipr = params$query_data@ipr_path,
+         accessions = params$query_data@df$QueryName,
+         analysis = analysis_type, group_by = analysis_group, name = "Name"
+     )
+     DomArchPlot
+ } else {
+     "No domains found in the input sequences."
+ }
+ ```
+
+ ## Homolog Data
+
+ Full set of homologs of query sequences, including their lineage and domain architecture info.
+
+ ```{r, echo=FALSE}
+ params$mainTable
+ ```
+
+ ## Domain Architecture {.tabset}
+
+ Summary and visualizations of protein motifs/subunits (domains) and their configurations within the query protein(s) (domain architectures).
+
+ ### Table
+
+ ```{r, echo=FALSE}
+ params$DALinTable
+ ```
+
+ ### Heatmap
+
+ ```{r, echo=FALSE}
+ params$DALinPlot
+ ```
+
+ ### Network
+
+ ```{r, echo=FALSE}
+ params$DANetwork
+ ```
+
+ ```{r, echo=FALSE}
+ if (length(params$domarch_cols) == 0) {
+     choices <- c("None")
+     vals <- "None"
+     selected <- "None"
+ } else {
+     vals <- params$domarch_cols
+     choices <- substring(vals, first = 9)
+     selected <- if (length(choices) >= 2) choices[1:2] else choices[1]
+ }
+
+ if (is.null(params$DA_Col)) {
+     da_col <- vals[[1]]
+ } else {
+     da_col <- params$DA_Col
+ }
+
+ plot_data <- params$DA_Prot
+ plot_data[[da_col]] <- str_replace_all(plot_data[[da_col]], " ", "_")
+ createWordCloudElement(plot_data, colname = da_col, cutoff = params$DACutoff, UsingRowsCutoff = F)
+ ```
+
+ ### Interproscan Visualization
+
+ ```{r, echo=FALSE}
+ params$da_interproscan_visualization
+ ```
+
+ ### UpSet Plot
+
+ ```{r, echo=FALSE}
+ upset_plot_data <- params$DA_Prot
+ upset_plot_data[[da_col]] <- str_replace_all(upset_plot_data[[da_col]], " ", "_")
+ final_plot <- plotUpSet(upset_plot_data, colname = da_col, cutoff = params$DACutoff)
+ domains <- upset_plot_data %>%
+     dplyr::pull(da_col)
+ n_unique_domains <- domains %>%
+     unique() %>%
+     length()
+
+ if (n_unique_domains > 1) {
+     final_plot
+ } else {
+     stringr::str_glue("UpSet plot requires more than 1 unique domain (only {n_unique_domains} present). Try selecting more proteins.")
+ }
+ ```
+
+ ## Phylogeny {.tabset}
+
+ Visualizations of phyletic patterns, sequence similarity, and evolution of related proteins.
+ +### Sunburst + +```{r, echo=FALSE} +lineage_sunburst_report(params$phylo_sunburst, lineage_column = "Lineage_long_na", type = "sunburst", levels = params$phylo_sunburst_levels) +``` + +### Tree + +```{r, include=FALSE} +if (is.null(params$tree_msa_tool)) { + tree_msa_tool <- 'ClustalO' ## initialize with default value + } else { + tree_msa_tool <- params$tree_msa_tool + } + +if (is.null(params$msa_rep_num)) { + msa_rep_num <- 3 ## initialize with default value + } else { + msa_rep_num <- params$msa_rep_num + } +``` + +```{r, include=FALSE} +if(length(params$repAccNums) >= 3 ) { + rep <- params$repAccNums[1:msa_rep_num] + if (!params$rval_phylo) { + seqs <- readAAStringSet(params$app_data@fasta_path) + names(seqs) <- sub(" .*", "", names(seqs)) + query_accession <- params$app_data@df %>% filter(!duplicated(QueryName)) + query_accession <- unique(query_accession$Query) + query <- seqs[query_accession] + names(query) <- unique(params$app_data@df$QueryName) + query <- AAStringSet(query) + } + # Generate Fasta File + rep_fasta_path <- tempfile() + + acc2FA(rep, outpath = rep_fasta_path, "sequential") + rename_fasta(rep_fasta_path, rep_fasta_path, + replacement_function = mapAcc2Name, + acc2name = params$acc2Name + ) + if (!params$rval_phylo) { + writeXStringSet(query, rep_fasta_path, append = TRUE) + } + rep_msa_path <- tempfile() + alignFasta(rep_fasta_path, tree_msa_tool, rep_msa_path) +} +``` + +```{r, echo=FALSE} +library(msa) +library(ape) +library(tidytree) +library(ggtree) +if(length(params$repAccNums) >= 3 ) { + seqTree(fasta_filepath = rep_msa_path) + } else {"Not enough representative sequences: try changing the 'Reduce By' field."} +``` + +### MSA + +```{r, include=FALSE} +if(length(params$repAccNums) >= 3 ) { + if (!params$rval_phylo) { + seqs <- readAAStringSet(params$app_data@fasta_path) + names(seqs) <- sub(" .*", "", names(seqs)) + query_accession <- params$app_data@df %>% filter(!duplicated(QueryName)) + query_accession <- unique(query_accession$Query) + query <- seqs[query_accession] + # names(query) <- params$PhyloSelect + query <- AAStringSet(query) + } + + # Generate Fasta File + rep_fasta_path <- tempfile() + acc2FA(rep, outpath = rep_fasta_path, "sequential") + rename_fasta(rep_fasta_path, rep_fasta_path, + replacement_function = mapAcc2Name, + acc2name = params$acc2Name + ) + if (!params$rval_phylo) { + writeXStringSet(query, rep_fasta_path, append = TRUE) + } + + # Call MSA2PDF + msa_pdf_path <- tempfile() + msa_parent_dir <- dirname(params$app_data@fasta_path) + msa_subdir_path <- file.path(msa_parent_dir, "msa_figs") + if (!dir.exists(msa_subdir_path)) { + dir.create(msa_subdir_path, recursive = TRUE) + message("Subdirectory 'msa_figs' created at: ", msa_subdir_path) + } else { + message("Subdirectory 'msa_figs' already exists at: ", msa_subdir_path) + } + msa_prefix <- paste0(msa_subdir_path, "/") + post_fix <- paste("msa", params$query_pin, params$PhyloSelect, params$msa_reduce_by, ".pdf", sep = "_") + + msa_pdf_path <- paste0(msa_prefix, post_fix) + + createMSA_PDF(fasta_path = rep_fasta_path, msa_pdf_path) + } +``` + +```{r msa_pdf, echo = FALSE, out.width = "95%", out.height = "800px"} +# apt-get install texlive +if(length(params$repAccNums) >= 3 ) { + knitr::include_graphics(msa_pdf_path) + } else {"Not enough representative sequences: try changing the 'Reduce By' field."} +``` diff --git a/man/combineFiles.Rd b/man/combineFiles.Rd index 81464fa6..afbc6c5f 100644 --- a/man/combineFiles.Rd +++ b/man/combineFiles.Rd @@ -20,7 +20,7 @@ The search is recursive 
(i.e., it will look in subdirectories as well).} Default is "*full_analysis.tsv".} \item{delim}{Character. The delimiter used in the input files. -Default is tab ("\t").} +Default is tab \code{"\t"}.} \item{skip}{Integer. The number of lines to skip at the beginning of each file. Default is 0.} diff --git a/man/getTopAccByLinDomArch.Rd b/man/getTopAccByLinDomArch.Rd index 0eeb0610..9e91d606 100644 --- a/man/getTopAccByLinDomArch.Rd +++ b/man/getTopAccByLinDomArch.Rd @@ -6,7 +6,7 @@ \usage{ getTopAccByLinDomArch( infile_full, - DA_col = "DomArch.Pfam", + DA_col = "DomArch.PfamA", lin_col = "Lineage_short", n = 20, query