JRaviLab · the-mayer · Aug 7, 2025
diff --git a/R/dcia_from_molevol_scripts.R b/R/dcia_from_molevol_scripts.R
@@ -0,0 +1,233 @@
+#' Plot Protein Length Distribution
+#'
+#' This function plots the distribution of protein lengths from the provided data.
+#' 
+#' @references 
+#' \url{https://journals.asm.org/doi/epub/10.1128/jb.00163-22}
+#' Figure 2A
+#'
+#' @param data A data frame containing protein information.
+#' @param query_data A data frame containing protein query information.
+#' @param superkingdom_filter A character vector specifying the superkingdom(s) to filter the data by. Valid
+#' entries include "bacteria", "virus", "eukaryota", "archaea" or a combination. 
+#' Default is NULL, meaning no filtering.
+#' @param query_filter A character vector specifying one or many Query Proteins.
+#' Default is NULL, meaning no filtering.
+#' @param query_lineage_filter A character vector specifying one or many Query Lineage Proteins.
+#' Default is NULL, meaning no filtering. 
+#' @param show.median Logical. Whether or not to draw a median line on the plot
+#' @param show.percentiles Logical. Whether or not to draw 25th and 75th percentile lines on the plot.
+#'
+#' @return A ggplot object showing the distribution of protein lengths.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' plotProteinLengthDist(data, superkingdom_filter = "bacteria")
+#' }
+
+# Plot protein length distribution as a box plot
+plotProteinLengthDist_byLineage <- function(data, query_data, superkingdom_filter = NULL, query_filter = NULL, query_lineage_filter = NULL, show.median = TRUE, show.percentiles = TRUE) {
+  ## Verify Required Subject Length column (`SLength`) is present in data
+  slength_col_check <- 'SLength' %in% colnames(data)
+  if(slength_col_check == FALSE) {
+    rlang::abort('SLength column is not present in supplied data and is required for this plot.', class = 'data-error')
+    } 
+  ## Create Regex Filters Based on user input
+  ### Superkingdom
+  if(any(!str_detect(superkingdom_filter, regex('bacteria|virus|eukaryota|archaea', ignore_case = TRUE)))) {
+    rlang::abort(message = 'Please filter by "bacteria", "virus", "eukaryota", "archaea" or a combination as a vector')
+    } else {
+      superkingdom_filter_regex <- str_trim(glue::glue_collapse(superkingdom_filter, sep = '|'))
+      }
+  ### Query Protein
+  query_filter_regex <- str_trim(glue::glue_collapse(query_filter, sep = '|'))
+  query_lineage_filter_regex <- str_trim(glue::glue_collapse(query_lineage_filter, sep = '|'))
+
+  ### Query Data Prep
+  query_data_filter <- 
+    query_data %>% 
+    distinct(QueryName, Lineage)
+
+
+  ## Data Prep: 
+  ## - create superkingdom column, parsing from Lineage
+  ## - remove empty or NA Lineage values
+  df <- 
+    data %>% 
+    mutate(superkingdom = str_extract(Lineage, regex('^[^>]+'))) %>% 
+    filter(Lineage != "") %>% 
+    drop_na(Lineage)
+
+  ## Apply Filters: Always remove 'candidatus|candidate' along with any user specified filtering
+  df_filtered <- 
+    df %>%
+    filter(!str_detect(Lineage, regex('candidatus|candidate', ignore_case = TRUE))) %>% 
+    { if(is.null(superkingdom_filter)) . else filter(., str_detect(Lineage, regex(superkingdom_filter_regex, ignore_case = TRUE))) } %>% 
+    { if(is.null(query_filter)) . else filter(., str_detect(QueryName, regex(query_filter_regex, ignore_case = TRUE))) } %>% 
+    distinct(AccNum, .keep_all = TRUE) 
+
+  df_filtered_long <- 
+    df_filtered %>%
+    pivot_longer(c(SLength, SStart, SEnd, QLength, QStart, QEnd), names_to = c("type", 'attribute'), names_sep = 1, values_to = "length") %>% 
+    mutate(attribute = fct_relevel(attribute, c("Length", "Start", "End"))) %>% 
+    filter(attribute == 'Length')
+
+  ## Calculate Stats: Median, 25th and 75th percentiles
+  df_filtered_stats <- 
+    df_filtered_long %>% 
+    {if(is.null(query_lineage_filter)) . else filter(., str_detect(Lineage, regex(query_lineage_filter_regex, ignore_case = TRUE))) } %>% 
+    group_by(type, attribute) %>%
+    mutate(median_length = median(length), 
+           first_quartile_length = quantile(length, c(0.25)), 
+           third_quartile_length = quantile(length, c(0.75))
+          )
+  ## Create Plot
+  if(nrow(df_filtered_long) > 0){
+    ProteinLengthDist <- 
+      df_filtered_long %>% 
+      filter(type == 'S') %>% 
+      ggplot(aes(x=Lineage, y = length, fill = superkingdom)) +
+      geom_boxplot(outlier.alpha = 0.5, outlier.color = "black") +
+      scale_fill_viridis(discrete = TRUE, alpha=0.6, guide = 'none') +
+      labs(y = "Length (in amino acids)", x = "Lineage(s)") +
+      theme_minimal() +
+      theme(text = element_text(size = 18),
+            axis.text.x = element_text(angle = 90, 
+                                       vjust = 0.8, 
+                                       hjust=0
+                                      ),
+            legend.title = element_text(family = "Inter", face = "bold"),
+            legend.text = element_text(family = "Inter"),
+            axis.line = element_line(colour = "black"),
+            axis.ticks = element_line(colour = "black"),
+            panel.grid.major = element_blank(),
+            panel.grid.minor = element_blank(),
+            plot.background = element_rect("white", linewidth = 0),
+            panel.border = element_blank()) +
+      geom_point(data = df_filtered_stats %>% filter(type == 'Q') %>% inner_join(query_data_filter) %>% distinct(), aes(color = QueryName, shape = QueryName), size = 3) +
+      scale_color_brewer(palette = 'Dark2') +
+      scale_x_discrete(position = "top") +
+      guides(color = guide_legend("Query Protein"), shape = guide_legend("Query Protein"))
+    if(show.median & show.percentiles){
+      ProteinLengthDist +
+        geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = median_length), color = "blue") +
+        geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = first_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5) +
+        geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = third_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5)
+      } else if (show.median) {
+        ProteinLengthDist +
+          geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = median_length), color = "blue")
+        } else if (show.percentiles){
+          ProteinLengthDist +
+            geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = first_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5) +
+            geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = third_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5)
+          } else {
+            ProteinLengthDist
+            }
+    } else {
+      rlang::abort(message = 'No data returned with configured Superkingdom/Query filter(s)', class = 'filter-error')
+      }
+}
+
+# Plot protein length distribution as a box plot
+plotProteinLengthDist_byQuery <- function(data, query_data, superkingdom_filter = NULL, query_filter = NULL, query_lineage_filter = NULL, show.median = TRUE, show.percentiles = TRUE) {
+  ## Verify Required Subject Length column (`SLength`) is present in data
+  slength_col_check <- 'SLength' %in% colnames(data)
+  if(slength_col_check == FALSE) {
+    rlang::abort('SLength column is not present in supplied data and is required for this plot.', class = 'data-error')
+    } 
+  ## Create Regex Filters Based on user input
+  ### Superkingdom
+  if(any(!str_detect(superkingdom_filter, regex('bacteria|virus|eukaryota|archaea', ignore_case = TRUE)))) {
+    rlang::abort(message = 'Please filter by "bacteria", "virus", "eukaryota", "archaea" or a combination as a vector')
+    } else {
+      superkingdom_filter_regex <- str_trim(glue::glue_collapse(superkingdom_filter, sep = '|'))
+      }
+  ### Query Protein
+  query_filter_regex <- str_trim(glue::glue_collapse(query_filter, sep = '|'))
+  query_lineage_filter_regex <- str_trim(glue::glue_collapse(query_lineage_filter, sep = '|'))
+
+  ### Query Data Prep
+  query_data_filter <- 
+    query_data %>% 
+    distinct(QueryName, Lineage)
+
+
+  ## Data Prep: 
+  ## - create superkingdom column, parsing from Lineage
+  ## - remove empty or NA Lineage values
+  df <- 
+    data %>% 
+    mutate(superkingdom = str_extract(Lineage, regex('^[^>]+'))) %>% 
+    filter(Lineage != "") %>% 
+    drop_na(Lineage)
+
+  ## Apply Filters: Always remove 'candidatus|candidate' along with any user specified filtering
+  df_filtered <- 
+    df %>%
+    filter(!str_detect(Lineage, regex('candidatus|candidate', ignore_case = TRUE))) %>% 
+    { if(is.null(superkingdom_filter)) . else filter(., str_detect(Lineage, regex(superkingdom_filter_regex, ignore_case = TRUE))) } %>% 
+    { if(is.null(query_filter)) . else filter(., str_detect(QueryName, regex(query_filter_regex, ignore_case = TRUE))) } %>% 
+    distinct(AccNum, .keep_all = TRUE) 
+
+  df_filtered_long <- 
+    df_filtered %>%
+    pivot_longer(c(SLength, SStart, SEnd, QLength, QStart, QEnd), names_to = c("type", 'attribute'), names_sep = 1, values_to = "length") %>% 
+    mutate(attribute = fct_relevel(attribute, c("Length", "Start", "End"))) %>% 
+    filter(attribute == 'Length')
+
+  ## Calculate Stats: Median, 25th and 75th percentiles
+  df_filtered_stats <- 
+    df_filtered_long %>% 
+    {if(is.null(query_lineage_filter)) . else filter(., str_detect(Lineage, regex(query_lineage_filter_regex, ignore_case = TRUE))) } %>% 
+    group_by(type, attribute) %>%
+    mutate(median_length = median(length), 
+           first_quartile_length = quantile(length, c(0.25)), 
+           third_quartile_length = quantile(length, c(0.75))
+          )
+  ## Create Plot
+  if(nrow(df_filtered_long) > 0){
+    ProteinLengthDist <- 
+      df_filtered_long %>% 
+      filter(type == 'S') %>% 
+      ggplot(aes(x=Query, y = length, fill = superkingdom)) +
+      geom_boxplot(outlier.alpha = 0.5, outlier.color = "black") +
+      scale_fill_viridis(discrete = TRUE, alpha=0.6, guide = 'none') +
+      labs(y = "Length (in amino acids)", x = "Query") +
+      theme_minimal() +
+      theme(text = element_text(size = 18),
+            axis.text.x = element_text(angle = 90, 
+                                       vjust = 0.8, 
+                                       hjust=0
+                                      ),
+            legend.title = element_text(family = "Inter", face = "bold"),
+            legend.text = element_text(family = "Inter"),
+            axis.line = element_line(colour = "black"),
+            axis.ticks = element_line(colour = "black"),
+            panel.grid.major = element_blank(),
+            panel.grid.minor = element_blank(),
+            plot.background = element_rect("white", linewidth = 0),
+            panel.border = element_blank()) +
+      # geom_point(data = df_filtered_stats %>% filter(type == 'Q') %>% inner_join(query_data_filter) %>% distinct(), aes(color = QueryName, shape = QueryName), size = 3) +
+      scale_color_brewer(palette = 'Dark2') +
+      scale_x_discrete(position = "top") +
+      guides(color = guide_legend("Query Protein"), shape = guide_legend("Query Protein"))
+    if(show.median & show.percentiles){
+      ProteinLengthDist +
+        geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = median_length), color = "blue") +
+        geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = first_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5) +
+        geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = third_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5)
+      } else if (show.median) {
+        ProteinLengthDist +
+          geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = median_length), color = "blue")
+        } else if (show.percentiles){
+          ProteinLengthDist +
+            geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = first_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5) +
+            geom_hline(data = df_filtered_stats %>% filter(type == 'S'), aes(yintercept = third_quartile_length), linetype = 'dashed', color = "light blue", alpha = 0.5)
+          } else {
+            ProteinLengthDist
+            }
+    } else {
+      rlang::abort(message = 'No data returned with configured Superkingdom/Query filter(s)', class = 'filter-error')
+      }
+}