JRaviLab · Cateline · Oct 20, 2024 · Oct 21, 2024 · Oct 22, 2024 · Oct 22, 2024
diff --git a/case_studies/CARD/Bug-Drug Code.R b/case_studies/CARD/Bug-Drug Code.R
@@ -17,61 +17,120 @@ untar("broadstreet-v3.3.0.tar", exdir = "CARD_data")
 
 
 # Map CARD Short Name
-# Install and Load dplyr
-if (!require("dplyr")) {
-  install.packages("dplyr")
-  library(dplyr)
-} else {
-  library(dplyr)
+# Install and Load dplyr and readr
+packages <- c("dplyr", "readr")
+
+for (pkg in packages) {
+  if (!require(pkg, character.only = TRUE)) {
+    install.packages(pkg)
+    library(pkg, character.only = TRUE)
+  } else {
+    library(pkg, character.only = TRUE)
+  }
+}
+
+# Parse the required files using readr::read_delim
+aro_index <- read_delim("CARD_data/aro_index.tsv", delim = "\t", col_names = TRUE)
+antibiotics_data <- read_delim("CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE)
+pathogens_data <- read_delim("CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE)
+
+
+# Load the packages used to extract pathogen, gene, and drug from 'CARD Short Name'
+library(dplyr)
+library(purrr)
+library(stringr)
+
+# Extract pathogen, gene, drug, and include Protein.Accession from 'CARD Short Name'
+extract_card_info <- function(card_short_name, drug_class, `Protein Accession`, `DNA Accession`) {
+  # Split the CARD Short Name by underscores
+  split_names <- unlist(strsplit(card_short_name, "_"))
+
+  # Initialize variables with defaults
+  pathogen <- NA
+  gene <- NA
+  drug <- drug_class  # Default to Drug Class column
+
+  # Determine the information based on the split names and patterns
+  if (length(split_names) == 1) {
+    # Gene only (single part entry)
+    gene <- split_names[1]
+    pathogen <- "MULTI"  # Assign MULTI as default for pathogen
+  } else if (all(toupper(split_names) == split_names)) {
+    # Gene complex (all uppercase entries)
+    gene <- card_short_name  # Entire entry as gene
+    pathogen <- "MULTI"
+  } else if (length(split_names) == 2) {
+    # Pathogen-Gene scenario
+    pathogen <- split_names[1]
+    gene <- split_names[2]
+  } else if (length(split_names) == 3) {
+    # Pathogen-Gene-Drug scenario
+    pathogen <- split_names[1]
+    gene <- split_names[2]
+    drug <- split_names[3]  # Assign drug from the split entry
+  }
+
+  # If both pathogen and gene are NA, classify as complex gene
+  if (is.na(pathogen) && is.na(gene)) {
+    gene <- card_short_name  # Assign entire CARD Short Name as gene
+    pathogen <- "MULTI"      # Default to MULTI for pathogen
+  }
+
+  # Handle Protein Accession
+  if (is.na(`Protein Accession`) || `Protein Accession` == "") {
+    `Protein Accession` <- `DNA Accession`  # Use DNA Accession if Protein Accession is NA
+  }
+
+  return(list(Pathogen = pathogen, Gene = gene, Drug = drug, Protein_Accession = `Protein Accession`))
 }
 
-#  Read the required files
-aro_index <- read.delim("CARD_data/aro_index.tsv", sep = "\t", header = TRUE)
-antibiotics_data <- read.delim("CARD_data/shortname_antibiotics.tsv", sep = "\t", header = TRUE)
-pathogens_data <- read.delim("CARD_data/shortname_pathogens.tsv", sep = "\t", header = TRUE)
+# Apply the function to the data frame
+resistance_profile_data <- aro_index %>%
+  mutate(extracted_info = pmap(list(`CARD Short Name`, `Drug Class`, `Protein Accession`, `DNA Accession`),
+                               extract_card_info)) %>%
+  unnest_wider(extracted_info)
+
+# View the resulting data frame
+print(resistance_profile_data)
 
+# Define a relative path for saving the data
+output_path <- file.path("CARD_data", "resistance_profile_data.tsv")
 
-# Extract pathogen, gene, drug, and include Protein.Accession from 'CARD.Short.Name'
-aro_index_clean <- aro_index %>%
-  mutate(
-    pathogen = sapply(strsplit(CARD.Short.Name, "_"), `[`, 1),  # Extract pathogen
-    gene = sapply(strsplit(CARD.Short.Name, "_"), `[`, 2),      # Extract gene
-    drug = ifelse(sapply(strsplit(CARD.Short.Name, "_"), length) == 3,   # Extract drug
-                  sapply(strsplit(CARD.Short.Name, "_"), `[`, 3), NA),
-    Protein.Accession = Protein.Accession  # Include the Protein.Accession column
-  )
+# Save resistance_profile_data to the specified path
+write_delim(resistance_profile_data, output_path, delim = "\t")
 
-# Merge aro_index_clean with the antibiotics_data and pathogens_data
-# For merging with antibiotics_data
-merged_data_antibiotics <- left_join(aro_index_clean, antibiotics_data,
-                                     by = c("drug" = "AAC.Abbreviation"))
+# Load data
+resistance_profile_data <- read_delim(output_path, delim = "\t", col_names = TRUE)
+antibiotics_data <- read_delim("case_studies/CARD/CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE)
+pathogens_data <- read_delim("case_studies/CARD/CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE)
 
-# For merging with pathogens_data
-merged_data_pathogens <- left_join(merged_data_antibiotics, pathogens_data,
-                                   by = c("pathogen" = "Abbreviation"))
 
-# View the resulting merged data
-head(merged_data_pathogens)
 
+# Merge the extracted resistance profile data with antibiotics_data on Drug
+merged_data_antibiotics <- left_join(
+  resistance_profile_data,
+  antibiotics_data,
+  by = c("Drug" = "AAC Abbreviation"), # Adjusting for abbreviations between datasets
+  relationship = "many-to-many"
+)
 
-#filter out rows where pathogen is empty
-cleaned_data <- merged_data_pathogens %>%
-  distinct() %>%
-  filter(!is.na(Pathogen)) # Use 'Pathogen' instead of 'pathogen'
-View(cleaned_data)
+# Merge the result with pathogens_data on Pathogen, renaming Pathogen.y to Pathogen_Full_Name
+merged_data_pathogens <- left_join(
+  merged_data_antibiotics,
+  pathogens_data,
+  by = c("Pathogen" = "Abbreviation")
+) %>%
+  rename(Pathogen_Full_Name = Pathogen.y)
 
-# Group by Pathogen, Gene, Drug, and Protein.Accession, then summarize Antibiotic information
-summarized_data <- cleaned_data %>%
-  group_by(Pathogen = Pathogen, Gene = gene, Drug = drug, Protein_Accession = Protein.Accession) %>%
-  summarize(Antibiotic_Info = paste(unique(Molecule), collapse = ", ")) %>%
-  arrange(Pathogen, Gene, Drug, Protein_Accession)
+# Assign "Multi-species" to Pathogen_Full_Name where Pathogen values are "MULTI"
+merged_data_pathogens <- merged_data_pathogens %>%
+  mutate(Pathogen_Full_Name = if_else(Pathogen == "MULTI", "Multi-species", Pathogen_Full_Name))
 
-# Filter for Staphylococcus aureus and DAP (Bug-Drug of Interest)
-staph_aureus_dap_combinations <- summarized_data %>%
-  filter(Pathogen == "Staphylococcus aureus", Drug == "DAP")
 
-# View the filtered data
-head(staph_aureus_dap_combinations)
+# Assign "Multi-class" to Molecule where Drug contains a space or hyphen (i.e., a full drug-class name rather than an abbreviation)
+merged_data_pathogens <- merged_data_pathogens %>%
+  mutate(Molecule = if_else(grepl(" ", Drug) | grepl("-", Drug), "Multi-class", Molecule))
+
 
 #FASTA sequences
 #Install and Load required packages
@@ -90,6 +149,10 @@ library(rentrez)
 library(XML)
 library(stringr)
 
+# Filter for the target drug (DAP) and pathogen (Staphylococcus aureus)
+filtered_data <- merged_data_pathogens %>%
+  filter(Drug == "DAP", Pathogen_Full_Name == "Staphylococcus aureus")
+
 
 # Fetch FASTA sequence from Entrez
 fetch_fasta_sequence <- function(protein_accession) {
@@ -99,19 +162,19 @@ fetch_fasta_sequence <- function(protein_accession) {
                                        id = protein_accession,
                                        rettype = "fasta",
                                        retmode = "text")
-
+    
     if (!is.null(fasta_seq)) {
       # Ensure the first line starts with ">"
       if (!grepl("^>", fasta_seq[1])) {
         fasta_seq[1] <- paste0(">", fasta_seq[1])
       }
-
+      
       # Split the sequence into lines
       lines <- str_split(fasta_seq, "\n")[[1]]
-
+      
       # Join the lines back together
       fasta_seq <- paste(lines, collapse = "\n")
-
+      
       return(fasta_seq)
     } else {
       warning(paste("Failed to retrieve FASTA sequence for protein accession:", protein_accession))
@@ -123,26 +186,54 @@ fetch_fasta_sequence <- function(protein_accession) {
   })
 }
 
-# Loop through staph_aureus_dap_combinations to fetch and save FASTA sequences
+
+# Define the output file for the FASTA sequences
+output_fasta_file <- "Staph_aureus_Daptomycin_sequences.fasta"
-output_fasta_file <- "Staph_aureus_Daptomycin_sequences.fasta"
+output_fasta_file <- "Saur_dap_arg.fasta"
-output_fasta_file <- "Staph_aureus_Daptomycin_sequences.fasta"
+output_fasta_file <- "Saur_dap_arg.fasta"
+
+# Initialize an empty character vector to store the sequences
 combined_sequences <- character()
 
-for (i in 1:nrow(staph_aureus_dap_combinations)) {
-  # Fetch FASTA sequence for each protein accession
-  protein_accession <- staph_aureus_dap_combinations$Protein_Accession[i]
+# Loop through each Protein Accession in the filtered data to fetch sequences
+for (i in 1:nrow(filtered_data)) {
+  # Get the Protein Accession ID
+  protein_accession <- filtered_data$Protein_Accession[i]
+
+  cat("Fetching sequence for Protein Accession:", protein_accession, "\n")  # Debugging message
+
+  # Fetch the FASTA sequence
   fasta_sequence <- fetch_fasta_sequence(protein_accession)
-
+
+  # If the sequence was fetched successfully, add it to the combined_sequences vector
   if (!is.null(fasta_sequence)) {
     combined_sequences <- c(combined_sequences, fasta_sequence)
+    cat("Successfully fetched sequence for:", protein_accession, "\n")
-    cat("Successfully fetched sequence for:", protein_accession, "\n")
+    cat("Successfully fetched sequences for:", protein_accession, "\n")
-    cat("Successfully fetched sequence for:", protein_accession, "\n")
+    cat("Successfully fetched sequences for:", protein_accession, "\n")
+  } else {
+    cat("Failed to fetch sequence for:", protein_accession, "\n")
   }
 }
 
-# Save the combined FASTA sequences
-filename <- "Staph_aureus_Daptomycin_sequences.fasta"
+# Check if there are any fetched sequences
+if (length(combined_sequences) > 0) {
+  # Save all fetched sequences to a FASTA file
+  writeLines(combined_sequences, output_fasta_file)
+  cat("Sequences saved to", output_fasta_file, "\n")
+} else {
+  cat("No sequences were fetched, so no FASTA file was created.\n")
+}
+
+# Read the contents of the file
+fasta_contents <- readLines(output_fasta_file)
+
+# Print the contents
+cat(fasta_contents, sep = "\n")
+
+
+
+
+
+
+
+
 
-writeLines(combined_sequences, filename)
 
-# Read the FASTA file
-fasta_content <- readLines(filename)
 
-# Display the contents
-cat(fasta_content, sep = "\n")
diff --git a/case_studies/CARD/CARD_data/CARD-Download-README.txt b/case_studies/CARD/CARD_data/CARD-Download-README.txt
@@ -1,70 +1,32 @@
-CARD Download README
+# CARD README
+
+## Source:
-## Source:
+## Source
-## Source:
+## Source
+This dataset was downloaded from the Comprehensive Antibiotic Resistance Database (CARD) in 2024-10 at https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2
-This dataset was downloaded from the Comprehensive Antibiotic Resistance Database (CARD) in 2024-10 at https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2
+This dataset and associated README were downloaded from the Comprehensive Antibiotic Resistance Database (CARD) (2024-10) at https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2.
-This dataset was downloaded from the Comprehensive Antibiotic Resistance Database (CARD) in 2024-10 at https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2
+This dataset and associated README were downloaded from the Comprehensive Antibiotic Resistance Database (CARD) (2024-10) at https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2.
 
-Use or reproduction of these materials, in whole or in part, by any commercial 
-organization whether or not for non-commercial (including research) or commercial purposes
-is prohibited, except with written permission of McMaster University. Commercial uses are
-offered only pursuant to a written license and user fee. To obtain permission and begin 
-the licensing process, see http://card.mcmaster.ca/about.
 
 CITATION:
 
 Alcock et al. 2023. "CARD 2023: expanded curation, support for machine learning, and resistome 
 prediction at the Comprehensive Antibiotic Resistance Database" Nucleic Acids Research, 
 51, D690-D699. https://pubmed.ncbi.nlm.nih.gov/36263822/
 
-CARD SHORT NAMES:
-
-A CARD-specific abbreviation for AMR gene names associated with Antibiotic Resistance
-Ontology terms, often not based on the literature. This is used for programmatic and 
-compatibility purposes and is not ontologically relevant. Each ontology term with an 
-associated AMR detection model has a CARD Short Name that appears in CARD data files 
-and output generated by RGI. If the original gene name is less than 15 characters, the 
-CARD short name is identical; if the gene name is greater than 15 characters, the CARD 
-Short Name has been abbreviated by CARD curators specifically to identify the proper 
-gene or protein name. All CARD Short Names are unique and have whitespace characters 
-replaced by underscore characters. The convention for pathogen names is capitalized 
-first letter of the genus followed by the lowercase first three letters of the species 
-name. The antibiotic abbreviations are from https://journals.asm.org/journal/aac/abbreviations
-plus some custom abbreviations by the CARD curators. Simple CARD Short Names often do not
-involve either, e.g. CTX-M-15, but where applicable the CARD Short Names follow pathogen_gene
-or pathogen_gene_drug. The full lists of abbreviations can be found in the enclosed files: 
-
-"shortname_antibiotics.tsv"
-"shortname_pathogens.tsv"
-
-FASTA:
-
-Nucleotide and corresponding protein FASTA downloads are available as separate files for 
-each model type.  For example, the "protein homolog" model type contains sequences of
-antimicrobial resistance genes that do not include mutation as a determinant of resistance
-- these data are appropriate for BLAST analysis of metagenomic data or searches excluding 
-secondary screening for resistance mutations. In contrast, the "protein variant" model 
-includes reference wild type sequences used for mapping SNPs conferring antimicrobial 
-resistance - without secondary mutation screening, analyses using these data will include 
-false positives for antibiotic resistant gene variants or mutants.
-
-MODELS:
+## CARD SHORT NAMES
 
-The file "card.json" contains the complete data for all of CARD's AMR detection models, 
-including reference sequences, SNP mapping data, model parameters, and ARO classification.
-"card.json" is used by the Resistance Gene Identifier software. 
+The CARD database uses standardized abbreviations, known as CARD Short Names, for AMR gene names associated with Antibiotic Resistance Ontology terms. These names are created for compatibility across data files and outputs from the Resistance Gene Identifier (RGI). Short Names for genes with 15 or fewer characters retain the original gene name, while longer names are abbreviated to uniquely represent each gene or protein. All CARD Short Names replace whitespace with underscores. For pathogen names, CARD follows the convention of capitalizing the first letter of the genus followed by the first three letters of the species in lowercase. Where applicable, CARD Short Names adopt formats such as “pathogen_gene,” “pathogen_gene_drug,” or “gene_drug.” Full lists of these abbreviations are available in the provided files:
 
-Values for "High Confidence TB", "Moderate Confidence TB", "Minimal Confidence TB", and
-"Indeterminate Confidence TB" were obtained from https://platform.reseqtb.org.
+shortname_antibiotics.tsv
+shortname_pathogens.tsv
 
-INDEX FILES:
 
-The file "aro_index.tsv" contains a list of ARO tagging of GenBank accessions stored in 
-CARD.
+## FASTA
 
-The file "aro_categories.tsv" contains a list of ARO terms used to categorize all entries
-in CARD and results via the RGI. These categories reflect AMR gene family, target drug 
-class, and mechanism of resistance.
+The FASTA files included here contain retrieved sequences of antimicrobial resistance genes.
 
-The file "aro_categories_index.tsv" contains a list a GenBank accessions stored 
-in CARD cross-referenced with the major categories within the ARO. These categories 
-reflect AMR gene family, target drug class, and mechanism of resistance, so GenBank 
-accessions may have more than one cross-reference. For more complex categorization of 
-the data, use the full ARO available at http://card.mcmaster.ca/download.
+## Data Files Downloaded
-## Data Files Downloaded
+## Data files downloaded
-## Data Files Downloaded
+## Data files downloaded
+aro_index.tsv
-aro_index.tsv
+`aro_index.tsv`
-aro_index.tsv
+`aro_index.tsv`
+This file contains an index of ARO (Antibiotic Resistance Ontology) identifiers with associated GenBank accessions. Each entry includes information used to link antibiotic resistance genes to GenBank sequences.
+shortname_antibiotics.tsv
-shortname_antibiotics.tsv
+`shortname_antibiotics.tsv`
-shortname_antibiotics.tsv
+`shortname_antibiotics.tsv`
+Contains standardized abbreviations for antibiotics used in CARD’s short names. These abbreviations, which follow conventions from the American Society for Microbiology (ASM) and additional custom terms, provide a uniform naming system for antibiotics referenced within CARD data.
 
-The file "snps.txt" lists the SNPs associated with specific detection models.
+shortname_pathogens.tsv
-shortname_pathogens.tsv
+`shortname_pathogens.tsv`
-shortname_pathogens.tsv
+`shortname_pathogens.tsv`
+Lists standardized abbreviations for pathogens used in CARD. Each abbreviation represents pathogen names in a condensed format, commonly the first letter of the genus followed by the first three letters of the species. This abbreviation system simplifies pathogen referencing in CARD outputs.
diff --git a/.../Staph_aureus_Daptomycin_sequences5.fasta → ...D/Staph_aureus_Daptomycin_sequences.fasta b/.../Staph_aureus_Daptomycin_sequences5.fasta → ...D/Staph_aureus_Daptomycin_sequences.fasta