Ekin-Kahraman
diff --git a/results/figures/pathway_diagram.png b/results/figures/pathway_diagram.png
-7 Bytes
diff --git a/results/figures/qc_library_size.png b/results/figures/qc_library_size.png
-1.9 KB
diff --git a/results/figures/volcano_plot.png b/results/figures/volcano_plot.png
1.1 KB
diff --git a/scripts/000_install_dependencies.R b/scripts/000_install_dependencies.R
Lines changed: 30 additions & 0 deletions
diff --git a/scripts/00_get_data.R b/scripts/00_get_data.R
Lines changed: 6 additions & 14 deletions
diff --git a/scripts/01_qc.R b/scripts/01_qc.R
Lines changed: 9 additions & 13 deletions
diff --git a/scripts/02_pca.R b/scripts/02_pca.R
Lines changed: 4 additions & 0 deletions
diff --git a/scripts/03_deseq2.R b/scripts/03_deseq2.R
Lines changed: 8 additions & 7 deletions
diff --git a/scripts/04_visualisation_volcano.R b/scripts/04_visualisation_volcano.R
Lines changed: 8 additions & 10 deletions
@@ -0,0 +1,30 @@
+#!/usr/bin/env Rscript
+# Install every package the analysis pipeline needs, then verify the result.
+
+message("Checking and installing required packages...\n")
+
+cran_repo <- "https://cloud.r-project.org"
+# BiocManager is distributed through CRAN, so it is installed with this group.
+cran_pkgs <- c("ggplot2", "dplyr", "pheatmap", "RColorBrewer", "ggrepel",
+               "scales", "BiocManager")
+bioc_pkgs <- c("DESeq2", "edgeR", "GEOquery", "clusterProfiler",
+               "org.Hs.eg.db", "enrichplot")
+
+for (pkg in c(cran_pkgs, bioc_pkgs)) {
+  if (requireNamespace(pkg, quietly = TRUE)) next
+  message("Installing ", pkg, "...")
+  if (pkg %in% cran_pkgs) {
+    install.packages(pkg, repos = cran_repo)
+  } else {
+    BiocManager::install(pkg, update = FALSE, ask = FALSE)
+  }
+}
+
+# install.packages() only warns when an install fails, so re-check every
+# package before reporting success.
+all_pkgs <- c(cran_pkgs, bioc_pkgs)
+ok <- vapply(all_pkgs, requireNamespace, logical(1), quietly = TRUE)
+if (!all(ok)) {
+  stop("Failed to install: ", toString(all_pkgs[!ok]), call. = FALSE)
+}
+message("\nAll packages installed. Run: source('run_all.R')")
@@ -6,75 +6,67 @@ library(GEOquery)
 
 GEO_ID <- "GSE152075"
 
-# Setup directories
 dir.create("data/raw", recursive = TRUE, showWarnings = FALSE)
 dir.create("data", recursive = TRUE, showWarnings = FALSE)
 
 message("Downloading ", GEO_ID)
 
-# Download supplementary files
 getGEOSuppFiles(GEO_ID, makeDirectory = TRUE, baseDir = "data/raw")
 
-# Extract archives
 supp_dir <- file.path("data/raw", GEO_ID)
 tar_files <- list.files(supp_dir, pattern = "\\.tar$", full.names = TRUE)
 invisible(lapply(tar_files, untar, exdir = supp_dir))
 
-# Find count matrix
 files <- list.files(supp_dir, recursive = TRUE, full.names = TRUE)
 count_file <- files[grepl("count.*\\.txt", basename(files), ignore.case = TRUE)][1]
 
 if (is.na(count_file)) stop("Count matrix not found")
 
 message("Reading: ", basename(count_file))
 
-# Load counts with gene IDs as rownames
-raw_counts <- read.table(count_file, header = TRUE, row.names = 1, 
+raw_counts <- read.table(count_file, header = TRUE, row.names = 1,
                          check.names = FALSE, stringsAsFactors = FALSE)
 raw_counts <- as.matrix(raw_counts)
 storage.mode(raw_counts) <- "integer"
 raw_counts <- raw_counts[complete.cases(raw_counts), ]
 
 message(nrow(raw_counts), " genes × ", ncol(raw_counts), " samples")
 
-# Get sample metadata
 gse <- getGEO(GEO_ID, GSEMatrix = TRUE)
 pheno <- pData(gse[[1]])
 
-# Parse SARS-CoV-2 status from characteristics
 positivity <- pheno$characteristics_ch1
 condition <- ifelse(
   grepl("positivity:\\s*pos", positivity, ignore.case = TRUE), "positive",
   ifelse(grepl("positivity:\\s*neg", positivity, ignore.case = TRUE), "negative", NA)
 )
 
-# Extract sample IDs (POS_### or NEG_###) from titles
+known <- !is.na(condition)  # NA status cannot be used in the condition factor built below
+if (!all(known)) warning(sprintf("Dropping %d samples with unknown condition; check GEO metadata format", sum(!known)))
+pheno <- pheno[known, , drop = FALSE]
+condition <- condition[known]
+
 sample_ids <- sub(".*\\b(POS_\\d+|NEG_\\d+)\\b.*", "\\1", pheno$title)
 
-# Build metadata - ensure it stays a dataframe
 metadata <- data.frame(
   sample_id = sample_ids,
   condition = condition,
   row.names = sample_ids,
   stringsAsFactors = FALSE
 )
 
-# Convert condition to factor
 metadata$condition <- factor(metadata$condition, levels = c("negative", "positive"))
 
-# Align counts with metadata
 common <- intersect(colnames(raw_counts), rownames(metadata))
 raw_counts <- raw_counts[, common, drop = FALSE]
 metadata <- metadata[common, , drop = FALSE]
 
-# Verify structure
 stopifnot(is.data.frame(metadata))
 stopifnot("condition" %in% colnames(metadata))
 
 message(sum(metadata$condition == "negative"), " negative, ",
         sum(metadata$condition == "positive"), " positive")
 
-# Save
 saveRDS(raw_counts, "data/counts_raw.rds")
 saveRDS(metadata, "data/metadata.rds")
 
 
@@ -4,16 +4,22 @@
 library(DESeq2)
 library(edgeR)
 library(ggplot2)
+library(scales)
+
+# Verify that both inputs read below exist before doing any work.
+# The message names the missing file and the script to run first.
+for (input_path in c("data/counts_raw.rds", "data/metadata.rds")) {
+  if (file.exists(input_path)) next
+  stop("File not found: ", input_path, ". Run scripts/00_get_data.R first.")
+}
 
 counts <- readRDS("data/counts_raw.rds")
 metadata <- readRDS("data/metadata.rds")
 
-# Align samples
 metadata <- metadata[colnames(counts), , drop = FALSE]
 
 message("Starting QC: ", nrow(counts), " genes, ", ncol(counts), " samples")
 
-# Library size QC
 dir.create("results/figures", recursive = TRUE, showWarnings = FALSE)
 
 qc_data <- data.frame(
@@ -25,57 +31,47 @@ qc_data <- data.frame(
 ggplot(qc_data, aes(condition, library_size)) +
   geom_boxplot(outlier.shape = NA, fill = "grey85") +
   geom_jitter(width = 0.15, alpha = 0.6) +
-  scale_y_log10(labels = scales::comma) +
+  scale_y_log10(labels = comma) +
   labs(title = "Library Size Distribution", y = "Total Reads", x = NULL) +
   theme_classic(base_size = 12)
 
 ggsave("results/figures/qc_library_size.png", width = 6, height = 5, dpi = 300)
 
-# Remove low-depth samples
 keep <- qc_data$library_size > 1e5
 counts <- counts[, keep, drop = FALSE]
 metadata <- metadata[keep, , drop = FALSE]
 
 message(sum(keep), " samples passed QC")
 
-# Balance groups (optional - for cleaner signal)
 set.seed(123)
 n <- 30
-
 pos_samples <- rownames(metadata)[metadata$condition == "positive"]
 neg_samples <- rownames(metadata)[metadata$condition == "negative"]
-
 balanced <- c(
   sample(pos_samples, min(n, length(pos_samples))),
   sample(neg_samples, min(n, length(neg_samples)))
 )
-
 counts <- counts[, balanced, drop = FALSE]
 metadata <- metadata[balanced, , drop = FALSE]
 
-# Clean gene IDs (remove version numbers)
 rownames(counts) <- sub("\\..*", "", rownames(counts))
 
-# Collapse duplicate genes
 if (any(duplicated(rownames(counts)))) {
   n_dup <- sum(duplicated(rownames(counts)))
   counts <- rowsum(counts, rownames(counts))
   message("Collapsed ", n_dup, " duplicates")
 }
 
-# Filter lowly expressed genes (CPM-based)
 keep_genes <- rowSums(cpm(counts) >= 1) >= 10
 counts <- counts[keep_genes, , drop = FALSE]
 
 message(nrow(counts), " genes retained after filtering")
 message("Final: ", sum(metadata$condition == "negative"), " neg, ",
         sum(metadata$condition == "positive"), " pos")
 
-# Save filtered data
 saveRDS(counts, "data/counts_clean.rds")
 saveRDS(metadata, "data/metadata_clean.rds")
 
-# Variance stabilization for PCA
 dds <- DESeqDataSetFromMatrix(counts, metadata, design = ~ condition)
 vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
 saveRDS(vsd, "data/vst_data.rds")
 
@@ -4,6 +4,10 @@
 library(DESeq2)
 library(ggplot2)
 
+if (!file.exists("data/vst_data.rds")) {  # guard for the readRDS() call below
+  stop("File not found: data/vst_data.rds. Run scripts/01_qc.R first.")
+}
+
 vsd <- readRDS("data/vst_data.rds")
 
 pca_data <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
 
@@ -3,23 +3,26 @@
 
 library(DESeq2)
 
+# Verify that both inputs read below exist before doing any work.
+# The message names the missing file and the script to run first.
+for (input_path in c("data/counts_clean.rds", "data/metadata_clean.rds")) {
+  if (file.exists(input_path)) next
+  stop("File not found: ", input_path, ". Run scripts/01_qc.R first.")
+}
+
 counts <- readRDS("data/counts_clean.rds")
 metadata <- readRDS("data/metadata_clean.rds")
 
 message("Testing ", nrow(counts), " genes across ", ncol(counts), " samples")
 
-# Build DESeq2 dataset
 dds <- DESeqDataSetFromMatrix(counts, metadata, design = ~ condition)
 dds$condition <- relevel(dds$condition, ref = "negative")
 
-# Run DESeq2 pipeline
 dds <- DESeq(dds)
 
-# Extract results
 res <- results(dds, contrast = c("condition", "positive", "negative"), alpha = 0.05)
 res <- res[order(res$padj), ]
 
-# Summary
 n_sig <- sum(res$padj < 0.05, na.rm = TRUE)
 n_up <- sum(res$padj < 0.05 & res$log2FoldChange > 1, na.rm = TRUE)
 n_down <- sum(res$padj < 0.05 & res$log2FoldChange < -1, na.rm = TRUE)
@@ -28,11 +31,9 @@ message(n_sig, " significant genes (padj < 0.05)")
 message("  ", n_up, " upregulated (log2FC > 1)")
 message("  ", n_down, " downregulated (log2FC < -1)")
 
-# Save results
 res_df <- as.data.frame(res)
 res_df$gene <- rownames(res_df)
-res_df <- res_df[, c("gene", "baseMean", "log2FoldChange", "lfcSE", 
-                     "stat", "pvalue", "padj")]
+res_df <- res_df[, c("gene", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
 
 dir.create("results/tables", recursive = TRUE, showWarnings = FALSE)
 write.csv(res_df, "results/tables/deseq2_results.csv", row.names = FALSE)
 
@@ -5,9 +5,12 @@ library(ggplot2)
 library(ggrepel)
 library(dplyr)
 
+if (!file.exists("results/tables/deseq2_results.csv")) {  # guard for the read.csv() call below
+  stop("File not found: results/tables/deseq2_results.csv. Run scripts/03_deseq2.R first.")
+}
+
 res <- read.csv("results/tables/deseq2_results.csv", stringsAsFactors = FALSE)
 
-# Classify genes
 res <- res %>%
   filter(!is.na(padj), !is.na(log2FoldChange)) %>%
   mutate(
@@ -19,21 +22,17 @@ res <- res %>%
     )
   )
 
-# Select top genes for labelling
 top_genes <- res %>%
   filter(padj < 0.001, abs(log2FoldChange) > 2) %>%
   arrange(padj) %>%
   head(10)
 
-message("Labelling ", nrow(top_genes), " genes: ", 
-        paste(top_genes$gene, collapse = ", "))
+message("Labelling ", nrow(top_genes), " genes: ", paste(top_genes$gene, collapse = ", "))
 
-# Plot
 ggplot(res, aes(log2FoldChange, -log10(padj))) +
   geom_point(aes(color = sig), alpha = 0.6, size = 1.8) +
   scale_color_manual(
-    values = c("Up" = "#e74c3c", "Down" = "#3498db", 
-               "Marginal" = "#95a5a6", "NS" = "grey80"),
+    values = c("Up" = "#e74c3c", "Down" = "#3498db", "Marginal" = "#95a5a6", "NS" = "grey80"),
     breaks = c("Up", "Down"),
     labels = c("Upregulated", "Downregulated")
   ) +
@@ -50,8 +49,8 @@ ggplot(res, aes(log2FoldChange, -log10(padj))) +
   geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "grey40") +
   labs(
     title = "Differential Expression in SARS-CoV-2 Infection",
-    x = "log₂ Fold Change",
-    y = "−log₁₀ Adjusted P-value",
+    x = "log2 Fold Change",
+    y = "-log10 Adjusted P-value",
     color = NULL
   ) +
   theme_classic(base_size = 13) +
@@ -63,7 +62,6 @@ ggplot(res, aes(log2FoldChange, -log10(padj))) +
 dir.create("results/figures", recursive = TRUE, showWarnings = FALSE)
 ggsave("results/figures/volcano_plot.png", width = 8, height = 7, dpi = 300)
 
-# Save top genes
 write.csv(
   top_genes %>% dplyr::select(gene, log2FoldChange, padj, baseMean),
   "results/tables/top_genes.csv",