Skip to content

Commit 6fa52a0

Browse files
EkinEkin
authored andcommitted
Add dependency installer for portability
1 parent 5ffc734 commit 6fa52a0

14 files changed

Lines changed: 157 additions & 84 deletions
-7 Bytes
Loading
-1.9 KB
Loading

results/figures/volcano_plot.png

1.1 KB
Loading

scripts/000_install_dependencies.R

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/usr/bin/env Rscript
2+
# Install all required packages for the analysis pipeline
3+
4+
message("Checking and installing required packages...\n")
5+
6+
# CRAN packages
7+
cran_pkgs <- c("ggplot2", "dplyr", "pheatmap", "RColorBrewer", "ggrepel", "scales")
8+
9+
for (pkg in cran_pkgs) {
10+
if (!requireNamespace(pkg, quietly = TRUE)) {
11+
message("Installing ", pkg, "...")
12+
install.packages(pkg, repos = "https://cloud.r-project.org")
13+
}
14+
}
15+
16+
# Bioconductor
17+
if (!requireNamespace("BiocManager", quietly = TRUE)) {
18+
install.packages("BiocManager", repos = "https://cloud.r-project.org")
19+
}
20+
21+
bioc_pkgs <- c("DESeq2", "edgeR", "GEOquery", "clusterProfiler", "org.Hs.eg.db", "enrichplot")
22+
23+
for (pkg in bioc_pkgs) {
24+
if (!requireNamespace(pkg, quietly = TRUE)) {
25+
message("Installing ", pkg, "...")
26+
BiocManager::install(pkg, update = FALSE, ask = FALSE)
27+
}
28+
}
29+
30+
message("\nAll packages installed. Run: source('run_all.R')")

scripts/00_get_data.R

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,75 +6,67 @@ library(GEOquery)
66

77
GEO_ID <- "GSE152075"
88

9-
# Setup directories
109
dir.create("data/raw", recursive = TRUE, showWarnings = FALSE)
1110
dir.create("data", recursive = TRUE, showWarnings = FALSE)
1211

1312
message("Downloading ", GEO_ID)
1413

15-
# Download supplementary files
1614
getGEOSuppFiles(GEO_ID, makeDirectory = TRUE, baseDir = "data/raw")
1715

18-
# Extract archives
1916
supp_dir <- file.path("data/raw", GEO_ID)
2017
tar_files <- list.files(supp_dir, pattern = "\\.tar$", full.names = TRUE)
2118
invisible(lapply(tar_files, untar, exdir = supp_dir))
2219

23-
# Find count matrix
2420
files <- list.files(supp_dir, recursive = TRUE, full.names = TRUE)
2521
count_file <- files[grepl("count.*\\.txt", basename(files), ignore.case = TRUE)][1]
2622

2723
if (is.na(count_file)) stop("Count matrix not found")
2824

2925
message("Reading: ", basename(count_file))
3026

31-
# Load counts with gene IDs as rownames
32-
raw_counts <- read.table(count_file, header = TRUE, row.names = 1,
27+
raw_counts <- read.table(count_file, header = TRUE, row.names = 1,
3328
check.names = FALSE, stringsAsFactors = FALSE)
3429
raw_counts <- as.matrix(raw_counts)
3530
storage.mode(raw_counts) <- "integer"
3631
raw_counts <- raw_counts[complete.cases(raw_counts), ]
3732

3833
message(nrow(raw_counts), " genes × ", ncol(raw_counts), " samples")
3934

40-
# Get sample metadata
4135
gse <- getGEO(GEO_ID, GSEMatrix = TRUE)
4236
pheno <- pData(gse[[1]])
4337

44-
# Parse SARS-CoV-2 status from characteristics
4538
positivity <- pheno$characteristics_ch1
4639
condition <- ifelse(
4740
grepl("positivity:\\s*pos", positivity, ignore.case = TRUE), "positive",
4841
ifelse(grepl("positivity:\\s*neg", positivity, ignore.case = TRUE), "negative", NA)
4942
)
5043

51-
# Extract sample IDs (POS_### or NEG_###) from titles
44+
n_na <- sum(is.na(condition))
45+
if (n_na > 0) {
46+
warning(sprintf("%d samples have unknown condition; check GEO metadata format", n_na))
47+
}
48+
5249
sample_ids <- sub(".*\\b(POS_\\d+|NEG_\\d+)\\b.*", "\\1", pheno$title)
5350

54-
# Build metadata - ensure it stays a dataframe
5551
metadata <- data.frame(
5652
sample_id = sample_ids,
5753
condition = condition,
5854
row.names = sample_ids,
5955
stringsAsFactors = FALSE
6056
)
6157

62-
# Convert condition to factor
6358
metadata$condition <- factor(metadata$condition, levels = c("negative", "positive"))
6459

65-
# Align counts with metadata
6660
common <- intersect(colnames(raw_counts), rownames(metadata))
6761
raw_counts <- raw_counts[, common, drop = FALSE]
6862
metadata <- metadata[common, , drop = FALSE]
6963

70-
# Verify structure
7164
stopifnot(is.data.frame(metadata))
7265
stopifnot("condition" %in% colnames(metadata))
7366

7467
message(sum(metadata$condition == "negative"), " negative, ",
7568
sum(metadata$condition == "positive"), " positive")
7669

77-
# Save
7870
saveRDS(raw_counts, "data/counts_raw.rds")
7971
saveRDS(metadata, "data/metadata.rds")
8072

scripts/01_qc.R

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,22 @@
44
library(DESeq2)
55
library(edgeR)
66
library(ggplot2)
7+
library(scales)
8+
9+
if (!file.exists("data/counts_raw.rds")) {
10+
stop("File not found: data/counts_raw.rds. Run scripts/00_get_data.R first.")
11+
}
12+
if (!file.exists("data/metadata.rds")) {
13+
stop("File not found: data/metadata.rds. Run scripts/00_get_data.R first.")
14+
}
715

816
counts <- readRDS("data/counts_raw.rds")
917
metadata <- readRDS("data/metadata.rds")
1018

11-
# Align samples
1219
metadata <- metadata[colnames(counts), , drop = FALSE]
1320

1421
message("Starting QC: ", nrow(counts), " genes, ", ncol(counts), " samples")
1522

16-
# Library size QC
1723
dir.create("results/figures", recursive = TRUE, showWarnings = FALSE)
1824

1925
qc_data <- data.frame(
@@ -25,57 +31,47 @@ qc_data <- data.frame(
2531
ggplot(qc_data, aes(condition, library_size)) +
2632
geom_boxplot(outlier.shape = NA, fill = "grey85") +
2733
geom_jitter(width = 0.15, alpha = 0.6) +
28-
scale_y_log10(labels = scales::comma) +
34+
scale_y_log10(labels = comma) +
2935
labs(title = "Library Size Distribution", y = "Total Reads", x = NULL) +
3036
theme_classic(base_size = 12)
3137

3238
ggsave("results/figures/qc_library_size.png", width = 6, height = 5, dpi = 300)
3339

34-
# Remove low-depth samples
3540
keep <- qc_data$library_size > 1e5
3641
counts <- counts[, keep, drop = FALSE]
3742
metadata <- metadata[keep, , drop = FALSE]
3843

3944
message(sum(keep), " samples passed QC")
4045

41-
# Balance groups (optional - for cleaner signal)
4246
set.seed(123)
4347
n <- 30
44-
4548
pos_samples <- rownames(metadata)[metadata$condition == "positive"]
4649
neg_samples <- rownames(metadata)[metadata$condition == "negative"]
47-
4850
balanced <- c(
4951
sample(pos_samples, min(n, length(pos_samples))),
5052
sample(neg_samples, min(n, length(neg_samples)))
5153
)
52-
5354
counts <- counts[, balanced, drop = FALSE]
5455
metadata <- metadata[balanced, , drop = FALSE]
5556

56-
# Clean gene IDs (remove version numbers)
5757
rownames(counts) <- sub("\\..*", "", rownames(counts))
5858

59-
# Collapse duplicate genes
6059
if (any(duplicated(rownames(counts)))) {
6160
n_dup <- sum(duplicated(rownames(counts)))
6261
counts <- rowsum(counts, rownames(counts))
6362
message("Collapsed ", n_dup, " duplicates")
6463
}
6564

66-
# Filter lowly expressed genes (CPM-based)
6765
keep_genes <- rowSums(cpm(counts) >= 1) >= 10
6866
counts <- counts[keep_genes, , drop = FALSE]
6967

7068
message(nrow(counts), " genes retained after filtering")
7169
message("Final: ", sum(metadata$condition == "negative"), " neg, ",
7270
sum(metadata$condition == "positive"), " pos")
7371

74-
# Save filtered data
7572
saveRDS(counts, "data/counts_clean.rds")
7673
saveRDS(metadata, "data/metadata_clean.rds")
7774

78-
# Variance stabilization for PCA
7975
dds <- DESeqDataSetFromMatrix(counts, metadata, design = ~ condition)
8076
vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
8177
saveRDS(vsd, "data/vst_data.rds")

scripts/02_pca.R

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
library(DESeq2)
55
library(ggplot2)
66

7+
if (!file.exists("data/vst_data.rds")) {
8+
stop("File not found: data/vst_data.rds. Run scripts/01_qc.R first.")
9+
}
10+
711
vsd <- readRDS("data/vst_data.rds")
812

913
pca_data <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)

scripts/03_deseq2.R

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,26 @@
33

44
library(DESeq2)
55

6+
if (!file.exists("data/counts_clean.rds")) {
7+
stop("File not found: data/counts_clean.rds. Run scripts/01_qc.R first.")
8+
}
9+
if (!file.exists("data/metadata_clean.rds")) {
10+
stop("File not found: data/metadata_clean.rds. Run scripts/01_qc.R first.")
11+
}
12+
613
counts <- readRDS("data/counts_clean.rds")
714
metadata <- readRDS("data/metadata_clean.rds")
815

916
message("Testing ", nrow(counts), " genes across ", ncol(counts), " samples")
1017

11-
# Build DESeq2 dataset
1218
dds <- DESeqDataSetFromMatrix(counts, metadata, design = ~ condition)
1319
dds$condition <- relevel(dds$condition, ref = "negative")
1420

15-
# Run DESeq2 pipeline
1621
dds <- DESeq(dds)
1722

18-
# Extract results
1923
res <- results(dds, contrast = c("condition", "positive", "negative"), alpha = 0.05)
2024
res <- res[order(res$padj), ]
2125

22-
# Summary
2326
n_sig <- sum(res$padj < 0.05, na.rm = TRUE)
2427
n_up <- sum(res$padj < 0.05 & res$log2FoldChange > 1, na.rm = TRUE)
2528
n_down <- sum(res$padj < 0.05 & res$log2FoldChange < -1, na.rm = TRUE)
@@ -28,11 +31,9 @@ message(n_sig, " significant genes (padj < 0.05)")
2831
message(" ", n_up, " upregulated (log2FC > 1)")
2932
message(" ", n_down, " downregulated (log2FC < -1)")
3033

31-
# Save results
3234
res_df <- as.data.frame(res)
3335
res_df$gene <- rownames(res_df)
34-
res_df <- res_df[, c("gene", "baseMean", "log2FoldChange", "lfcSE",
35-
"stat", "pvalue", "padj")]
36+
res_df <- res_df[, c("gene", "baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
3637

3738
dir.create("results/tables", recursive = TRUE, showWarnings = FALSE)
3839
write.csv(res_df, "results/tables/deseq2_results.csv", row.names = FALSE)

scripts/04_visualisation_volcano.R

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,12 @@ library(ggplot2)
55
library(ggrepel)
66
library(dplyr)
77

8+
if (!file.exists("results/tables/deseq2_results.csv")) {
9+
stop("File not found: results/tables/deseq2_results.csv. Run scripts/03_deseq2.R first.")
10+
}
11+
812
res <- read.csv("results/tables/deseq2_results.csv", stringsAsFactors = FALSE)
913

10-
# Classify genes
1114
res <- res %>%
1215
filter(!is.na(padj), !is.na(log2FoldChange)) %>%
1316
mutate(
@@ -19,21 +22,17 @@ res <- res %>%
1922
)
2023
)
2124

22-
# Select top genes for labelling
2325
top_genes <- res %>%
2426
filter(padj < 0.001, abs(log2FoldChange) > 2) %>%
2527
arrange(padj) %>%
2628
head(10)
2729

28-
message("Labelling ", nrow(top_genes), " genes: ",
29-
paste(top_genes$gene, collapse = ", "))
30+
message("Labelling ", nrow(top_genes), " genes: ", paste(top_genes$gene, collapse = ", "))
3031

31-
# Plot
3232
ggplot(res, aes(log2FoldChange, -log10(padj))) +
3333
geom_point(aes(color = sig), alpha = 0.6, size = 1.8) +
3434
scale_color_manual(
35-
values = c("Up" = "#e74c3c", "Down" = "#3498db",
36-
"Marginal" = "#95a5a6", "NS" = "grey80"),
35+
values = c("Up" = "#e74c3c", "Down" = "#3498db", "Marginal" = "#95a5a6", "NS" = "grey80"),
3736
breaks = c("Up", "Down"),
3837
labels = c("Upregulated", "Downregulated")
3938
) +
@@ -50,8 +49,8 @@ ggplot(res, aes(log2FoldChange, -log10(padj))) +
5049
geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "grey40") +
5150
labs(
5251
title = "Differential Expression in SARS-CoV-2 Infection",
53-
x = "log₂ Fold Change",
54-
y = "−log₁₀ Adjusted P-value",
52+
x = "log2 Fold Change",
53+
y = "-log10 Adjusted P-value",
5554
color = NULL
5655
) +
5756
theme_classic(base_size = 13) +
@@ -63,7 +62,6 @@ ggplot(res, aes(log2FoldChange, -log10(padj))) +
6362
dir.create("results/figures", recursive = TRUE, showWarnings = FALSE)
6463
ggsave("results/figures/volcano_plot.png", width = 8, height = 7, dpi = 300)
6564

66-
# Save top genes
6765
write.csv(
6866
top_genes %>% dplyr::select(gene, log2FoldChange, padj, baseMean),
6967
"results/tables/top_genes.csv",

0 commit comments

Comments
 (0)