Skip to content

Commit 8e857bc

Browse files
EkinEkin
authored andcommitted
fix: harden pipeline checks
1 parent 7353f39 commit 8e857bc

4 files changed

Lines changed: 96 additions & 22 deletions

File tree

scripts/00_get_data.R

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,19 @@ if (!force_download && file.exists(counts_out) && file.exists(metadata_out)) {
6060
}
6161

6262
sample_ids <- sub(".*\\b(POS_\\d+|NEG_\\d+)\\b.*", "\\1", pheno$title)
63+
sample_ids[!grepl("^(POS|NEG)_\\d+$", sample_ids)] <- NA_character_
64+
65+
# Fall back to GEO accessions if title parsing fails for a subset of records.
66+
if ("geo_accession" %in% colnames(pheno)) {
67+
sample_ids[is.na(sample_ids)] <- pheno$geo_accession[is.na(sample_ids)]
68+
}
69+
70+
if (any(is.na(sample_ids))) {
71+
stop("Could not derive sample IDs for all records from GEO metadata.")
72+
}
73+
if (any(duplicated(sample_ids))) {
74+
stop("Duplicate sample IDs detected in GEO metadata.")
75+
}
6376

6477
metadata <- data.frame(
6578
sample_id = sample_ids,
@@ -71,9 +84,28 @@ if (!force_download && file.exists(counts_out) && file.exists(metadata_out)) {
7184
metadata$condition <- factor(metadata$condition, levels = c("negative", "positive"))
7285

7386
common <- intersect(colnames(raw_counts), rownames(metadata))
87+
if (length(common) == 0) {
88+
stop("No overlapping sample IDs between count matrix and metadata.")
89+
}
7490
raw_counts <- raw_counts[, common, drop = FALSE]
7591
metadata <- metadata[common, , drop = FALSE]
7692

93+
missing_condition <- is.na(metadata$condition)
94+
if (any(missing_condition)) {
95+
warning(
96+
sprintf(
97+
"Dropping %d samples with unknown condition labels after alignment.",
98+
sum(missing_condition)
99+
)
100+
)
101+
metadata <- metadata[!missing_condition, , drop = FALSE]
102+
raw_counts <- raw_counts[, rownames(metadata), drop = FALSE]
103+
}
104+
105+
if (ncol(raw_counts) == 0) {
106+
stop("No samples left after metadata alignment/condition filtering.")
107+
}
108+
77109
stopifnot(is.data.frame(metadata))
78110
stopifnot("condition" %in% colnames(metadata))
79111

scripts/01_qc.R

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@ counts <- readRDS("data/counts_raw.rds")
1717
metadata <- readRDS("data/metadata.rds")
1818

1919
metadata <- metadata[colnames(counts), , drop = FALSE]
20+
if (!all(colnames(counts) == rownames(metadata))) {
21+
stop("Counts and metadata are not aligned by sample ID.")
22+
}
23+
if (any(is.na(metadata$condition))) {
24+
stop("Metadata contains missing condition labels. Re-run scripts/00_get_data.R.")
25+
}
2026

2127
message("Starting QC: ", nrow(counts), " genes, ", ncol(counts), " samples")
2228

@@ -47,9 +53,18 @@ set.seed(123)
4753
n <- 30
4854
pos_samples <- rownames(metadata)[metadata$condition == "positive"]
4955
neg_samples <- rownames(metadata)[metadata$condition == "negative"]
56+
target_n <- min(n, length(pos_samples), length(neg_samples))
57+
58+
if (target_n == 0) {
59+
stop("At least one condition has zero samples after QC filtering.")
60+
}
61+
if (target_n < n) {
62+
message("Using ", target_n, " samples per condition (limited by available data).")
63+
}
64+
5065
balanced <- c(
51-
sample(pos_samples, min(n, length(pos_samples))),
52-
sample(neg_samples, min(n, length(neg_samples)))
66+
sample(pos_samples, target_n),
67+
sample(neg_samples, target_n)
5368
)
5469
counts <- counts[, balanced, drop = FALSE]
5570
metadata <- metadata[balanced, , drop = FALSE]
@@ -76,4 +91,4 @@ dds <- DESeqDataSetFromMatrix(counts, metadata, design = ~ condition)
7691
vsd <- varianceStabilizingTransformation(dds, blind = TRUE)
7792
saveRDS(vsd, "data/vst_data.rds")
7893

79-
message("QC complete")
94+
message("QC complete")

scripts/06_enrichment.R

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,15 @@ message("Running enrichment analysis...")
2121
sig_genes <- res_df %>%
2222
filter(padj < 0.05, abs(log2FoldChange) > 1) %>%
2323
pull(gene)
24+
sig_genes <- unique(sig_genes)
2425

2526
n_up <- sum(res_df$padj < 0.05 & res_df$log2FoldChange > 1, na.rm = TRUE)
2627
n_down <- sum(res_df$padj < 0.05 & res_df$log2FoldChange < -1, na.rm = TRUE)
2728

2829
message(length(sig_genes), " DE genes (", n_up, " up, ", n_down, " down)")
30+
if (length(sig_genes) == 0) {
31+
stop("No DE genes passed padj/log2FC thresholds; enrichment cannot proceed.")
32+
}
2933

3034
gene_map <- tryCatch({
3135
bitr(sig_genes, fromType = "ENSEMBL",
@@ -55,8 +59,8 @@ go_bp <- enrichGO(
5559
)
5660

5761
go_df <- as.data.frame(go_bp)
62+
write.csv(go_df, "results/tables/go_biological_process.csv", row.names = FALSE)
5863
if (nrow(go_df) > 0) {
59-
write.csv(go_df, "results/tables/go_biological_process.csv", row.names = FALSE)
6064
message(" ", nrow(go_df), " GO terms enriched")
6165

6266
p1 <- dotplot(go_bp, showCategory = 20, font.size = 9) +
@@ -65,26 +69,35 @@ if (nrow(go_df) > 0) {
6569
ggsave("results/figures/go_dotplot.png", p1, width = 10, height = 9, dpi = 300)
6670
}
6771

68-
kegg <- enrichKEGG(
69-
gene = gene_map$ENTREZID,
70-
organism = "hsa",
71-
pAdjustMethod = "BH",
72-
pvalueCutoff = 0.05,
73-
qvalueCutoff = 0.1
72+
kegg <- tryCatch(
73+
enrichKEGG(
74+
gene = gene_map$ENTREZID,
75+
organism = "hsa",
76+
pAdjustMethod = "BH",
77+
pvalueCutoff = 0.05,
78+
qvalueCutoff = 0.1
79+
),
80+
error = function(e) {
81+
warning("KEGG enrichment unavailable (network/service issue): ", conditionMessage(e))
82+
NULL
83+
}
7484
)
7585

76-
kegg_df <- as.data.frame(kegg)
77-
if (nrow(kegg_df) > 0) {
78-
kegg_read <- setReadable(kegg, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
79-
kegg_df <- as.data.frame(kegg_read)
80-
write.csv(kegg_df, "results/tables/kegg_pathways.csv", row.names = FALSE)
81-
message(" ", nrow(kegg_df), " KEGG pathways enriched")
82-
83-
p2 <- dotplot(kegg_read, showCategory = 20, font.size = 9) +
84-
ggtitle("KEGG Pathway Enrichment") +
85-
theme(plot.title = element_text(face = "bold", hjust = 0.5))
86-
ggsave("results/figures/kegg_dotplot.png", p2, width = 10, height = 9, dpi = 300)
86+
kegg_df <- data.frame()
87+
if (!is.null(kegg)) {
88+
kegg_df <- as.data.frame(kegg)
89+
if (nrow(kegg_df) > 0) {
90+
kegg_read <- setReadable(kegg, OrgDb = org.Hs.eg.db, keyType = "ENTREZID")
91+
kegg_df <- as.data.frame(kegg_read)
92+
message(" ", nrow(kegg_df), " KEGG pathways enriched")
93+
94+
p2 <- dotplot(kegg_read, showCategory = 20, font.size = 9) +
95+
ggtitle("KEGG Pathway Enrichment") +
96+
theme(plot.title = element_text(face = "bold", hjust = 0.5))
97+
ggsave("results/figures/kegg_dotplot.png", p2, width = 10, height = 9, dpi = 300)
98+
}
8799
}
100+
write.csv(kegg_df, "results/tables/kegg_pathways.csv", row.names = FALSE)
88101

89102
if (nrow(go_df) > 0) {
90103
message("\nTop GO terms:")
@@ -96,4 +109,4 @@ if (nrow(kegg_df) > 0) {
96109
print(head(kegg_df[, c("Description", "p.adjust", "Count")], 5), row.names = FALSE)
97110
}
98111

99-
message("\nEnrichment analysis complete")
112+
message("\nEnrichment analysis complete")

tests/testthat/test-smoke.R

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ test_that("DESeq2 results CSV exists and is well-formed", {
2727
expect_gt(nrow(res), 0)
2828
expect_true(any(!is.na(res$padj)))
2929
expect_true(all(nchar(res$gene) > 0))
30+
31+
pvals <- res$pvalue[!is.na(res$pvalue)]
32+
expect_true(length(pvals) > 0)
33+
expect_true(all(pvals >= 0 & pvals <= 1))
3034
})
3135

3236
test_that("key figures exist", {
@@ -36,3 +40,13 @@ test_that("key figures exist", {
3640
expect_true(file.exists(volcano))
3741
expect_true(file.exists(pca))
3842
})
43+
44+
test_that("key derived tables exist", {
45+
top_genes <- file.path(root, "results/tables/top_genes.csv")
46+
go_terms <- file.path(root, "results/tables/go_biological_process.csv")
47+
kegg_terms <- file.path(root, "results/tables/kegg_pathways.csv")
48+
49+
expect_true(file.exists(top_genes))
50+
expect_true(file.exists(go_terms))
51+
expect_true(file.exists(kegg_terms))
52+
})

0 commit comments

Comments
 (0)