tidyomicsHackathonTurku2026/beginners-guide.R at main · tidyomics/tidyomicsHackathonTurku2026 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
## Tidyomics Beginner's Guide -- May 2026
## Examples from https://github.com/tidyomics using real Bioconductor datasets
## See the tidyomics page for links to package vignettes and more tutorials

# Install tidyomics suite if needed:
# BiocManager::install("tidyomics")

# ============================================================
# 1. tidySummarizedExperiment — bulk RNA-seq (airway)
#    Comparison: tidy API vs. base R
# ============================================================

library(airway)
library(tidySummarizedExperiment)

data(airway)
airway

library(tidyprint)
airway |> print()

# Tidy approach: mean counts per gene across treated samples
airway |>
  filter(dex == "trt") |>
  group_by(gene_id) |>
  summarize(mean_count = mean(counts))

# Equivalent base R
treated <- airway[, airway$dex == "trt"]
data.frame(
  gene_id    = rownames(treated),
  mean_count = rowMeans(assay(treated, "counts"))
) |>
  head()

# Now just some basic demos of the capabilities of tidySummarizedExperiment:

# Derive a new colData column from existing metadata
airway |>
  mutate(treated = ifelse(dex == "trt", "yes", "no")) |>
  colData()

# Violin plot of log-expression for the first 1000 genes
subset_symb <- head(rowData(airway)$symbol, 1000)
airway |>
  mutate(log_counts = log10(counts + 1)) |>
  filter(symbol %in% subset_symb) |>
  ggplot(aes(dex, log_counts, fill = dex, group = .sample)) +
  geom_violin() +
  theme_bw()

# ============================================================
# 2. tidybulk — differential expression (airway)
# ============================================================

library(tidybulk)
library(ggplot2)
library(ggrepel)

filt <- airway |>
  identify_abundant(formula_design = ~dex) |>
  scale_abundance(method = "RLE")

de_airway <- filt |>
  test_differential_abundance(method = "DESeq2", .formula = ~cell + dex)

de_airway |>
  filter(padj < .01) |>
  ggplot(aes(log2FoldChange, -log10(padj))) +
  geom_point() +
  theme_bw() +
  coord_cartesian(xlim=c(-4,4),ylim=c(0,30))

# PCA to visualise sample separation by treatment
filt |>
  reduce_dimensions(method = "PCA") |>
  pivot_sample() |>
  ggplot(aes(PC1, PC2, color = dex, label = .sample)) +
  geom_point(size = 3) +
  geom_text_repel() +
  theme_bw()

# ============================================================
# 3. plyranges — ATAC-seq peaks overlapping promoters
#    Data: fluentGenomics workflow package
# ============================================================

library(plyranges)
library(fluentGenomics) # for the peaks
library(TxDb.Hsapiens.UCSC.hg38.knownGene) # for the genes
library(GenomeInfoDb) # for the genome

# ATAC-seq peaks, see fluentGenomics workflow for details
data(peaks)
peaks

# Build promoter windows from UCSC knownGene (GRCh38), standard chromosomes only
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
g <- genes(txdb) |>
  keepStandardChromosomes(pruning.mode = "coarse")

promoters_gr <- g |> promoters(upstream = 2000, downstream = 0)

# We could also perform this via plyranges
promoters_gr2 <- g |>
  anchor_5p() |>
  mutate(width=1) |>
  flank_upstream(width=2000)

all.equal(promoters_gr, promoters_gr2)

library(DFplyr) # for operating on the DataFrame at the end

# Tidy plyranges: count ATAC peaks per gene promoter:
# the flexibility here is that any statistic on the columns could be summarized
peaks |>
  join_overlap_inner(promoters_gr) |>
  group_by(gene_id) |>
  summarize(n_peaks = n()) |>
  pull(n_peaks) |>
  table()

# Alternative: count_overlaps() annotates each promoter directly.
# (magrittr pipe `%>%` is needed in the first line
# for the embedded placeholder `.` on the right side)
library(magrittr)
promoters_gr %>%
  mutate(n_peaks = count_overlaps(., peaks)) |>
  filter(n_peaks > 0) |>
  pull(n_peaks) |>
  table()