Skip to content

Commit 14667ae

Browse files
committed
Update to MSigDB 2022.1
1 parent 04e5dc2 commit 14667ae

File tree

7 files changed

+112
-76
lines changed

7 files changed

+112
-76
lines changed

DESCRIPTION

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Type: Package
22
Package: msigdbr
33
Title: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format
4-
Version: 7.5.1.9001
4+
Version: 2022.1.1
55
Authors@R:
66
person("Igor", "Dolgalev", , "[email protected]", role = c("aut", "cre"),
77
comment = c(ORCID = "0000-0003-4451-126X"))
@@ -19,7 +19,7 @@ BugReports: https://github.com/igordot/msigdbr/issues
1919
Depends:
2020
R (>= 3.4)
2121
Imports:
22-
babelgene,
22+
babelgene (>= 22.9),
2323
dplyr (>= 0.7.0),
2424
magrittr,
2525
rlang (>= 0.4.11),

NEWS.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
# msigdbr 2022.1.1
2+
3+
* Based on MSigDB v2022.1 release.
4+
* Not on CRAN.
5+
16
# msigdbr 7.5.1
27

38
* Based on MSigDB v7.5.1 release.

R/sysdata.rda

-105 KB
Binary file not shown.

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
[![CRAN](https://www.r-pkg.org/badges/version/msigdbr)](https://cran.r-project.org/package=msigdbr)
44
[![CRAN downloads](https://cranlogs.r-pkg.org/badges/last-month/msigdbr)](https://cran.r-project.org/package=msigdbr)
55
[![R-CMD-check](https://github.com/igordot/msigdbr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/igordot/msigdbr/actions/workflows/R-CMD-check.yaml)
6-
[![codecov](https://codecov.io/gh/igordot/msigdbr/branch/master/graph/badge.svg?token=nYFPe9mc0Q)](https://codecov.io/gh/igordot/msigdbr)
6+
[![codecov](https://codecov.io/gh/igordot/msigdbr/branch/master/graph/badge.svg?token=nYFPe9mc0Q)](https://app.codecov.io/gh/igordot/msigdbr)
77

88
## Overview
99

data-raw/msigdbr-prepare.R

+94-66
Original file line numberDiff line numberDiff line change
@@ -11,14 +11,19 @@ options(pillar.print_max = 100)
1111

1212
# Import MSigDB gene sets -----
1313

14-
# Define MSigDB download variables
14+
# Set MSigDB version
1515
mdb_version <- "2022.1.Hs"
16+
17+
# Set HGNC version (last quarterly release before MSigDB release)
18+
hgnc_version <- "2022-07-01"
19+
20+
# Set MSigDB file paths
1621
mdb_xml <- glue("msigdb_v{mdb_version}.xml")
1722
mdb_url_base <- "https://data.broadinstitute.org/gsea-msigdb/msigdb"
1823
mdb_xml_url <- glue("{mdb_url_base}/release/{mdb_version}/{mdb_xml}")
1924

2025
# Download the MSigDB XML file
21-
options(timeout = 150)
26+
options(timeout = 300)
2227
download.file(url = mdb_xml_url, destfile = mdb_xml)
2328

2429
# Check MSigDB XML file size in bytes
@@ -64,17 +69,41 @@ mdb_tbl <-
6469
filter(gs_cat != "ARCHIVED")
6570

6671
# Get the number of gene sets per collection (for testing)
67-
msigdb_category_genesets <- mdb_tbl %>%
72+
mdb_category_genesets <- mdb_tbl %>%
6873
distinct(gs_cat, gs_subcat, gs_id) %>%
6974
count(gs_cat, gs_subcat, name = "n_genesets")
70-
msigdb_category_genesets
75+
mdb_category_genesets
7176

7277
# Import MSigDB Ensembl mappings -----
7378

74-
# Download the MSigDB Ensembl mappings
79+
# Download MSigDB Ensembl mappings
80+
# Should include all MSigDB genes
7581
ensembl_url <- glue("{mdb_url_base}/annotations/human/Human_Ensembl_Gene_ID_MSigDB.v{mdb_version}.chip")
7682
ensembl_tbl <- read_tsv(ensembl_url, progress = FALSE, show_col_types = FALSE)
77-
ensembl_tbl <- ensembl_tbl %>% select(human_ensembl_gene = `Probe Set ID`, human_gene_symbol = `Gene Symbol`)
83+
ensembl_tbl <- distinct(ensembl_tbl, human_ensembl_gene = `Probe Set ID`, human_gene_symbol = `Gene Symbol`)
84+
ensembl_tbl <- arrange(ensembl_tbl, human_ensembl_gene)
85+
86+
# Check for multi-mappers (should be many)
87+
count(ensembl_tbl, human_ensembl_gene, sort = TRUE)
88+
count(ensembl_tbl, human_gene_symbol, sort = TRUE)
89+
90+
# Import HGNC mappings -----
91+
92+
# Download HGNC mappings
93+
# May not include all MSigDB genes, but there is usually one Ensembl ID per gene
94+
hgnc_url <- glue("https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/hgnc_complete_set_{hgnc_version}.txt")
95+
hgnc_tbl <- read_tsv(hgnc_url, progress = FALSE, show_col_types = FALSE, guess_max = 10000)
96+
hgnc_tbl <- distinct(hgnc_tbl, human_ensembl_gene = ensembl_gene_id, human_entrez_gene = entrez_id)
97+
hgnc_tbl <- mutate(hgnc_tbl, human_entrez_gene = as.integer(human_entrez_gene))
98+
99+
# Keep only MSigDB Ensembl IDs
100+
setdiff(hgnc_tbl$human_ensembl_gene, ensembl_tbl$human_ensembl_gene) %>% length()
101+
hgnc_tbl <- filter(hgnc_tbl, human_ensembl_gene %in% ensembl_tbl$human_ensembl_gene)
102+
hgnc_tbl <- arrange(hgnc_tbl, human_ensembl_gene)
103+
104+
# Check for multi-mappers (should be few)
105+
count(hgnc_tbl, human_ensembl_gene, sort = TRUE)
106+
count(hgnc_tbl, human_entrez_gene, sort = TRUE)
78107

79108
# Generate a gene sets table -----
80109

@@ -84,7 +113,7 @@ msigdbr_genesets <- mdb_tbl %>%
84113
distinct() %>%
85114
arrange(gs_name, gs_id)
86115

87-
if (nrow(msigdbr_genesets) != sum(msigdb_category_genesets$n_genesets)) stop()
116+
if (nrow(msigdbr_genesets) != sum(mdb_category_genesets$n_genesets)) stop()
88117

89118
# Extract gene set members -----
90119

@@ -98,7 +127,7 @@ nrow(geneset_genes) %>% prettyNum(big.mark = ",")
98127
geneset_genes <- filter(geneset_genes, str_detect(gs_members_split, fixed(",")))
99128
nrow(geneset_genes) %>% prettyNum(big.mark = ",")
100129

101-
# Split gene details into columns
130+
# Split member details into separate columns
102131
geneset_genes <- geneset_genes %>%
103132
separate(
104133
col = gs_members_split,
@@ -109,21 +138,17 @@ geneset_genes <- geneset_genes %>%
109138
nrow(geneset_genes) %>% prettyNum(big.mark = ",")
110139

111140
# Check for any strange patterns
112-
geneset_genes %>%
113-
count(source_gene, sort = TRUE) %>%
114-
head(10)
115-
geneset_genes %>%
116-
count(human_gene_symbol, human_entrez_gene, sort = TRUE) %>%
117-
head(10)
141+
count(geneset_genes, source_gene, sort = TRUE)
142+
count(geneset_genes, human_gene_symbol, human_entrez_gene, sort = TRUE)
118143

119144
# Get the number of members per gene set (for testing)
120145
# Not all members map to unique genes
121-
msigdb_geneset_members <- geneset_genes %>% count(gs_id, name = "n_members")
122-
msigdb_geneset_members
146+
mdb_geneset_members <- geneset_genes %>% count(gs_id, name = "n_members")
147+
mdb_geneset_members
123148

124149
# Confirm that gene set sizes are reasonable
125-
if (min(msigdb_geneset_members$n_members) < 5) stop()
126-
if (max(msigdb_geneset_members$n_members) > 3000) stop()
150+
if (min(mdb_geneset_members$n_members) < 5) stop()
151+
if (max(mdb_geneset_members$n_members) > 3000) stop()
127152
if (min(geneset_genes$human_entrez_gene, na.rm = TRUE) < 1) stop()
128153

129154
# Skip genes without an Entrez or Ensembl ID
@@ -136,86 +161,91 @@ geneset_genes <- geneset_genes %>%
136161
distinct(gs_id, source_gene, human_entrez_gene, human_gene_symbol)
137162
nrow(geneset_genes) %>% prettyNum(big.mark = ",")
138163

139-
# Generate gene IDs -----
164+
# Add Ensembl IDs to genes without them -----
140165

141166
# Split genes based on whether they include Ensembl IDs
142167
# Starting with MSigDB 7.0, Ensembl is the platform annotation authority
143168
# Add internal gene ID to track both Entrez and Ensembl genes
144169
# Using Ensembl IDs as IDs for all genes resulted in a larger data file
145170
geneset_genes_entrez <- geneset_genes %>%
146171
filter(str_detect(source_gene, "^ENSG000", negate = TRUE)) %>%
147-
distinct(gs_id, human_entrez_gene, human_gene_symbol) %>%
148-
mutate(gene_id = human_entrez_gene) %>%
149-
arrange(gs_id, gene_id)
172+
distinct(gs_id, human_entrez_gene, human_gene_symbol)
150173
geneset_genes_ensembl <- geneset_genes %>%
151174
filter(str_detect(source_gene, "^ENSG000")) %>%
152175
select(gs_id, human_entrez_gene, human_ensembl_gene = source_gene, human_gene_symbol) %>%
153-
mutate(human_gene_symbol = if_else(human_gene_symbol == "", human_ensembl_gene, human_gene_symbol)) %>%
154-
mutate(gene_id = str_replace(human_ensembl_gene, "ENSG000", "9")) %>%
155-
mutate(gene_id = as.integer(gene_id)) %>%
156-
arrange(gs_id, gene_id)
176+
mutate(human_gene_symbol = if_else(human_gene_symbol == "", human_ensembl_gene, human_gene_symbol))
157177

158-
# Check that the gene IDs are distinct for Entrez and Ensembl tables
159-
intersect(geneset_genes_entrez$gene_id, geneset_genes_ensembl$gene_id)
160-
161-
# Most gene sets should not have only some source genes as Ensembl IDs
178+
# Very few gene sets should have only some source genes as Ensembl IDs
162179
intersect(geneset_genes_entrez$gs_id, geneset_genes_ensembl$gs_id)
163180

164-
# Determine unambiguous genes with only one Entrez and Ensembl ID
165-
clean_entrez_genes <- geneset_genes_ensembl %>%
166-
distinct(human_entrez_gene, human_gene_symbol, human_ensembl_gene) %>%
167-
count(human_entrez_gene) %>%
168-
filter(n == 1) %>%
169-
pull(human_entrez_gene)
170-
length(clean_entrez_genes)
171-
172-
# Use the Entrez ID for unambiguous genes
173-
geneset_genes_ensembl <- geneset_genes_ensembl %>%
174-
mutate(gene_id = if_else(human_entrez_gene %in% clean_entrez_genes, human_entrez_gene, gene_id)) %>%
175-
arrange(gs_id, gene_id)
176-
177181
# Check the number of genes
178-
nrow(geneset_genes_entrez)
179-
n_distinct(geneset_genes_entrez$gene_id)
182+
nrow(geneset_genes_entrez) %>% prettyNum(big.mark = ",")
180183
n_distinct(geneset_genes_entrez$human_gene_symbol)
181184
n_distinct(geneset_genes_entrez$human_entrez_gene)
182-
nrow(geneset_genes_ensembl)
183-
n_distinct(geneset_genes_ensembl$gene_id)
185+
nrow(geneset_genes_ensembl) %>% prettyNum(big.mark = ",")
184186
n_distinct(geneset_genes_ensembl$human_gene_symbol)
185187
n_distinct(geneset_genes_ensembl$human_ensembl_gene)
186188

187189
if (length(setdiff(geneset_genes_entrez$human_gene_symbol, ensembl_tbl$human_gene_symbol))) stop()
188190

191+
# Further split genes without Ensembl IDs based on HGNC Ensembl IDs
192+
geneset_genes_entrez_hgnc <- geneset_genes_entrez %>%
193+
filter(human_entrez_gene %in% hgnc_tbl$human_entrez_gene)
194+
geneset_genes_entrez_ensembl <- geneset_genes_entrez %>%
195+
filter(!human_entrez_gene %in% hgnc_tbl$human_entrez_gene)
196+
189197
# Add Ensembl IDs to genes without them
190-
geneset_genes_entrez <- left_join(geneset_genes_entrez, ensembl_tbl, by = "human_gene_symbol")
198+
geneset_genes_entrez_hgnc <- left_join(geneset_genes_entrez_hgnc, hgnc_tbl, by = "human_entrez_gene")
199+
geneset_genes_entrez_ensembl <- left_join(geneset_genes_entrez_ensembl, ensembl_tbl, by = "human_gene_symbol")
191200

192-
# Check gene numbers
193-
nrow(geneset_genes_entrez)
194-
n_distinct(geneset_genes_entrez$human_entrez_gene)
195-
n_distinct(geneset_genes_entrez$human_gene_symbol)
196-
n_distinct(geneset_genes_entrez$human_ensembl_gene)
201+
# Check the number of genes
202+
nrow(geneset_genes_entrez_hgnc) %>% prettyNum(big.mark = ",")
203+
n_distinct(geneset_genes_entrez_hgnc$human_entrez_gene)
204+
n_distinct(geneset_genes_entrez_hgnc$human_ensembl_gene)
205+
nrow(geneset_genes_entrez_ensembl) %>% prettyNum(big.mark = ",")
206+
n_distinct(geneset_genes_entrez_ensembl$human_entrez_gene)
207+
n_distinct(geneset_genes_entrez_ensembl$human_ensembl_gene)
208+
209+
# Combine different types of genes into a single table
210+
geneset_genes_clean <-
211+
bind_rows(geneset_genes_entrez_hgnc, geneset_genes_entrez_ensembl, geneset_genes_ensembl) %>%
212+
mutate(gene_id = str_remove(human_ensembl_gene, "ENSG000")) %>%
213+
mutate(gene_id = as.integer(gene_id)) %>%
214+
distinct() %>%
215+
arrange(gs_id, gene_id)
216+
nrow(geneset_genes_clean) %>% prettyNum(big.mark = ",")
217+
218+
# Make internal IDs consecutive
219+
geneset_genes_clean$gene_id <- dense_rank(geneset_genes_clean$gene_id)
220+
geneset_genes_clean %>%
221+
count(human_gene_symbol, gene_id) %>%
222+
arrange(human_gene_symbol)
223+
geneset_genes_clean %>%
224+
count(human_ensembl_gene, gene_id) %>%
225+
arrange(human_ensembl_gene)
197226

198227
# Generate a gene set members table -----
199228

200229
# Extract the distinct gene set and gene ID pairs from the combined genes table
201-
msigdbr_geneset_genes <-
202-
bind_rows(geneset_genes_entrez, geneset_genes_ensembl) %>%
230+
msigdbr_geneset_genes <- geneset_genes_clean %>%
203231
distinct(gs_id, gene_id) %>%
204232
arrange(gs_id, gene_id)
205233

206-
# Check the total number of gene set members
234+
# Check gene numbers
235+
nrow(geneset_genes) %>% prettyNum(big.mark = ",")
207236
nrow(msigdbr_geneset_genes) %>% prettyNum(big.mark = ",")
208237

209238
# Check that all the original gene sets are present
210-
if (length(setdiff(msigdb_geneset_members$gs_id, msigdbr_geneset_genes$gs_id)) > 0) stop()
239+
if (length(setdiff(mdb_geneset_members$gs_id, msigdbr_geneset_genes$gs_id)) > 0) stop()
211240

212241
# Check that most of the original gene set members converted to genes
213-
if (nrow(msigdbr_geneset_genes) < (sum(msigdb_geneset_members$n_members) * 0.85)) stop()
214-
genes_members_ratio = full_join(msigdb_geneset_members, count(msigdbr_geneset_genes, gs_id, name = "n_genes"), by = "gs_id")
215-
genes_members_ratio$ratio = genes_members_ratio$n_genes / genes_members_ratio$n_members
242+
if (nrow(msigdbr_geneset_genes) < (sum(mdb_geneset_members$n_members) * 0.85)) stop()
243+
genes_members_ratio <- full_join(mdb_geneset_members, count(msigdbr_geneset_genes, gs_id, name = "n_genes"), by = "gs_id")
244+
genes_members_ratio$ratio <- genes_members_ratio$n_genes / genes_members_ratio$n_members
216245
if (min(genes_members_ratio$n_genes) < 5) stop()
217246
if (max(genes_members_ratio$n_genes) > 2300) stop()
218-
if (max(genes_members_ratio$ratio) > 1) stop()
247+
if (max(genes_members_ratio$ratio) > 2) stop()
248+
if (quantile(genes_members_ratio$ratio, 0.99) > 1) stop()
219249
if (quantile(genes_members_ratio$ratio, 0.001) < 0.3) stop()
220250
if (quantile(genes_members_ratio$ratio, 0.1) < 0.7) stop()
221251
if (quantile(genes_members_ratio$ratio, 0.2) < 0.9) stop()
@@ -224,14 +254,12 @@ if (quantile(genes_members_ratio$ratio, 0.3) < 0.99) stop()
224254
# Generate a genes table -----
225255

226256
# Extract the unique genes
227-
msigdbr_genes <-
228-
bind_rows(geneset_genes_entrez, geneset_genes_ensembl) %>%
229-
select(gene_id, human_gene_symbol, human_entrez_gene, human_ensembl_gene) %>%
230-
distinct() %>%
257+
msigdbr_genes <- geneset_genes_clean %>%
258+
distinct(gene_id, human_gene_symbol, human_entrez_gene, human_ensembl_gene) %>%
231259
arrange(human_gene_symbol, gene_id)
232260

233261
# Check the total number of genes
234-
nrow(msigdbr_genes)
262+
nrow(msigdbr_genes) %>% prettyNum(big.mark = ",")
235263

236264
# Prepare package -----
237265

tests/testthat/test-msigdbr.R

+9-6
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ test_that("human gene sets overall stats", {
55
msigdbr_hs <- msigdbr()
66
expect_s3_class(msigdbr_hs, "tbl_df")
77
expect_identical(msigdbr_hs, msigdbr(species = "human"))
8-
expect_gt(nrow(msigdbr_hs), 4200000)
8+
expect_gt(nrow(msigdbr_hs), 3900000)
99
expect_identical(colnames(msigdbr_hs)[1:6], c("gs_cat", "gs_subcat", "gs_name", "gene_symbol", "entrez_gene", "ensembl_gene"))
1010
expect_gt(n_distinct(msigdbr_hs$gs_id), 33000)
1111
expect_gt(n_distinct(msigdbr_hs$gene_symbol), 40000)
@@ -14,10 +14,13 @@ test_that("human gene sets overall stats", {
1414
# msigdbr_hs %>% count(gs_id) %>% arrange(n)
1515
expect_equal(min(table(msigdbr_hs$gs_id)), 5)
1616
# msigdbr_hs %>% count(gs_id) %>% arrange(desc(n))
17-
expect_lt(max(table(msigdbr_hs$gs_id)), 2900)
18-
expect_lt(quantile(table(msigdbr_hs$gs_id), 0.995), 2000)
19-
expect_gt(quantile(table(msigdbr_hs$gs_id), 0.9), 100)
20-
expect_gt(quantile(table(msigdbr_hs$gs_id), 0.5), 50)
17+
expect_lt(max(table(msigdbr_hs$gs_id)), 2400)
18+
expect_lt(quantile(table(msigdbr_hs$gs_id), 0.999), 2000)
19+
expect_lt(quantile(table(msigdbr_hs$gs_id), 0.98), 1000)
20+
expect_lt(quantile(table(msigdbr_hs$gs_id), 0.9), 250)
21+
expect_gt(quantile(table(msigdbr_hs$gs_id), 0.9), 200)
22+
expect_gt(quantile(table(msigdbr_hs$gs_id), 0.5), 40)
23+
expect_gt(quantile(table(msigdbr_hs$gs_id), 0.2), 10)
2124
msigdbr_hs_symbol <- distinct(msigdbr_hs, gs_id, gene_symbol)
2225
# msigdbr_hs_symbol %>% count(gs_id) %>% arrange(desc(n))
2326
expect_gt(nrow(msigdbr_hs_symbol), 3800000)
@@ -170,7 +173,7 @@ test_that("number of genes in specific gene sets", {
170173
expect_gt(nrow(filter(msigdbr_mm, gs_id == "M490")), 50)
171174
expect_lt(nrow(filter(msigdbr_mm, gs_id == "M490")), 60)
172175
# C8: DESCARTES_FETAL_EYE_STROMAL_CELLS
173-
expect_equal(nrow(filter(msigdbr_hs, gs_id == "M40180")), 97)
176+
expect_equal(nrow(filter(msigdbr_hs, gs_id == "M40180")), 95)
174177
expect_equal(nrow(filter(msigdbr_hs_sym, gs_id == "M40180")), 95)
175178
expect_gt(nrow(filter(msigdbr_mm, gs_id == "M40180")), 80)
176179
expect_lt(nrow(filter(msigdbr_mm, gs_id == "M40180")), 95)

vignettes/msigdbr-intro.Rmd

+1-1
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ gsva(gset.idx.list = msigdbr_list, ...)
137137

138138
**Which version of MSigDB was used?**
139139

140-
This package was generated with MSigDB v7.5.1.
140+
This package was generated with MSigDB v2022.1.
141141
The MSigDB version is used as the base of the msigdbr CRAN package version.
142142
You can check the installed version with `packageVersion("msigdbr")`.
143143

0 commit comments

Comments
 (0)