@@ -11,14 +11,19 @@ options(pillar.print_max = 100)
11
11
12
12
# Import MSigDB gene sets -----
13
13
14
- # Define MSigDB download variables
14
+ # Set MSigDB version
15
15
mdb_version <- " 2022.1.Hs"
16
+
17
+ # Set HGNC version (last quarterly release before MSigDB release)
18
+ hgnc_version <- " 2022-07-01"
19
+
20
+ # Set MSigDB file paths
16
21
mdb_xml <- glue(" msigdb_v{mdb_version}.xml" )
17
22
mdb_url_base <- " https://data.broadinstitute.org/gsea-msigdb/msigdb"
18
23
mdb_xml_url <- glue(" {mdb_url_base}/release/{mdb_version}/{mdb_xml}" )
19
24
20
25
# Download the MSigDB XML file
21
- options(timeout = 150 )
26
+ options(timeout = 300 )
22
27
download.file(url = mdb_xml_url , destfile = mdb_xml )
23
28
24
29
# Check MSigDB XML file size in bytes
@@ -64,17 +69,41 @@ mdb_tbl <-
64
69
filter(gs_cat != " ARCHIVED" )
65
70
66
71
# Get the number of gene sets per collection (for testing)
67
- msigdb_category_genesets <- mdb_tbl %> %
72
+ mdb_category_genesets <- mdb_tbl %> %
68
73
distinct(gs_cat , gs_subcat , gs_id ) %> %
69
74
count(gs_cat , gs_subcat , name = " n_genesets" )
70
- msigdb_category_genesets
75
+ mdb_category_genesets
71
76
72
77
# Import MSigDB Ensembl mappings -----
73
78
74
- # Download the MSigDB Ensembl mappings
79
+ # Download MSigDB Ensembl mappings
80
+ # Should include all MSigDB genes
75
81
ensembl_url <- glue(" {mdb_url_base}/annotations/human/Human_Ensembl_Gene_ID_MSigDB.v{mdb_version}.chip" )
76
82
ensembl_tbl <- read_tsv(ensembl_url , progress = FALSE , show_col_types = FALSE )
77
- ensembl_tbl <- ensembl_tbl %> % select(human_ensembl_gene = `Probe Set ID` , human_gene_symbol = `Gene Symbol` )
83
+ ensembl_tbl <- distinct(ensembl_tbl , human_ensembl_gene = `Probe Set ID` , human_gene_symbol = `Gene Symbol` )
84
+ ensembl_tbl <- arrange(ensembl_tbl , human_ensembl_gene )
85
+
86
+ # Check for multi-mappers (should be many)
87
+ count(ensembl_tbl , human_ensembl_gene , sort = TRUE )
88
+ count(ensembl_tbl , human_gene_symbol , sort = TRUE )
89
+
90
+ # Import HGNC mappings -----
91
+
92
+ # Download HGNC mappings
93
+ # May not include all MSigDB genes, but there is usually one Ensembl ID per gene
94
+ hgnc_url <- glue(" https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/hgnc_complete_set_{hgnc_version}.txt" )
95
+ hgnc_tbl <- read_tsv(hgnc_url , progress = FALSE , show_col_types = FALSE , guess_max = 10000 )
96
+ hgnc_tbl <- distinct(hgnc_tbl , human_ensembl_gene = ensembl_gene_id , human_entrez_gene = entrez_id )
97
+ hgnc_tbl <- mutate(hgnc_tbl , human_entrez_gene = as.integer(human_entrez_gene ))
98
+
99
+ # Keep only MSigDB Ensembl IDs
100
+ setdiff(hgnc_tbl $ human_ensembl_gene , ensembl_tbl $ human_ensembl_gene ) %> % length()
101
+ hgnc_tbl <- filter(hgnc_tbl , human_ensembl_gene %in% ensembl_tbl $ human_ensembl_gene )
102
+ hgnc_tbl <- arrange(hgnc_tbl , human_ensembl_gene )
103
+
104
+ # Check for multi-mappers (should be few)
105
+ count(hgnc_tbl , human_ensembl_gene , sort = TRUE )
106
+ count(hgnc_tbl , human_entrez_gene , sort = TRUE )
78
107
79
108
# Generate a gene sets table -----
80
109
@@ -84,7 +113,7 @@ msigdbr_genesets <- mdb_tbl %>%
84
113
distinct() %> %
85
114
arrange(gs_name , gs_id )
86
115
87
- if (nrow(msigdbr_genesets ) != sum(msigdb_category_genesets $ n_genesets )) stop()
116
+ if (nrow(msigdbr_genesets ) != sum(mdb_category_genesets $ n_genesets )) stop()
88
117
89
118
# Extract gene set members -----
90
119
@@ -98,7 +127,7 @@ nrow(geneset_genes) %>% prettyNum(big.mark = ",")
98
127
geneset_genes <- filter(geneset_genes , str_detect(gs_members_split , fixed(" ," )))
99
128
nrow(geneset_genes ) %> % prettyNum(big.mark = " ," )
100
129
101
- # Split gene details into columns
130
+ # Split member details into separate columns
102
131
geneset_genes <- geneset_genes %> %
103
132
separate(
104
133
col = gs_members_split ,
@@ -109,21 +138,17 @@ geneset_genes <- geneset_genes %>%
109
138
nrow(geneset_genes ) %> % prettyNum(big.mark = " ," )
110
139
111
140
# Check for any strange patterns
112
- geneset_genes %> %
113
- count(source_gene , sort = TRUE ) %> %
114
- head(10 )
115
- geneset_genes %> %
116
- count(human_gene_symbol , human_entrez_gene , sort = TRUE ) %> %
117
- head(10 )
141
+ count(geneset_genes , source_gene , sort = TRUE )
142
+ count(geneset_genes , human_gene_symbol , human_entrez_gene , sort = TRUE )
118
143
119
144
# Get the number of members per gene set (for testing)
120
145
# Not all members map to unique genes
121
- msigdb_geneset_members <- geneset_genes %> % count(gs_id , name = " n_members" )
122
- msigdb_geneset_members
146
+ mdb_geneset_members <- geneset_genes %> % count(gs_id , name = " n_members" )
147
+ mdb_geneset_members
123
148
124
149
# Confirm that gene set sizes are reasonable
125
- if (min(msigdb_geneset_members $ n_members ) < 5 ) stop()
126
- if (max(msigdb_geneset_members $ n_members ) > 3000 ) stop()
150
+ if (min(mdb_geneset_members $ n_members ) < 5 ) stop()
151
+ if (max(mdb_geneset_members $ n_members ) > 3000 ) stop()
127
152
if (min(geneset_genes $ human_entrez_gene , na.rm = TRUE ) < 1 ) stop()
128
153
129
154
# Skip genes without an Entrez or Ensembl ID
@@ -136,86 +161,91 @@ geneset_genes <- geneset_genes %>%
136
161
distinct(gs_id , source_gene , human_entrez_gene , human_gene_symbol )
137
162
nrow(geneset_genes ) %> % prettyNum(big.mark = " ," )
138
163
139
- # Generate gene IDs -----
164
+ # Add Ensembl IDs to genes without them -----
140
165
141
166
# Split genes based on if they include Ensembl IDs
142
167
# Starting with MSigDB 7.0, Ensembl is the platform annotation authority
143
168
# Add internal gene ID to track both Entrez and Ensembl genes
144
169
# Using Ensembl IDs as IDs for all genes resulted in a larger data file
145
170
geneset_genes_entrez <- geneset_genes %> %
146
171
filter(str_detect(source_gene , " ^ENSG000" , negate = TRUE )) %> %
147
- distinct(gs_id , human_entrez_gene , human_gene_symbol ) %> %
148
- mutate(gene_id = human_entrez_gene ) %> %
149
- arrange(gs_id , gene_id )
172
+ distinct(gs_id , human_entrez_gene , human_gene_symbol )
150
173
geneset_genes_ensembl <- geneset_genes %> %
151
174
filter(str_detect(source_gene , " ^ENSG000" )) %> %
152
175
select(gs_id , human_entrez_gene , human_ensembl_gene = source_gene , human_gene_symbol ) %> %
153
- mutate(human_gene_symbol = if_else(human_gene_symbol == " " , human_ensembl_gene , human_gene_symbol )) %> %
154
- mutate(gene_id = str_replace(human_ensembl_gene , " ENSG000" , " 9" )) %> %
155
- mutate(gene_id = as.integer(gene_id )) %> %
156
- arrange(gs_id , gene_id )
176
+ mutate(human_gene_symbol = if_else(human_gene_symbol == " " , human_ensembl_gene , human_gene_symbol ))
157
177
158
- # Check that the gene IDs are distinct for Entrez and Ensembl tables
159
- intersect(geneset_genes_entrez $ gene_id , geneset_genes_ensembl $ gene_id )
160
-
161
- # Most gene sets should not have only some source genes as Ensembl IDs
178
+ # Very few gene sets should have only some source genes as Ensembl IDs
162
179
intersect(geneset_genes_entrez $ gs_id , geneset_genes_ensembl $ gs_id )
163
180
164
- # Determine unambiguous genes with only one Entrez and Ensembl ID
165
- clean_entrez_genes <- geneset_genes_ensembl %> %
166
- distinct(human_entrez_gene , human_gene_symbol , human_ensembl_gene ) %> %
167
- count(human_entrez_gene ) %> %
168
- filter(n == 1 ) %> %
169
- pull(human_entrez_gene )
170
- length(clean_entrez_genes )
171
-
172
- # Use the Entrez ID for unambiguous genes
173
- geneset_genes_ensembl <- geneset_genes_ensembl %> %
174
- mutate(gene_id = if_else(human_entrez_gene %in% clean_entrez_genes , human_entrez_gene , gene_id )) %> %
175
- arrange(gs_id , gene_id )
176
-
177
181
# Check the number of genes
178
- nrow(geneset_genes_entrez )
179
- n_distinct(geneset_genes_entrez $ gene_id )
182
+ nrow(geneset_genes_entrez ) %> % prettyNum(big.mark = " ," )
180
183
n_distinct(geneset_genes_entrez $ human_gene_symbol )
181
184
n_distinct(geneset_genes_entrez $ human_entrez_gene )
182
- nrow(geneset_genes_ensembl )
183
- n_distinct(geneset_genes_ensembl $ gene_id )
185
+ nrow(geneset_genes_ensembl ) %> % prettyNum(big.mark = " ," )
184
186
n_distinct(geneset_genes_ensembl $ human_gene_symbol )
185
187
n_distinct(geneset_genes_ensembl $ human_ensembl_gene )
186
188
187
189
if (length(setdiff(geneset_genes_entrez $ human_gene_symbol , ensembl_tbl $ human_gene_symbol ))) stop()
188
190
191
+ # Further split genes without Ensembl IDs based on HGNC Ensembl IDs
192
+ geneset_genes_entrez_hgnc <- geneset_genes_entrez %> %
193
+ filter(human_entrez_gene %in% hgnc_tbl $ human_entrez_gene )
194
+ geneset_genes_entrez_ensembl <- geneset_genes_entrez %> %
195
+ filter(! human_entrez_gene %in% hgnc_tbl $ human_entrez_gene )
196
+
189
197
# Add Ensembl IDs to genes without them
190
- geneset_genes_entrez <- left_join(geneset_genes_entrez , ensembl_tbl , by = " human_gene_symbol" )
198
+ geneset_genes_entrez_hgnc <- left_join(geneset_genes_entrez_hgnc , hgnc_tbl , by = " human_entrez_gene" )
199
+ geneset_genes_entrez_ensembl <- left_join(geneset_genes_entrez_ensembl , ensembl_tbl , by = " human_gene_symbol" )
191
200
192
- # Check gene numbers
193
- nrow(geneset_genes_entrez )
194
- n_distinct(geneset_genes_entrez $ human_entrez_gene )
195
- n_distinct(geneset_genes_entrez $ human_gene_symbol )
196
- n_distinct(geneset_genes_entrez $ human_ensembl_gene )
201
+ # Check the number of genes
202
+ nrow(geneset_genes_entrez_hgnc ) %> % prettyNum(big.mark = " ," )
203
+ n_distinct(geneset_genes_entrez_hgnc $ human_entrez_gene )
204
+ n_distinct(geneset_genes_entrez_hgnc $ human_ensembl_gene )
205
+ nrow(geneset_genes_entrez_ensembl ) %> % prettyNum(big.mark = " ," )
206
+ n_distinct(geneset_genes_entrez_ensembl $ human_entrez_gene )
207
+ n_distinct(geneset_genes_entrez_ensembl $ human_ensembl_gene )
208
+
209
+ # Combine different types of genes into a single table
210
+ geneset_genes_clean <-
211
+ bind_rows(geneset_genes_entrez_hgnc , geneset_genes_entrez_ensembl , geneset_genes_ensembl ) %> %
212
+ mutate(gene_id = str_remove(human_ensembl_gene , " ENSG000" )) %> %
213
+ mutate(gene_id = as.integer(gene_id )) %> %
214
+ distinct() %> %
215
+ arrange(gs_id , gene_id )
216
+ nrow(geneset_genes_clean ) %> % prettyNum(big.mark = " ," )
217
+
218
+ # Make internal IDs consecutive
219
+ geneset_genes_clean $ gene_id <- dense_rank(geneset_genes_clean $ gene_id )
220
+ geneset_genes_clean %> %
221
+ count(human_gene_symbol , gene_id ) %> %
222
+ arrange(human_gene_symbol )
223
+ geneset_genes_clean %> %
224
+ count(human_ensembl_gene , gene_id ) %> %
225
+ arrange(human_ensembl_gene )
197
226
198
227
# Generate a gene set members table -----
199
228
200
229
# Combine Entrez and Ensembl genes into a single table
201
- msigdbr_geneset_genes <-
202
- bind_rows(geneset_genes_entrez , geneset_genes_ensembl ) %> %
230
+ msigdbr_geneset_genes <- geneset_genes_clean %> %
203
231
distinct(gs_id , gene_id ) %> %
204
232
arrange(gs_id , gene_id )
205
233
206
- # Check the total number of gene set members
234
+ # Check gene numbers
235
+ nrow(geneset_genes ) %> % prettyNum(big.mark = " ," )
207
236
nrow(msigdbr_geneset_genes ) %> % prettyNum(big.mark = " ," )
208
237
209
238
# Check that all the original gene sets are present
210
- if (length(setdiff(msigdb_geneset_members $ gs_id , msigdbr_geneset_genes $ gs_id )) > 0 ) stop()
239
+ if (length(setdiff(mdb_geneset_members $ gs_id , msigdbr_geneset_genes $ gs_id )) > 0 ) stop()
211
240
212
241
# Check that most of the original gene set members converted to genes
213
- if (nrow(msigdbr_geneset_genes ) < (sum(msigdb_geneset_members $ n_members ) * 0.85 )) stop()
214
- genes_members_ratio = full_join(msigdb_geneset_members , count(msigdbr_geneset_genes , gs_id , name = " n_genes" ), by = " gs_id" )
215
- genes_members_ratio $ ratio = genes_members_ratio $ n_genes / genes_members_ratio $ n_members
242
+ if (nrow(msigdbr_geneset_genes ) < (sum(mdb_geneset_members $ n_members ) * 0.85 )) stop()
243
+ genes_members_ratio <- full_join(mdb_geneset_members , count(msigdbr_geneset_genes , gs_id , name = " n_genes" ), by = " gs_id" )
244
+ genes_members_ratio $ ratio <- genes_members_ratio $ n_genes / genes_members_ratio $ n_members
216
245
if (min(genes_members_ratio $ n_genes ) < 5 ) stop()
217
246
if (max(genes_members_ratio $ n_genes ) > 2300 ) stop()
218
- if (max(genes_members_ratio $ ratio ) > 1 ) stop()
247
+ if (max(genes_members_ratio $ ratio ) > 2 ) stop()
248
+ if (quantile(genes_members_ratio $ ratio , 0.99 ) > 1 ) stop()
219
249
if (quantile(genes_members_ratio $ ratio , 0.001 ) < 0.3 ) stop()
220
250
if (quantile(genes_members_ratio $ ratio , 0.1 ) < 0.7 ) stop()
221
251
if (quantile(genes_members_ratio $ ratio , 0.2 ) < 0.9 ) stop()
@@ -224,14 +254,12 @@ if (quantile(genes_members_ratio$ratio, 0.3) < 0.99) stop()
224
254
# Generate a genes table -----
225
255
226
256
# Extract the unique genes
227
- msigdbr_genes <-
228
- bind_rows(geneset_genes_entrez , geneset_genes_ensembl ) %> %
229
- select(gene_id , human_gene_symbol , human_entrez_gene , human_ensembl_gene ) %> %
230
- distinct() %> %
257
+ msigdbr_genes <- geneset_genes_clean %> %
258
+ distinct(gene_id , human_gene_symbol , human_entrez_gene , human_ensembl_gene ) %> %
231
259
arrange(human_gene_symbol , gene_id )
232
260
233
261
# Check the total number of genes
234
- nrow(msigdbr_genes )
262
+ nrow(msigdbr_genes ) % > % prettyNum( big.mark = " , " )
235
263
236
264
# Prepare package -----
237
265
0 commit comments