|
52 | 52 | "| Annotation File | Origin | Table |\n", |
53 | 53 | "|----------------|--------|-------|\n", |
54 | 54 | "| `gencode.v47lift37.basic.annotation.gtf.gz` | [GENCODE Release 47lift37](https://www.gencodegenes.org/human/release_47lift37.html) | - |\n", |
| 55 | + "| `gencode.v47.basic.annotation.gtf.gz` | [GENCODE Release 47](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz) | - |\n", |
55 | 56 | "| `ucsc_gencodev32_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `wgEncodeGencodeCompV32` |\n", |
56 | 57 | "| `ucsc_refseq_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeq` |\n", |
57 | 58 | "| `ucsc_refseq_hist_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeqHistorical` |\n", |
|
69 | 70 | "| `chd_mutation_ctrl.csv` | [Jin et al. 2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S10 |\n", |
70 | 71 | "| `Cosmic_Sample_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", |
71 | 72 | "| `Cosmic_MutantCensus_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n", |
72 | | - "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n", |
73 | | - "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n", |
| 73 | + "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n", |
| 74 | + "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n", |
| 75 | + "\n", |
74 | 76 | "\n", |
75 | 77 | "##### ClinVar Synonymous Matching Features\n", |
76 | 78 | "\n", |
|
101 | 103 | "execution_count": null, |
102 | 104 | "id": "713c7737", |
103 | 105 | "metadata": {}, |
104 | | - "outputs": [], |
| 106 | + "outputs": [ |
| 107 | + { |
| 108 | + "name": "stdout", |
| 109 | + "output_type": "stream", |
| 110 | + "text": [ |
| 111 | + "Reference genomes\n", |
| 112 | + " [skip] reference/hg19/hg19.fa\n", |
| 113 | + " [skip] reference/hg38/hg38.fa\n", |
| 114 | + "GENCODE annotation\n", |
| 115 | + " [skip] reference/gencode.v47lift37.basic.annotation.gtf.gz\n", |
| 116 | + " [skip] reference/gencode.v47.basic.annotation.gtf.gz\n", |
| 117 | + "DDD / ASD variant files\n", |
| 118 | + " [skip] ddd_asd_zhouetal/asd_discov.csv\n", |
| 119 | + " [skip] ddd_asd_zhouetal/asd_rep.csv\n", |
| 120 | + " [skip] ddd_asd_zhouetal/ddd_other.csv\n", |
| 121 | + "ClinVar variant summary\n", |
| 122 | + " [skip] clinvar_syn/variant_summary.txt.gz\n", |
| 123 | + " Downloading → reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt ...\n", |
| 124 | + "phyloP447way conservation scores\n", |
| 125 | + " [skip] reference/hg38.phyloP447way.bw\n", |
| 126 | + "hg19.100way.phyloP100way.bw conservation scores\n", |
| 127 | + " Downloading → reference/hg19.100way.phyloP100way.bw ...\n", |
| 128 | + "UCSC Table Browser downloads\n", |
| 129 | + " [skip] reference/ucsc_gencodev32_hg38.tsv\n", |
| 130 | + " [skip] reference/ucsc_refseq_hg38.tsv\n", |
| 131 | + " [skip] reference/ucsc_refseq_hist_hg38.tsv\n", |
| 132 | + " [skip] reference/ucsc_pliByGene_hg38.tsv\n", |
| 133 | + "\n", |
| 134 | + "Done.\n" |
| 135 | + ] |
| 136 | + } |
| 137 | + ], |
105 | 138 | "source": [ |
106 | 139 | "import gzip\n", |
107 | 140 | "import os\n", |
|
114 | 147 | "\n", |
115 | 148 | "# ── Set data directory ───────────────────────────────────────\n", |
116 | 149 | "DATA_DIR = \"/data/ncbi\" # <-- change this to your preferred data root\n", |
117 | | - "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n", |
| 150 | + "OUTPUT_DIR = \"/data/ncbi/mutation_datasets\" # output directory where all processed datasets will be saved\n", |
118 | 151 | "UCSC_API_KEY = \"\" # <-- set your UCSC API key for Table Browser downloads\n", |
119 | 152 | "# ─────────────────────────────────────────────────────────────\n", |
120 | 153 | "\n", |
|
162 | 195 | "print(\"GENCODE annotation\")\n", |
163 | 196 | "download_file(\n", |
164 | 197 | " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n", |
165 | | - " os.path.join(DATA_DIR, \"gencode.v47lift37.basic.annotation.gtf.gz\"),\n", |
166 | | - " decompress_gz=True,\n", |
| 198 | + " os.path.join(DATA_DIR, \"reference/gencode.v47lift37.basic.annotation.gtf.gz\"),\n", |
| 199 | + " decompress_gz=False,\n", |
| 200 | + ")\n", |
| 201 | + "\n", |
| 202 | + "download_file(\n", |
| 203 | + " \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz\",\n", |
| 204 | + " os.path.join(DATA_DIR, \"reference/gencode.v47.basic.annotation.gtf.gz\"),\n", |
| 205 | + " decompress_gz=False,\n", |
167 | 206 | ")\n", |
168 | 207 | "\n", |
| 208 | + "\n", |
169 | 209 | "# ── 3. DDD / ASD variant files (Zhou et al. 2022, xlsx → csv)\n", |
170 | 210 | "print(\"DDD / ASD variant files\")\n", |
171 | 211 | "xlsx_sources = {\n", |
|
191 | 231 | " os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n", |
192 | 232 | ")\n", |
193 | 233 | "\n", |
194 | | - "# ── 5. phyloP conservation scores ───────────────────────────\n", |
| 234 | + "# ── 5. gnomAD v2.1.1 LoF constraint metrics (by gene) ───────\n", |
| 235 | + "download_file(\n", |
| 236 | + " \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz\",\n", |
| 237 | + " os.path.join(DATA_DIR, \"reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt\"),\n", |
| 238 | + " decompress_gz=True,\n", |
| 239 | + ")\n", |
| 240 | + "\n", |
| 241 | + "\n", |
| 242 | + "# ── 6. phyloP conservation scores ───────────────────────────\n", |
195 | 243 | "print(\"phyloP447way conservation scores\")\n", |
196 | 244 | "download_file(\n", |
197 | 245 | " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n", |
198 | | - " os.path.join(DATA_DIR, \"hg38.phyloP447way.bw\"),\n", |
| 246 | + " os.path.join(DATA_DIR, \"reference/hg38.phyloP447way.bw\"),\n", |
199 | 247 | ")\n", |
200 | 248 | "\n", |
201 | | - "# ── 6. UCSC Table Browser downloads ─────────────────────────\n", |
| 249 | + "print(\"hg19.100way.phyloP100way.bw conservation scores\")\n", |
| 250 | + "download_file(\n", |
| 251 | + " \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw\",\n", |
| 252 | + " os.path.join(DATA_DIR, \"reference/hg19.100way.phyloP100way.bw\"),\n", |
| 253 | + ")\n", |
| 254 | + "\n", |
| 255 | + "# ── 7. UCSC Table Browser downloads ─────────────────────────\n", |
202 | 256 | "UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n", |
203 | 257 | "UCSC_TABLES = {\n", |
204 | 258 | " \"wgEncodeGencodeCompV32\": {\n", |
205 | 259 | " \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n", |
206 | | - " \"subdir\": \"\",\n", |
| 260 | + " \"subdir\": \"reference\",\n", |
207 | 261 | " \"form\": {\n", |
208 | 262 | " \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\", # pragma: allowlist secret\n", |
209 | 263 | " \"clade\": \"mammal\",\n", |
|
220 | 274 | " },\n", |
221 | 275 | " \"ncbiRefSeq\": {\n", |
222 | 276 | " \"filename\": \"ucsc_refseq_hg38.tsv\",\n", |
223 | | - " \"subdir\": \"clinvar_syn\",\n", |
| 277 | + " \"subdir\": \"reference\",\n", |
224 | 278 | " \"form\": {\n", |
225 | 279 | " \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\", # pragma: allowlist secret\n", |
226 | 280 | " \"clade\": \"mammal\",\n", |
|
237 | 291 | " },\n", |
238 | 292 | " \"ncbiRefSeqHistorical\": {\n", |
239 | 293 | " \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n", |
240 | | - " \"subdir\": \"clinvar_syn\",\n", |
| 294 | + " \"subdir\": \"reference\",\n", |
241 | 295 | " \"form\": {\n", |
242 | 296 | " \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\", # pragma: allowlist secret\n", |
243 | 297 | " \"clade\": \"mammal\",\n", |
|
254 | 308 | " },\n", |
255 | 309 | " \"pliByGene\": {\n", |
256 | 310 | " \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n", |
257 | | - " \"subdir\": \"\",\n", |
| 311 | + " \"subdir\": \"reference\",\n", |
258 | 312 | " \"form\": {\n", |
259 | 313 | " \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\", # pragma: allowlist secret\n", |
260 | 314 | " \"clade\": \"mammal\",\n", |
|
311 | 365 | " f.writelines(lines)\n", |
312 | 366 | " print(f\" [done] {os.path.relpath(dest, DATA_DIR)} ({len(lines):,} lines)\")\n", |
313 | 367 | "\n", |
| 368 | + "# ── 8. gnomAD v4.1 VCF files (exomes + genomes, chr1-22, X, Y) ──\n", |
| 369 | + "GNOMAD_S3 = \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf\"\n", |
| 370 | + "GNOMAD_CHROMS = [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]\n", |
| 371 | + "gnomad_datasets = {\n", |
| 372 | + " \"exomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.exomes.v4.1\"),\n", |
| 373 | + " \"genomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.genomes.v4.1\"),\n", |
| 374 | + "}\n", |
| 375 | + "\n", |
| 376 | + "for ds_type, out_dir in gnomad_datasets.items():\n", |
| 377 | + " os.makedirs(out_dir, exist_ok=True)\n", |
| 378 | + " print(f\"gnomAD {ds_type} VCFs\")\n", |
| 379 | + " for chrom in GNOMAD_CHROMS:\n", |
| 380 | + " vcf_name = f\"gnomad.{ds_type}.v4.1.sites.{chrom}.vcf.bgz\"\n", |
| 381 | + " download_file(\n", |
| 382 | + " f\"{GNOMAD_S3}/{ds_type}/{vcf_name}\",\n", |
| 383 | + " os.path.join(out_dir, vcf_name),\n", |
| 384 | + " )\n", |
| 385 | + "\n", |
314 | 386 | "print(\"\\nDone.\")" |
315 | 387 | ] |
316 | 388 | }, |
|
326 | 398 | }, |
327 | 399 | { |
328 | 400 | "cell_type": "code", |
329 | | - "execution_count": null, |
| 401 | + "execution_count": 2, |
330 | 402 | "id": "8741cb10", |
331 | 403 | "metadata": {}, |
332 | | - "outputs": [], |
| 404 | + "outputs": [ |
| 405 | + { |
| 406 | + "name": "stdout", |
| 407 | + "output_type": "stream", |
| 408 | + "text": [ |
| 409 | + "AlphaMissense data\n", |
| 410 | + " Extracting zip → ['science.adg7492_data_captions.pdf', 'science.adg7492_data_s1_to_s4_and_s9.xlsx', 'science.adg7492_data_s5.csv', 'science.adg7492_data_s6.csv', 'science.adg7492_data_s7.csv', 'science.adg7492_data_s8.zip']\n", |
| 411 | + " Renamed science.adg7492_data_s5.csv -> alphamissense_clinvar.csv\n", |
| 412 | + " Renamed science.adg7492_data_s6.csv -> alphamissense_cancer_hotspot.csv\n" |
| 413 | + ] |
| 414 | + } |
| 415 | + ], |
333 | 416 | "source": [ |
334 | 417 | "import zipfile\n", |
335 | 418 | "\n", |
|
385 | 468 | "---" |
386 | 469 | ] |
387 | 470 | }, |
| 471 | + { |
| 472 | + "cell_type": "code", |
| 473 | + "execution_count": 8, |
| 474 | + "id": "e544a031-fec7-4765-8a33-2f26c415b5ac", |
| 475 | + "metadata": {}, |
| 476 | + "outputs": [ |
| 477 | + { |
| 478 | + "name": "stderr", |
| 479 | + "output_type": "stream", |
| 480 | + "text": [ |
| 481 | + "100%|██████████| 68/68 [1:09:44<00:00, 61.53s/it]\n" |
| 482 | + ] |
| 483 | + } |
| 484 | + ], |
| 485 | + "source": [ |
| 486 | + "%run ../data_scripts/check_codon_frequency.py --pretraining_processed_data_dir $DATA_DIR/pretraining/postprocessed/ --data_dir $DATA_DIR" |
| 487 | + ] |
| 488 | + }, |
388 | 489 | { |
389 | 490 | "cell_type": "markdown", |
390 | 491 | "id": "ffb9ba7a", |
|
412 | 513 | "│ ├── 📄 ucsc_gencodev32_hg38.tsv\n", |
413 | 514 | "│ ├── 📄 ucsc_pliByGene_hg38.tsv\n", |
414 | 515 | "│ ├── 📄 hg38.phyloP447way.bw\n", |
| 516 | + "│ ├── 📄 hg19.100way.phyloP100way.bw\n", |
| 517 | + "│ ├── 📄 gnomad.v2.1.1.lof_metrics.by_transcript.txt\n", |
415 | 518 | "│ ├── ucsc_refseq_hg38.tsv\n", |
416 | 519 | "│ ├── ucsc_refseq_hist_hg38.tsv\n", |
417 | 520 | "│ ├── hg19/\n", |
418 | 521 | "│ │ ├── hg19.fa\n", |
419 | | - "│ │ └── hg19.fa.fai\n", |
420 | 522 | "│ └── hg38/\n", |
421 | 523 | "│ ├── hg38.fa\n", |
422 | 524 | "│ └── hg38.fa.fai\n", |
|
435 | 537 | }, |
436 | 538 | { |
437 | 539 | "cell_type": "code", |
438 | | - "execution_count": null, |
| 540 | + "execution_count": 3, |
439 | 541 | "id": "b28b4e2d", |
440 | 542 | "metadata": {}, |
441 | | - "outputs": [], |
| 543 | + "outputs": [ |
| 544 | + { |
| 545 | + "name": "stdout", |
| 546 | + "output_type": "stream", |
| 547 | + "text": [ |
| 548 | + "5 file(s) missing from /data/balvisio/ncbi:\n", |
| 549 | + " ✗ alphamissense_data/AlphaMissense_hg19.tsv.gz\n", |
| 550 | + " ✗ reference/hg19/hg19.fa.fai\n", |
| 551 | + " ✗ reference/hg38/hg38.fa.fai\n", |
| 552 | + " ✗ codon_counts_nopathogen.json\n", |
| 553 | + " ✗ gencode.v47lift37.basic.annotation.processed.tsv\n" |
| 554 | + ] |
| 555 | + }, |
| 556 | + { |
| 557 | + "ename": "FileNotFoundError", |
| 558 | + "evalue": "5 required file(s) missing — see list above.", |
| 559 | + "output_type": "error", |
| 560 | + "traceback": [ |
| 561 | + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", |
| 562 | + "\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)", |
| 563 | + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 27\u001b[39m\n\u001b[32m 25\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m missing:\n\u001b[32m 26\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m ✗ \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m27\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(missing)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required file(s) missing — see list above.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 28\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 29\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAll \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(expected_files)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required files found in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n", |
| 564 | + "\u001b[31mFileNotFoundError\u001b[39m: 5 required file(s) missing — see list above." |
| 565 | + ] |
| 566 | + } |
| 567 | + ], |
442 | 568 | "source": [ |
443 | 569 | "expected_files = [\n", |
444 | 570 | " \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n", |
|
448 | 574 | " \"ddd_asd_zhouetal/asd_rep.csv\",\n", |
449 | 575 | " \"ddd_asd_zhouetal/ddd_other.csv\",\n", |
450 | 576 | " \"clinvar_syn/variant_summary.txt.gz\",\n", |
451 | | - " \"clinvar_syn/ucsc_refseq_hg38.tsv\",\n", |
452 | | - " \"clinvar_syn/ucsc_refseq_hist_hg38.tsv\",\n", |
| 577 | + " \"reference/ucsc_refseq_hg38.tsv\",\n", |
| 578 | + " \"reference/ucsc_refseq_hist_hg38.tsv\",\n", |
453 | 579 | " \"reference/hg19/hg19.fa\",\n", |
454 | | - " \"reference/hg19/hg19.fa.fai\",\n", |
455 | 580 | " \"reference/hg38/hg38.fa\",\n", |
456 | 581 | " \"reference/hg38/hg38.fa.fai\",\n", |
457 | 582 | " \"codon_counts_nopathogen.json\",\n", |
|
6559 | 6684 | ], |
6560 | 6685 | "metadata": { |
6561 | 6686 | "kernelspec": { |
6562 | | - "display_name": "Python 3", |
| 6687 | + "display_name": "Python 3 (ipykernel)", |
6563 | 6688 | "language": "python", |
6564 | 6689 | "name": "python3" |
6565 | 6690 | }, |
|
6573 | 6698 | "name": "python", |
6574 | 6699 | "nbconvert_exporter": "python", |
6575 | 6700 | "pygments_lexer": "ipython3", |
6576 | | - "version": "3.10.12" |
| 6701 | + "version": "3.12.3" |
6577 | 6702 | } |
6578 | 6703 | }, |
6579 | 6704 | "nbformat": 4, |
|
0 commit comments