Skip to content

Commit 6ed39b7

Browse files
committed
updated notebook
1 parent 60c6422 commit 6ed39b7

File tree

4 files changed

+356
-188
lines changed

4 files changed

+356
-188
lines changed

bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@
2727
sys.path.append("/workspace/codonfm")
2828
from src.tokenizer import Tokenizer
2929

30+
3031
def main(pretraining_processed_data_dir: Path, data_dir: Path):
32+
"""Check codon frequency."""
3133
tax_ids_to_remove = json.load(open(data_dir / Path("taxids_to_remove.json")))
3234
metadata = json.load(open(pretraining_processed_data_dir / "metadata.json"))
3335
tokenizer = Tokenizer()
@@ -47,7 +49,10 @@ def main(pretraining_processed_data_dir: Path, data_dir: Path):
4749
shape=tuple(cm["sequences"]["shape"]),
4850
)
4951
idx_mmap = np.memmap(
50-
pretraining_processed_data_dir / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"])
52+
pretraining_processed_data_dir / cm["index"]["path"],
53+
dtype=cm["index"]["dtype"],
54+
mode="r",
55+
shape=tuple(cm["index"]["shape"]),
5156
)
5257
for start, end, taxid in idx_mmap:
5358
if taxid in curr_taxids_to_remove:
@@ -67,4 +72,4 @@ def main(pretraining_processed_data_dir: Path, data_dir: Path):
6772
parser.add_argument("--pretraining_processed_data_dir", type=str, required=True)
6873
parser.add_argument("--data_dir", type=str, required=True)
6974
args = parser.parse_args()
70-
main(Path(args.pretraining_processed_data_dir), Path(args.data_dir))
75+
main(Path(args.pretraining_processed_data_dir), Path(args.data_dir))

bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import pyarrow.parquet as pq
2626
from tqdm import tqdm
2727

28+
2829
sys.path.append("/workspace/codonfm")
2930
from src.tokenizer import Tokenizer
3031

bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb

Lines changed: 148 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
"| Annotation File | Origin | Table |\n",
5353
"|----------------|--------|-------|\n",
5454
"| `gencode.v47lift37.basic.annotation.gtf` | [GENCODE Release 47lift37](https://www.gencodegenes.org/human/release_47lift37.html) | - |\n",
55+
"| `gencode.v47.basic.annotation.gtf.gz` | [GENCODE Release 47](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz) | - |\n",
5556
"| `ucsc_gencodev32_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `wgEncodeGencodeCompV32` |\n",
5657
"| `ucsc_refseq_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeq` |\n",
5758
"| `ucsc_refseq_hist_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeqHistorical` |\n",
@@ -69,8 +70,9 @@
6970
"| `chd_mutation_ctrl.csv` | [Jin et al. 2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S10 |\n",
7071
"| `Cosmic_Sample_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n",
7172
"| `Cosmic_MutantCensus_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n",
72-
"| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n",
73-
"| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n",
73+
"| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n",
74+
"| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n",
75+
"\n",
7476
"\n",
7577
"##### ClinVar Synonymous Matching Features\n",
7678
"\n",
@@ -101,7 +103,38 @@
101103
"execution_count": null,
102104
"id": "713c7737",
103105
"metadata": {},
104-
"outputs": [],
106+
"outputs": [
107+
{
108+
"name": "stdout",
109+
"output_type": "stream",
110+
"text": [
111+
"Reference genomes\n",
112+
" [skip] reference/hg19/hg19.fa\n",
113+
" [skip] reference/hg38/hg38.fa\n",
114+
"GENCODE annotation\n",
115+
" [skip] reference/gencode.v47lift37.basic.annotation.gtf.gz\n",
116+
" [skip] reference/gencode.v47.basic.annotation.gtf.gz\n",
117+
"DDD / ASD variant files\n",
118+
" [skip] ddd_asd_zhouetal/asd_discov.csv\n",
119+
" [skip] ddd_asd_zhouetal/asd_rep.csv\n",
120+
" [skip] ddd_asd_zhouetal/ddd_other.csv\n",
121+
"ClinVar variant summary\n",
122+
" [skip] clinvar_syn/variant_summary.txt.gz\n",
123+
" Downloading → reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt ...\n",
124+
"phyloP447way conservation scores\n",
125+
" [skip] reference/hg38.phyloP447way.bw\n",
126+
"hg19.100way.phyloP100way.bw conservation scores\n",
127+
" Downloading → reference/hg19.100way.phyloP100way.bw ...\n",
128+
"UCSC Table Browser downloads\n",
129+
" [skip] reference/ucsc_gencodev32_hg38.tsv\n",
130+
" [skip] reference/ucsc_refseq_hg38.tsv\n",
131+
" [skip] reference/ucsc_refseq_hist_hg38.tsv\n",
132+
" [skip] reference/ucsc_pliByGene_hg38.tsv\n",
133+
"\n",
134+
"Done.\n"
135+
]
136+
}
137+
],
105138
"source": [
106139
"import gzip\n",
107140
"import os\n",
@@ -114,7 +147,7 @@
114147
"\n",
115148
"# ── Set data directory ───────────────────────────────────────\n",
116149
"DATA_DIR = \"/data/ncbi\" # <-- change this to your preferred data root\n",
117-
"OUTPUT_DIR = \"/data/for_paper/mutation_datasets\" # output directory where all processed datasets will be saved\n",
150+
"OUTPUT_DIR = \"/data/ncbi/mutation_datasets\" # output directory where all processed datasets will be saved\n",
118151
"UCSC_API_KEY = \"\" # <-- set your UCSC API key for Table Browser downloads\n",
119152
"# ─────────────────────────────────────────────────────────────\n",
120153
"\n",
@@ -162,10 +195,17 @@
162195
"print(\"GENCODE annotation\")\n",
163196
"download_file(\n",
164197
" \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n",
165-
" os.path.join(DATA_DIR, \"gencode.v47lift37.basic.annotation.gtf.gz\"),\n",
166-
" decompress_gz=True,\n",
198+
" os.path.join(DATA_DIR, \"reference/gencode.v47lift37.basic.annotation.gtf.gz\"),\n",
199+
" decompress_gz=False,\n",
200+
")\n",
201+
"\n",
202+
"download_file(\n",
203+
" \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz\",\n",
204+
" os.path.join(DATA_DIR, \"reference/gencode.v47.basic.annotation.gtf.gz\"),\n",
205+
" decompress_gz=False,\n",
167206
")\n",
168207
"\n",
208+
"\n",
169209
"# ── 3. DDD / ASD variant files (Zhou et al. 2022, xlsx → csv)\n",
170210
"print(\"DDD / ASD variant files\")\n",
171211
"xlsx_sources = {\n",
@@ -191,19 +231,33 @@
191231
" os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n",
192232
")\n",
193233
"\n",
194-
"# ── 5. phyloP conservation scores ───────────────────────────\n",
234+
"# ── 5. gnomAD v2.1.1 gene-level LoF constraint metrics ─────\n",
235+
"download_file(\n",
236+
" \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz\",\n",
237+
" os.path.join(DATA_DIR, \"reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt\"),\n",
238+
" decompress_gz=True,\n",
239+
")\n",
240+
"\n",
241+
"\n",
242+
"# ── 6. phyloP conservation scores ───────────────────────────\n",
195243
"print(\"phyloP447way conservation scores\")\n",
196244
"download_file(\n",
197245
" \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n",
198-
" os.path.join(DATA_DIR, \"hg38.phyloP447way.bw\"),\n",
246+
" os.path.join(DATA_DIR, \"reference/hg38.phyloP447way.bw\"),\n",
199247
")\n",
200248
"\n",
201-
"# ── 6. UCSC Table Browser downloads ─────────────────────────\n",
249+
"print(\"hg19.100way.phyloP100way.bw conservation scores\")\n",
250+
"download_file(\n",
251+
" \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw\",\n",
252+
" os.path.join(DATA_DIR, \"reference/hg19.100way.phyloP100way.bw\"),\n",
253+
")\n",
254+
"\n",
255+
"# ── 7. UCSC Table Browser downloads ─────────────────────────\n",
202256
"UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n",
203257
"UCSC_TABLES = {\n",
204258
" \"wgEncodeGencodeCompV32\": {\n",
205259
" \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n",
206-
" \"subdir\": \"\",\n",
260+
" \"subdir\": \"reference\",\n",
207261
" \"form\": {\n",
208262
" \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\", # pragma: allowlist secret\n",
209263
" \"clade\": \"mammal\",\n",
@@ -220,7 +274,7 @@
220274
" },\n",
221275
" \"ncbiRefSeq\": {\n",
222276
" \"filename\": \"ucsc_refseq_hg38.tsv\",\n",
223-
" \"subdir\": \"clinvar_syn\",\n",
277+
" \"subdir\": \"reference\",\n",
224278
" \"form\": {\n",
225279
" \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\", # pragma: allowlist secret\n",
226280
" \"clade\": \"mammal\",\n",
@@ -237,7 +291,7 @@
237291
" },\n",
238292
" \"ncbiRefSeqHistorical\": {\n",
239293
" \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n",
240-
" \"subdir\": \"clinvar_syn\",\n",
294+
" \"subdir\": \"reference\",\n",
241295
" \"form\": {\n",
242296
" \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\", # pragma: allowlist secret\n",
243297
" \"clade\": \"mammal\",\n",
@@ -254,7 +308,7 @@
254308
" },\n",
255309
" \"pliByGene\": {\n",
256310
" \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n",
257-
" \"subdir\": \"\",\n",
311+
" \"subdir\": \"reference\",\n",
258312
" \"form\": {\n",
259313
" \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\", # pragma: allowlist secret\n",
260314
" \"clade\": \"mammal\",\n",
@@ -311,6 +365,24 @@
311365
" f.writelines(lines)\n",
312366
" print(f\" [done] {os.path.relpath(dest, DATA_DIR)} ({len(lines):,} lines)\")\n",
313367
"\n",
368+
"# ── 8. gnomAD v4.1 VCF files (exomes + genomes, chr1-22, X, Y) ──\n",
369+
"GNOMAD_S3 = \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf\"\n",
370+
"GNOMAD_CHROMS = [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]\n",
371+
"gnomad_datasets = {\n",
372+
" \"exomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.exomes.v4.1\"),\n",
373+
" \"genomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.genomes.v4.1\"),\n",
374+
"}\n",
375+
"\n",
376+
"for ds_type, out_dir in gnomad_datasets.items():\n",
377+
" os.makedirs(out_dir, exist_ok=True)\n",
378+
" print(f\"gnomAD {ds_type} VCFs\")\n",
379+
" for chrom in GNOMAD_CHROMS:\n",
380+
" vcf_name = f\"gnomad.{ds_type}.v4.1.sites.{chrom}.vcf.bgz\"\n",
381+
" download_file(\n",
382+
" f\"{GNOMAD_S3}/{ds_type}/{vcf_name}\",\n",
383+
" os.path.join(out_dir, vcf_name),\n",
384+
" )\n",
385+
"\n",
314386
"print(\"\\nDone.\")"
315387
]
316388
},
@@ -326,10 +398,21 @@
326398
},
327399
{
328400
"cell_type": "code",
329-
"execution_count": null,
401+
"execution_count": 2,
330402
"id": "8741cb10",
331403
"metadata": {},
332-
"outputs": [],
404+
"outputs": [
405+
{
406+
"name": "stdout",
407+
"output_type": "stream",
408+
"text": [
409+
"AlphaMissense data\n",
410+
" Extracting zip → ['science.adg7492_data_captions.pdf', 'science.adg7492_data_s1_to_s4_and_s9.xlsx', 'science.adg7492_data_s5.csv', 'science.adg7492_data_s6.csv', 'science.adg7492_data_s7.csv', 'science.adg7492_data_s8.zip']\n",
411+
" Renamed science.adg7492_data_s5.csv -> alphamissense_clinvar.csv\n",
412+
" Renamed science.adg7492_data_s6.csv -> alphamissense_cancer_hotspot.csv\n"
413+
]
414+
}
415+
],
333416
"source": [
334417
"import zipfile\n",
335418
"\n",
@@ -385,6 +468,24 @@
385468
"---"
386469
]
387470
},
471+
{
472+
"cell_type": "code",
473+
"execution_count": 8,
474+
"id": "e544a031-fec7-4765-8a33-2f26c415b5ac",
475+
"metadata": {},
476+
"outputs": [
477+
{
478+
"name": "stderr",
479+
"output_type": "stream",
480+
"text": [
481+
"100%|██████████| 68/68 [1:09:44<00:00, 61.53s/it]\n"
482+
]
483+
}
484+
],
485+
"source": [
486+
"%run ../data_scripts/check_codon_frequency.py --pretraining_processed_data_dir $DATA_DIR/pretraining/postprocessed/ --data_dir $DATA_DIR"
487+
]
488+
},
388489
{
389490
"cell_type": "markdown",
390491
"id": "ffb9ba7a",
@@ -412,11 +513,12 @@
412513
"│ ├── 📄 ucsc_gencodev32_hg38.tsv\n",
413514
"│ ├── 📄 ucsc_pliByGene_hg38.tsv\n",
414515
"│ ├── 📄 hg38.phyloP447way.bw\n",
516+
"│ ├── 📄 hg19.100way.phyloP100way.bw\n",
517+
"│ ├── 📄 gnomad.v2.1.1.lof_metrics.by_transcript.txt\n",
415518
"│ ├── ucsc_refseq_hg38.tsv\n",
416519
"│ ├── ucsc_refseq_hist_hg38.tsv\n",
417520
"│ ├── hg19/\n",
418521
"│ │ ├── hg19.fa\n",
419-
"│ │ └── hg19.fa.fai\n",
420522
"│ └── hg38/\n",
421523
"│ ├── hg38.fa\n",
422524
"│ └── hg38.fa.fai\n",
@@ -435,10 +537,34 @@
435537
},
436538
{
437539
"cell_type": "code",
438-
"execution_count": null,
540+
"execution_count": 3,
439541
"id": "b28b4e2d",
440542
"metadata": {},
441-
"outputs": [],
543+
"outputs": [
544+
{
545+
"name": "stdout",
546+
"output_type": "stream",
547+
"text": [
548+
"5 file(s) missing from /data/balvisio/ncbi:\n",
549+
" ✗ alphamissense_data/AlphaMissense_hg19.tsv.gz\n",
550+
" ✗ reference/hg19/hg19.fa.fai\n",
551+
" ✗ reference/hg38/hg38.fa.fai\n",
552+
" ✗ codon_counts_nopathogen.json\n",
553+
" ✗ gencode.v47lift37.basic.annotation.processed.tsv\n"
554+
]
555+
},
556+
{
557+
"ename": "FileNotFoundError",
558+
"evalue": "5 required file(s) missing — see list above.",
559+
"output_type": "error",
560+
"traceback": [
561+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
562+
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
563+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 27\u001b[39m\n\u001b[32m 25\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m missing:\n\u001b[32m 26\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m ✗ \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m27\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(missing)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required file(s) missing — see list above.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 28\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 29\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAll \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(expected_files)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required files found in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n",
564+
"\u001b[31mFileNotFoundError\u001b[39m: 5 required file(s) missing — see list above."
565+
]
566+
}
567+
],
442568
"source": [
443569
"expected_files = [\n",
444570
" \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n",
@@ -448,10 +574,9 @@
448574
" \"ddd_asd_zhouetal/asd_rep.csv\",\n",
449575
" \"ddd_asd_zhouetal/ddd_other.csv\",\n",
450576
" \"clinvar_syn/variant_summary.txt.gz\",\n",
451-
" \"clinvar_syn/ucsc_refseq_hg38.tsv\",\n",
452-
" \"clinvar_syn/ucsc_refseq_hist_hg38.tsv\",\n",
577+
" \"reference/ucsc_refseq_hg38.tsv\",\n",
578+
" \"reference/ucsc_refseq_hist_hg38.tsv\",\n",
453579
" \"reference/hg19/hg19.fa\",\n",
454-
" \"reference/hg19/hg19.fa.fai\",\n",
455580
" \"reference/hg38/hg38.fa\",\n",
456581
" \"reference/hg38/hg38.fa.fai\",\n",
457582
" \"codon_counts_nopathogen.json\",\n",
@@ -6559,7 +6684,7 @@
65596684
],
65606685
"metadata": {
65616686
"kernelspec": {
6562-
"display_name": "Python 3",
6687+
"display_name": "Python 3 (ipykernel)",
65636688
"language": "python",
65646689
"name": "python3"
65656690
},
@@ -6573,7 +6698,7 @@
65736698
"name": "python",
65746699
"nbconvert_exporter": "python",
65756700
"pygments_lexer": "ipython3",
6576-
"version": "3.10.12"
6701+
"version": "3.12.3"
65776702
}
65786703
},
65796704
"nbformat": 4,

0 commit comments

Comments
 (0)