updated notebook

balvisio · balvisio · commit 6ed39b73ff67 · 2026-02-26T14:04:03.000-08:00
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/check_codon_frequency.py
@@ -27,7 +27,9 @@
 sys.path.append("/workspace/codonfm")
 from src.tokenizer import Tokenizer
 
+
 def main(pretraining_processed_data_dir: Path, data_dir: Path):
+    """Check codon frequency."""
     tax_ids_to_remove = json.load(open(data_dir / Path("taxids_to_remove.json")))
     metadata = json.load(open(pretraining_processed_data_dir / "metadata.json"))
     tokenizer = Tokenizer()
@@ -47,7 +49,10 @@ def main(pretraining_processed_data_dir: Path, data_dir: Path):
             shape=tuple(cm["sequences"]["shape"]),
         )
         idx_mmap = np.memmap(
-            pretraining_processed_data_dir / cm["index"]["path"], dtype=cm["index"]["dtype"], mode="r", shape=tuple(cm["index"]["shape"])
+            pretraining_processed_data_dir / cm["index"]["path"],
+            dtype=cm["index"]["dtype"],
+            mode="r",
+            shape=tuple(cm["index"]["shape"]),
         )
         for start, end, taxid in idx_mmap:
             if taxid in curr_taxids_to_remove:
@@ -67,4 +72,4 @@ def main(pretraining_processed_data_dir: Path, data_dir: Path):
     parser.add_argument("--pretraining_processed_data_dir", type=str, required=True)
     parser.add_argument("--data_dir", type=str, required=True)
     args = parser.parse_args()
-    main(Path(args.pretraining_processed_data_dir), Path(args.data_dir))
+    main(Path(args.pretraining_processed_data_dir), Path(args.data_dir))
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py b/bionemo-recipes/recipes/codonfm_ptl_te/data_scripts/ncbi_memmap_dataset_creator.py
@@ -25,6 +25,7 @@
 import pyarrow.parquet as pq
 from tqdm import tqdm
 
+
 sys.path.append("/workspace/codonfm")
 from src.tokenizer import Tokenizer
 
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/00-Mutation-Datasets-Preprocessing.ipynb
@@ -52,6 +52,7 @@
     "| Annotation File | Origin | Table |\n",
     "|----------------|--------|-------|\n",
     "| `gencode.v47lift37.basic.annotation.gtf` | [GENCODE Release 47lift37](https://www.gencodegenes.org/human/release_47lift37.html) | - |\n",
+    "| `gencode.v47.basic.annotation.gtf.gz` | [GENCODE Release 47](https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz) | - |\n",
     "| `ucsc_gencodev32_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `wgEncodeGencodeCompV32` |\n",
     "| `ucsc_refseq_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeq` |\n",
     "| `ucsc_refseq_hist_hg38.tsv` | [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables) | `ncbiRefSeqHistorical` |\n",
@@ -69,8 +70,9 @@
     "| `chd_mutation_ctrl.csv` | [Jin et al. 2017](https://pmc.ncbi.nlm.nih.gov/articles/PMC5675000/) | [Download](https://pmc.ncbi.nlm.nih.gov/articles/instance/5675000/bin/NIHMS906719-supplement-supp_datasets.xlsx) | Table S10 |\n",
     "| `Cosmic_Sample_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n",
     "| `Cosmic_MutantCensus_v102_GRCh38.tsv.gz` | [COSMIC](https://cancer.sanger.ac.uk/cosmic) | [Download](https://cancer.sanger.ac.uk/cosmic/download) | Requires registration |\n",
-    "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n",
-    "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [Download](https://gnomad.broadinstitute.org/downloads#v4) | Per-chromosome TSV files |\n",
+    "| `gnomad.exomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/exomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n",
+    "| `gnomad.genomes.v4.1/{chrom}.tsv.gz` | [gnomAD v4.1](https://gnomad.broadinstitute.org/) | [S3 VCFs](https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf/genomes/) | Per-chromosome VCFs converted to TSV via `bcftools` (see gnomAD section below) |\n",
+    "\n",
     "\n",
     "##### ClinVar Synonymous Matching Features\n",
     "\n",
@@ -101,7 +103,38 @@
    "execution_count": null,
    "id": "713c7737",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reference genomes\n",
+      "  [skip] reference/hg19/hg19.fa\n",
+      "  [skip] reference/hg38/hg38.fa\n",
+      "GENCODE annotation\n",
+      "  [skip] reference/gencode.v47lift37.basic.annotation.gtf.gz\n",
+      "  [skip] reference/gencode.v47.basic.annotation.gtf.gz\n",
+      "DDD / ASD variant files\n",
+      "  [skip] ddd_asd_zhouetal/asd_discov.csv\n",
+      "  [skip] ddd_asd_zhouetal/asd_rep.csv\n",
+      "  [skip] ddd_asd_zhouetal/ddd_other.csv\n",
+      "ClinVar variant summary\n",
+      "  [skip] clinvar_syn/variant_summary.txt.gz\n",
+      "  Downloading → reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt ...\n",
+      "phyloP447way conservation scores\n",
+      "  [skip] reference/hg38.phyloP447way.bw\n",
+      "hg19.100way.phyloP100way.bw conservation scores\n",
+      "  Downloading → reference/hg19.100way.phyloP100way.bw ...\n",
+      "UCSC Table Browser downloads\n",
+      "  [skip] reference/ucsc_gencodev32_hg38.tsv\n",
+      "  [skip] reference/ucsc_refseq_hg38.tsv\n",
+      "  [skip] reference/ucsc_refseq_hist_hg38.tsv\n",
+      "  [skip] reference/ucsc_pliByGene_hg38.tsv\n",
+      "\n",
+      "Done.\n"
+     ]
+    }
+   ],
    "source": [
     "import gzip\n",
     "import os\n",
@@ -114,7 +147,7 @@
     "\n",
     "# ── Set data directory ───────────────────────────────────────\n",
     "DATA_DIR = \"/data/ncbi\"  # <-- change this to your preferred data root\n",
-    "OUTPUT_DIR = \"/data/for_paper/mutation_datasets\"  # output directory where all processed datasets will be saved\n",
+    "OUTPUT_DIR = \"/data/ncbi/mutation_datasets\"  # output directory where all processed datasets will be saved\n",
     "UCSC_API_KEY = \"\"  # <-- set your UCSC API key for Table Browser downloads\n",
     "# ─────────────────────────────────────────────────────────────\n",
     "\n",
@@ -162,10 +195,17 @@
     "print(\"GENCODE annotation\")\n",
     "download_file(\n",
     "    \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/GRCh37_mapping/gencode.v47lift37.basic.annotation.gtf.gz\",\n",
-    "    os.path.join(DATA_DIR, \"gencode.v47lift37.basic.annotation.gtf.gz\"),\n",
-    "    decompress_gz=True,\n",
+    "    os.path.join(DATA_DIR, \"reference/gencode.v47lift37.basic.annotation.gtf.gz\"),\n",
+    "    decompress_gz=False,\n",
+    ")\n",
+    "\n",
+    "download_file(\n",
+    "    \"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_47/gencode.v47.basic.annotation.gtf.gz\",\n",
+    "    os.path.join(DATA_DIR, \"reference/gencode.v47.basic.annotation.gtf.gz\"),\n",
+    "    decompress_gz=False,\n",
     ")\n",
     "\n",
+    "\n",
     "# ── 3. DDD / ASD variant files (Zhou et al. 2022, xlsx → csv)\n",
     "print(\"DDD / ASD variant files\")\n",
     "xlsx_sources = {\n",
@@ -191,19 +231,33 @@
     "    os.path.join(DATA_DIR, \"clinvar_syn/variant_summary.txt.gz\"),\n",
     ")\n",
     "\n",
-    "# ── 5. phyloP conservation scores ───────────────────────────\n",
+    "# ── 5. gnomAD v2.1.1 LoF constraint metrics (by gene) ───────\n",
+    "download_file(\n",
+    "    \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_gene.txt.bgz\",\n",
+    "    os.path.join(DATA_DIR, \"reference/gnomad.v2.1.1.lof_metrics.by_gene.txt.txt\"),\n",
+    "    decompress_gz=True,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "# ── 6. phyloP conservation scores ───────────────────────────\n",
     "print(\"phyloP447way conservation scores\")\n",
     "download_file(\n",
     "    \"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP447way/hg38.phyloP447way.bw\",\n",
-    "    os.path.join(DATA_DIR, \"hg38.phyloP447way.bw\"),\n",
+    "    os.path.join(DATA_DIR, \"reference/hg38.phyloP447way.bw\"),\n",
     ")\n",
     "\n",
-    "# ── 6. UCSC Table Browser downloads ─────────────────────────\n",
+    "print(\"hg19.100way.phyloP100way.bw conservation scores\")\n",
+    "download_file(\n",
+    "    \"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/phyloP100way/hg19.100way.phyloP100way.bw\",\n",
+    "    os.path.join(DATA_DIR, \"reference/hg19.100way.phyloP100way.bw\"),\n",
+    ")\n",
+    "\n",
+    "# ── 7. UCSC Table Browser downloads ─────────────────────────\n",
     "UCSC_URL = \"https://genome.ucsc.edu/cgi-bin/hgTables\"\n",
     "UCSC_TABLES = {\n",
     "    \"wgEncodeGencodeCompV32\": {\n",
     "        \"filename\": \"ucsc_gencodev32_hg38.tsv\",\n",
-    "        \"subdir\": \"\",\n",
+    "        \"subdir\": \"reference\",\n",
     "        \"form\": {\n",
     "            \"hgsid\": \"3727160771_KywqrMbVutzoVUyr47py53TcxZMg\",  # pragma: allowlist secret\n",
     "            \"clade\": \"mammal\",\n",
@@ -220,7 +274,7 @@
     "    },\n",
     "    \"ncbiRefSeq\": {\n",
     "        \"filename\": \"ucsc_refseq_hg38.tsv\",\n",
-    "        \"subdir\": \"clinvar_syn\",\n",
+    "        \"subdir\": \"reference\",\n",
     "        \"form\": {\n",
     "            \"hgsid\": \"3727549177_A4TjXykIK1JRVnpjZ0HKtMVnKWw0\",  # pragma: allowlist secret\n",
     "            \"clade\": \"mammal\",\n",
@@ -237,7 +291,7 @@
     "    },\n",
     "    \"ncbiRefSeqHistorical\": {\n",
     "        \"filename\": \"ucsc_refseq_hist_hg38.tsv\",\n",
-    "        \"subdir\": \"clinvar_syn\",\n",
+    "        \"subdir\": \"reference\",\n",
     "        \"form\": {\n",
     "            \"hgsid\": \"3727803393_8Oali1duOyVJT7DtAateRwtkg7Y0\",  # pragma: allowlist secret\n",
     "            \"clade\": \"mammal\",\n",
@@ -254,7 +308,7 @@
     "    },\n",
     "    \"pliByGene\": {\n",
     "        \"filename\": \"ucsc_pliByGene_hg38.tsv\",\n",
-    "        \"subdir\": \"\",\n",
+    "        \"subdir\": \"reference\",\n",
     "        \"form\": {\n",
     "            \"hgsid\": \"3727823409_x06fwXO5XFeWrbFjKlSQTfU3I6F3\",  # pragma: allowlist secret\n",
     "            \"clade\": \"mammal\",\n",
@@ -311,6 +365,24 @@
     "            f.writelines(lines)\n",
     "        print(f\"  [done] {os.path.relpath(dest, DATA_DIR)}  ({len(lines):,} lines)\")\n",
     "\n",
+    "# ── 8. gnomAD v4.1 VCF files (exomes + genomes, chr1-22, X, Y) ──\n",
+    "GNOMAD_S3 = \"https://gnomad-public-us-east-1.s3.amazonaws.com/release/4.1/vcf\"\n",
+    "GNOMAD_CHROMS = [f\"chr{i}\" for i in range(1, 23)] + [\"chrX\", \"chrY\"]\n",
+    "gnomad_datasets = {\n",
+    "    \"exomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.exomes.v4.1\"),\n",
+    "    \"genomes\": os.path.join(DATA_DIR, \"gnomad\", \"gnomad.genomes.v4.1\"),\n",
+    "}\n",
+    "\n",
+    "for ds_type, out_dir in gnomad_datasets.items():\n",
+    "    os.makedirs(out_dir, exist_ok=True)\n",
+    "    print(f\"gnomAD {ds_type} VCFs\")\n",
+    "    for chrom in GNOMAD_CHROMS:\n",
+    "        vcf_name = f\"gnomad.{ds_type}.v4.1.sites.{chrom}.vcf.bgz\"\n",
+    "        download_file(\n",
+    "            f\"{GNOMAD_S3}/{ds_type}/{vcf_name}\",\n",
+    "            os.path.join(out_dir, vcf_name),\n",
+    "        )\n",
+    "\n",
     "print(\"\\nDone.\")"
    ]
   },
@@ -326,10 +398,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "8741cb10",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "AlphaMissense data\n",
+      "  Extracting zip → ['science.adg7492_data_captions.pdf', 'science.adg7492_data_s1_to_s4_and_s9.xlsx', 'science.adg7492_data_s5.csv', 'science.adg7492_data_s6.csv', 'science.adg7492_data_s7.csv', 'science.adg7492_data_s8.zip']\n",
+      "  Renamed science.adg7492_data_s5.csv -> alphamissense_clinvar.csv\n",
+      "  Renamed science.adg7492_data_s6.csv -> alphamissense_cancer_hotspot.csv\n"
+     ]
+    }
+   ],
    "source": [
     "import zipfile\n",
     "\n",
@@ -385,6 +468,24 @@
     "---"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "e544a031-fec7-4765-8a33-2f26c415b5ac",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 68/68 [1:09:44<00:00, 61.53s/it]\n"
+     ]
+    }
+   ],
+   "source": [
+    "%run ../data_scripts/check_codon_frequency.py --pretraining_processed_data_dir $DATA_DIR/pretraining/postprocessed/ --data_dir $DATA_DIR"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "ffb9ba7a",
@@ -412,11 +513,12 @@
     "│   ├── 📄 ucsc_gencodev32_hg38.tsv\n",
     "│   ├── 📄 ucsc_pliByGene_hg38.tsv\n",
     "│   ├── 📄 hg38.phyloP447way.bw\n",
+    "│   ├── 📄 hg19.100way.phyloP100way.bw\n",
+    "│   ├── 📄 gnomad.v2.1.1.lof_metrics.by_transcript.txt\n",
     "│   ├── ucsc_refseq_hg38.tsv\n",
     "│   ├── ucsc_refseq_hist_hg38.tsv\n",
     "│   ├── hg19/\n",
     "│   │   ├── hg19.fa\n",
-    "│   │   └── hg19.fa.fai\n",
     "│   └── hg38/\n",
     "│       ├── hg38.fa\n",
     "│       └── hg38.fa.fai\n",
@@ -435,10 +537,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "b28b4e2d",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5 file(s) missing from /data/balvisio/ncbi:\n",
+      "  ✗ alphamissense_data/AlphaMissense_hg19.tsv.gz\n",
+      "  ✗ reference/hg19/hg19.fa.fai\n",
+      "  ✗ reference/hg38/hg38.fa.fai\n",
+      "  ✗ codon_counts_nopathogen.json\n",
+      "  ✗ gencode.v47lift37.basic.annotation.processed.tsv\n"
+     ]
+    },
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "5 required file(s) missing — see list above.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mFileNotFoundError\u001b[39m                         Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 27\u001b[39m\n\u001b[32m     25\u001b[39m     \u001b[38;5;28;01mfor\u001b[39;00m f \u001b[38;5;129;01min\u001b[39;00m missing:\n\u001b[32m     26\u001b[39m         \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m  ✗ \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mf\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m---> \u001b[39m\u001b[32m27\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(missing)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required file(s) missing — see list above.\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m     28\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m     29\u001b[39m     \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAll \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(expected_files)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m required files found in \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mDATA_DIR\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m)\n",
+      "\u001b[31mFileNotFoundError\u001b[39m: 5 required file(s) missing — see list above."
+     ]
+    }
+   ],
    "source": [
     "expected_files = [\n",
     "    \"alphamissense_data/AlphaMissense_hg19.tsv.gz\",\n",
@@ -448,10 +574,9 @@
     "    \"ddd_asd_zhouetal/asd_rep.csv\",\n",
     "    \"ddd_asd_zhouetal/ddd_other.csv\",\n",
     "    \"clinvar_syn/variant_summary.txt.gz\",\n",
-    "    \"clinvar_syn/ucsc_refseq_hg38.tsv\",\n",
-    "    \"clinvar_syn/ucsc_refseq_hist_hg38.tsv\",\n",
+    "    \"reference/ucsc_refseq_hg38.tsv\",\n",
+    "    \"reference/ucsc_refseq_hist_hg38.tsv\",\n",
     "    \"reference/hg19/hg19.fa\",\n",
-    "    \"reference/hg19/hg19.fa.fai\",\n",
     "    \"reference/hg38/hg38.fa\",\n",
     "    \"reference/hg38/hg38.fa.fai\",\n",
     "    \"codon_counts_nopathogen.json\",\n",
@@ -6559,7 +6684,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -6573,7 +6698,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,
diff --git a/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb b/bionemo-recipes/recipes/codonfm_ptl_te/notebooks/000-Annotation-File-Processing.ipynb