marin-community · dlwh · Apr 22, 2026 · claude · Apr 23, 2026 · claude
diff --git a/experiments/bio_chem_notation.py b/experiments/bio_chem_notation.py
@@ -0,0 +1,129 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Bio/chem notation PPL slices.
+
+Wires the streaming downloaders in :mod:`marin.datakit.download.bio_chem`
+through to the same ``RawTextEvaluationDataset`` / ``default_tokenize`` flow
+that Paloma and Uncheatable Eval use, so the bio/chem slices land in the
+existing perplexity-gap and tokenized-validation pipelines without bespoke
+wiring.
+
+Each slice in :data:`BIO_CHEM_SLICES` corresponds to one parquet shard set
+written by a download step. The keys in the returned dict are
+``bio_chem/<family>/<slice>`` so they namespace cleanly against
+``paloma/...`` and ``uncheatable_eval/...`` in the gap report.
+
+See issue #5058 for context.
+"""
+
+from __future__ import annotations
+
+import os.path
+from dataclasses import dataclass
+
+from experiments.llama import llama3_tokenizer
+from marin.datakit.download.bio_chem.chembl import chembl_step
+from marin.datakit.download.bio_chem.moleculenet import moleculenet_step
+from marin.datakit.download.bio_chem.pubchem import pubchem_step
+from marin.datakit.download.bio_chem.rcsb_pdb import rcsb_pdb_step
+from marin.datakit.download.bio_chem.refseq import refseq_viral_step
+from marin.datakit.download.bio_chem.rnacentral import rnacentral_step
+from marin.datakit.download.bio_chem.uniprot import uniprot_sprot_step
+from marin.execution.executor import ExecutorStep, executor_main
+from marin.processing.tokenize import TokenizeConfig
+from marin.processing.tokenize.data_configs import TokenizerStep
+
+
+@dataclass(frozen=True)
+class BioChemSlice:
+    """A single eval slice: the download step that holds it plus the shard prefix inside that step.
+
+    Attributes:
+        family: Source family (e.g. ``refseq``, ``uniprot``).
+        slice_name: Slice slot — matches ``NotationSliceSpec.name`` so we can
+            find the shard files.
+        step: The download step whose output dir holds the parquet shards.
+    """
+
+    family: str
+    slice_name: str
+    step: ExecutorStep
+
+
+def _build_slices() -> tuple[BioChemSlice, ...]:
+    """Assemble the canonical list of evaluated slices.
+
+    This is the single source of truth for which slices we evaluate. Every
+    slice name listed here must be one that the paired download step really
+    produces — see ``NotationSliceSpec.name`` in the per-family modules under
+    ``marin.datakit.download.bio_chem``.
+    """
+    # One row per family: (family, download step, slice names). Each step is
+    # created exactly once and shared by all slices of its family.
+    families = (
+        ("refseq", refseq_viral_step(), ("refseq_viral_fasta", "refseq_viral_gff")),
+        ("rnacentral", rnacentral_step(), ("rnacentral_active_fasta",)),
+        ("uniprot", uniprot_sprot_step(), ("uniprot_sprot_fasta", "uniprot_sprot_dat")),
+        ("pubchem", pubchem_step(), ("pubchem_cid_smiles", "pubchem_sdf")),
+        ("rcsb", rcsb_pdb_step(), ("rcsb_mmcif",)),
+        ("chembl", chembl_step(), ("chembl_chemreps", "chembl_sdf")),
+        ("moleculenet", moleculenet_step(), ("moleculenet_esol_smiles", "moleculenet_clintox_smiles")),
+    )
+    built: list[BioChemSlice] = []
+    for family, step, slice_names in families:
+        built.extend(BioChemSlice(family, slice_name, step) for slice_name in slice_names)
+    return tuple(built)
+
+
+# Built once, when this module is imported. Serves as the default ``slices``
+# argument of the helpers in this module and is what the ``__main__`` entry
+# point iterates to collect download steps.
+BIO_CHEM_SLICES: tuple[BioChemSlice, ...] = _build_slices()
+
+
+def _slice_glob(slice_: BioChemSlice) -> str:
+    """Return the ``<slice_name>-*.parquet`` pattern handed to ``step.cd`` to select this slice's shards."""
+    shard_pattern = "{}-*.parquet"
+    return shard_pattern.format(slice_.slice_name)
+
+
+def _slice_key(slice_: BioChemSlice) -> str:
+    """Return the registry key ``bio_chem/<family>/<slice_name>`` for ``slice_``.
+
+    The key is a logical namespace (it is used as a dict key and as the
+    tokenize step name), not a local filesystem path. The parts are therefore
+    joined with a literal ``/`` instead of ``os.path.join``: the result is the
+    same on every platform, and a component that happens to start with a
+    separator can no longer discard the ``bio_chem/<family>`` prefix.
+    """
+    return "/".join(("bio_chem", slice_.family, slice_.slice_name))
+
+
+def bio_chem_tokenized(
+    *, tokenizer: str = llama3_tokenizer, slices: tuple[BioChemSlice, ...] = BIO_CHEM_SLICES
+) -> dict[str, TokenizerStep]:
+    """Build one validation tokenize step per bio/chem slice for the regular validation-loss flow.
+
+    Args:
+        tokenizer: Tokenizer forwarded to ``default_tokenize`` for every slice.
+        slices: Slices to tokenize; defaults to ``BIO_CHEM_SLICES``.
+
+    Returns:
+        Mapping from each slice's key (see ``_slice_key``) to the step returned
+        by ``default_tokenize``, in the iteration order of ``slices``.
+    """
+    # Function-scoped import, kept as in the original wiring
+    # (presumably to avoid an import cycle with ``experiments.defaults`` — confirm).
+    from experiments.defaults import default_tokenize
+
+    # Lazily pair every slice with its key so the key is computed once and
+    # reused both as the dict key and as the step name.
+    keyed_slices = ((_slice_key(entry), entry) for entry in slices)
+    return {
+        key: default_tokenize(
+            name=key,
+            dataset=entry.step.cd(_slice_glob(entry)),
+            tokenizer=tokenizer,
+            is_validation=True,
+        )
+        for key, entry in keyed_slices
+    }
+
+
+def bio_chem_raw_validation_sets(
+    slices: tuple[BioChemSlice, ...] = BIO_CHEM_SLICES,
+):
+    """Register the bio/chem slices as raw-text datasets for the perplexity-gap flow.
+
+    Args:
+        slices: Slices to register; defaults to ``BIO_CHEM_SLICES``.
+
+    Returns:
+        Dict mapping each slice's key (see ``_slice_key``) to the value
+        ``raw_text_dataset`` returns for that slice's shard glob.
+    """
+    from marin.evaluation.perplexity_gap import raw_text_dataset
+
+    datasets = {}
+    for entry in slices:
+        key = _slice_key(entry)
+        shard_path = entry.step.cd(_slice_glob(entry))
+        datasets[key] = raw_text_dataset(shard_path)
+    return datasets
+
+
+if __name__ == "__main__":
+    # Materialise every download step so the slices exist on disk.
+    # Several slices share one step object, so collapse them by identity;
+    # the dict keeps first-seen order for the list handed to the executor.
+    unique_steps = {}
+    for entry in BIO_CHEM_SLICES:
+        unique_steps.setdefault(id(entry.step), entry.step)
+    executor_main(steps=list(unique_steps.values()))
diff --git a/experiments/exp_model_perplexity_gap_marin_vs_llama.py b/experiments/exp_model_perplexity_gap_marin_vs_llama.py
@@ -3,13 +3,21 @@
 
 from fray.v2.types import ResourceConfig
 
+from experiments.bio_chem_notation import bio_chem_raw_validation_sets
 from experiments.defaults import default_raw_validation_sets
 from marin.evaluation.perplexity_gap import (
     GapFinderModelConfig,
     default_model_perplexity_gap,
 )
 from marin.execution.executor import executor_main
 
+
+def _all_validation_sets():
+    """Return the default raw validation sets extended with the bio/chem notation slices.
+
+    The bio/chem entries are merged last, so they take precedence if a key
+    appears in both collections.
+    """
+    return dict(default_raw_validation_sets()) | bio_chem_raw_validation_sets()
+
+
 STEP = default_model_perplexity_gap(
     name="marin-8b-base-vs-llama-3.1-8b-base",
     model_a=GapFinderModelConfig(
@@ -22,7 +30,7 @@
         checkpoint_is_hf=True,
         tokenizer="meta-llama/Llama-3.1-8B",
     ),
-    datasets=default_raw_validation_sets(),
+    datasets=_all_validation_sets(),
     resource_config=ResourceConfig.with_tpu(
         "v5p-8",
         regions=["us-central1"],
@@ -43,5 +51,8 @@
 if __name__ == "__main__":
     executor_main(
         [STEP],
-        description="Compare Marin 8B base and Llama 3.1 8B base on raw Paloma and uncheatable eval datasets.",
+        description=(
+            "Compare Marin 8B base and Llama 3.1 8B base on raw Paloma, "
+            "Uncheatable Eval, and the bio/chem notation slices (issue #5058)."
+        ),
     )
diff --git a/lib/marin/src/marin/datakit/download/bio_chem/__init__.py b/lib/marin/src/marin/datakit/download/bio_chem/__init__.py
@@ -0,0 +1,30 @@
+# Copyright The Marin Authors
+# SPDX-License-Identifier: Apache-2.0
+
+"""Streaming downloaders for biology and chemistry notation slices.
+
+Each submodule defines an ExecutorStep factory for one source family
+(RefSeq, RNAcentral, UniProt, PubChem, RCSB PDB, ChEMBL, MoleculeNet) that
+streams from the upstream mirror, splits the stream into format-preserving
+records via :mod:`marin.transform.bio_chem`, packs short records into longer
+documents for in-context-learning evaluation, and writes the result to
+plain-text-in-parquet that Levanter can read directly.
+
+The shared streaming primitives live in :mod:`._runtime`.
+"""
+
+from marin.datakit.download.bio_chem._runtime import (
+    NotationFormat,
+    NotationSliceSpec,
+    PackingConfig,
+    bio_chem_slice_step,
+    run_notation_slice,
+)
+
+__all__ = [
+    "NotationFormat",
+    "NotationSliceSpec",
+    "PackingConfig",
+    "bio_chem_slice_step",
+    "run_notation_slice",
+]